Merge pull request #468 from shikorism/feature/per-host-resolve-control
リモートホストごとの同時アクセス制御とメタデータ取得ポリシー制御
This commit is contained in:
commit
301fc83e7e
24
app/ContentProvider.php
Normal file
24
app/ContentProvider.php
Normal file
@ -0,0 +1,24 @@
|
||||
<?php
|
||||
|
||||
namespace App;
|
||||
|
||||
use Illuminate\Database\Eloquent\Model;
|
||||
|
||||
/**
 * Eloquent model holding per-host data: the cached robots.txt body and the
 * time it was cached. Rows are identified by the `host` string rather than
 * by an auto-incrementing integer.
 */
class ContentProvider extends Model
{
    /** @var string Name of the primary key column. */
    protected $primaryKey = 'host';

    /** @var string The primary key is a string, not an integer. */
    protected $keyType = 'string';

    /** @var bool The primary key is not auto-incrementing. */
    public $incrementing = false;

    /** @var array Attributes that may be mass-assigned. */
    protected $fillable = ['host', 'robots', 'robots_cached_at'];

    /** @var array Attributes handled as date instances. */
    protected $dates = ['created_at', 'updated_at', 'robots_cached_at'];
}
|
30
app/MetadataResolver/DisallowedByProviderException.php
Normal file
30
app/MetadataResolver/DisallowedByProviderException.php
Normal file
@ -0,0 +1,30 @@
|
||||
<?php
|
||||
|
||||
namespace App\MetadataResolver;
|
||||
|
||||
use RuntimeException;
|
||||
use Throwable;
|
||||
|
||||
/**
 * Thrown when crawling a URL is refused by the robots.txt served by its
 * content provider.
 */
class DisallowedByProviderException extends RuntimeException
{
    /** @var string The URL whose access was refused. */
    private $url;

    /**
     * @param string $url The URL whose access was refused
     * @param Throwable|null $previous Optional underlying cause
     */
    public function __construct(string $url, ?Throwable $previous = null)
    {
        parent::__construct("Access denied by robots.txt: $url", 0, $previous);
        $this->url = $url;
    }

    /**
     * @return string The URL passed to the constructor
     */
    public function getUrl(): string
    {
        return $this->url;
    }

    /**
     * Host component of the refused URL.
     *
     * parse_url() yields null when the URL has no host component and false
     * when the URL is seriously malformed; returning either from a method
     * declared `: string` is wrong (null raises a TypeError), so both cases
     * are mapped to an empty string.
     *
     * @return string The host, or '' when the URL has none
     */
    public function getHost(): string
    {
        $host = parse_url($this->url, PHP_URL_HOST);

        return is_string($host) ? $host : '';
    }
}
|
@ -2,14 +2,20 @@
|
||||
|
||||
namespace App\Services;
|
||||
|
||||
use App\ContentProvider;
|
||||
use App\Metadata;
|
||||
use App\MetadataResolver\DeniedHostException;
|
||||
use App\MetadataResolver\DisallowedByProviderException;
|
||||
use App\MetadataResolver\MetadataResolver;
|
||||
use App\MetadataResolver\ResolverCircuitBreakException;
|
||||
use App\MetadataResolver\UncaughtResolverException;
|
||||
use App\Tag;
|
||||
use App\Utilities\Formatter;
|
||||
use Carbon\Carbon;
|
||||
use Carbon\CarbonInterface;
|
||||
use GuzzleHttp\Client;
|
||||
use Illuminate\Support\Facades\DB;
|
||||
use Illuminate\Support\Facades\Log;
|
||||
|
||||
class MetadataResolveService
|
||||
{
|
||||
@ -44,13 +50,208 @@ class MetadataResolveService
|
||||
throw new DeniedHostException($url);
|
||||
}
|
||||
|
||||
DB::beginTransaction();
|
||||
try {
|
||||
$metadata = Metadata::find($url);
|
||||
|
||||
// 無かったら取得
|
||||
// TODO: ある程度古かったら再取得とかありだと思う
|
||||
if ($metadata == null || $metadata->needRefresh()) {
|
||||
$hostWithPort = $this->getHostWithPortFromUrl($url);
|
||||
$metadata = $this->hostLock($hostWithPort, function (?CarbonInterface $lastAccess) use ($url) {
|
||||
// HostLockの解放待ちをしている間に、他のプロセスで取得完了しているかもしれない
|
||||
$metadata = Metadata::find($url);
|
||||
if ($metadata !== null && !$metadata->needRefresh()) {
|
||||
return $metadata;
|
||||
}
|
||||
|
||||
$this->checkProviderPolicy($url, $lastAccess);
|
||||
|
||||
return $this->resolve($url, $metadata);
|
||||
});
|
||||
}
|
||||
|
||||
return $metadata;
|
||||
}
|
||||
|
||||
/**
 * Extract the host part of a URL, followed by ":port" when the URL carries
 * an explicit port.
 *
 * @param string $url
 * @return string "host" or "host:port"
 */
private function getHostWithPortFromUrl(string $url): string
{
    $components = parse_url($url);
    $hostWithPort = $components['host'];

    // No explicit port in the URL: the bare host is the answer.
    if (!isset($components['port'])) {
        return $hostWithPort;
    }

    return $hostWithPort . ':' . $components['port'];
}
|
||||
|
||||
/**
 * Run $fn while holding an exclusive advisory lock (flock) on a per-host
 * lock file under storage_path('content_providers_lock').
 *
 * The lock file also stores a small JSON document {"time": <ISO-8601>}
 * recording when $fn last completed for this host. That time is read before
 * $fn runs and handed to it; it is rewritten only after $fn returns
 * normally, so a call in which $fn throws leaves the stored time untouched.
 *
 * @param string $host Used verbatim as the lock file name
 * @param callable $fn Called with the previously recorded time as a Carbon instance, or null when none is recorded
 * @return mixed return of $fn
 * @throws \RuntimeException when the lock directory/file cannot be created, opened, locked, written, unlocked or closed
 */
private function hostLock(string $host, callable $fn)
{
    $lockDir = storage_path('content_providers_lock');
    // Another process may create the directory between the check and the
    // mkdir() call; in that case mkdir() fails although the directory now
    // exists, so re-check before giving up. The warning is suppressed
    // because that lost race is an expected, harmless outcome.
    if (!is_dir($lockDir) && !@mkdir($lockDir) && !is_dir($lockDir)) {
        throw new \RuntimeException("Lock failed! Can't create lock directory.");
    }

    $lockFile = $lockDir . DIRECTORY_SEPARATOR . $host;
    // 'c+' opens for read/write, creating the file if needed, without truncating it.
    $fp = fopen($lockFile, 'c+b');
    if ($fp === false) {
        throw new \RuntimeException("Lock failed! Can't open lock file.");
    }

    try {
        if (!flock($fp, LOCK_EX)) {
            throw new \RuntimeException("Lock failed! Can't lock file.");
        }

        try {
            $accessInfoText = stream_get_contents($fp);
            $accessInfo = is_string($accessInfoText) ? json_decode($accessInfoText, true) : null;
            $lastAccess = is_array($accessInfo) && isset($accessInfo['time'])
                ? new Carbon($accessInfo['time'])
                : null;

            $result = $fn($lastAccess);

            $payload = json_encode([
                'time' => now()->toIso8601String(),
            ]);
            // Truncate before writing so that a payload shorter than the
            // previous content cannot leave stale trailing bytes behind, and
            // flush while the lock is still held so the next holder reads
            // the complete record.
            if (!rewind($fp)
                || !ftruncate($fp, 0)
                || fwrite($fp, $payload) === false
                || !fflush($fp)
            ) {
                throw new \RuntimeException("I/O Error! Can't write to lock file.");
            }

            return $result;
        } finally {
            if (!flock($fp, LOCK_UN)) {
                throw new \RuntimeException("Unlock failed! Can't unlock file.");
            }
        }
    } finally {
        if (!fclose($fp)) {
            throw new \RuntimeException("Unlock failed! Can't close lock file.");
        }
    }
}
|
||||
|
||||
/**
 * Download robots.txt from the host of the given metadata URL.
 *
 * The robots.txt URL reuses the scheme, host and (if present) port of $url
 * with the path replaced by "/robots.txt". Any exception raised while
 * requesting it or reading the body is logged and turned into a null result.
 *
 * @param string $url URL of the metadata
 * @return string|null The robots.txt body, or null when it could not be fetched
 */
private function fetchRobotsTxt(string $url): ?string
{
    $components = parse_url($url);
    $robotsUrlParts = [
        'scheme' => $components['scheme'],
        'host' => $components['host'],
        'port' => $components['port'] ?? null,
        'path' => '/robots.txt'
    ];
    $robotsUrl = http_build_url($robotsUrlParts);

    $httpClient = app(Client::class);
    try {
        $response = $httpClient->get($robotsUrl);
        $body = (string) $response->getBody();
    } catch (\Exception $e) {
        Log::error("robots.txtの取得に失敗: {$e}");
        $body = null;
    }

    return $body;
}
|
||||
|
||||
/**
 * Check the ContentProvider policy for the URL's host and throw when access
 * is not permitted.
 *
 * Inside one DB transaction this:
 *  1. loads the ContentProvider row for the URL's host[:port], creating it
 *     (with a freshly fetched robots.txt) when it does not exist yet;
 *  2. rejects the URL when the row has is_blocked set;
 *  3. when a previous access time is known, sleeps until access_interval_sec
 *     seconds have passed since it;
 *  4. re-fetches robots.txt when the cached copy is 7 or more days old;
 *  5. raises access_interval_sec to the delay requested by robots.txt when
 *     that delay is not smaller, and rejects the URL when robots.txt
 *     disallows its path.
 *
 * The transaction is committed even when the URL is rejected, so that the
 * ContentProvider updates made along the way are kept.
 *
 * @param string $url URL whose metadata is about to be fetched
 * @param CarbonInterface|null $lastAccess Time of the last access to the target host, when recorded
 * @throws DeniedHostException when the ContentProvider row has is_blocked set
 * @throws DisallowedByProviderException when the host's robots.txt disallows the URL
 */
private function checkProviderPolicy(string $url, ?CarbonInterface $lastAccess): void
{
    DB::beginTransaction();
    try {
        $hostWithPort = $this->getHostWithPortFromUrl($url);
        $contentProvider = ContentProvider::sharedLock()->find($hostWithPort);
        if ($contentProvider === null) {
            $contentProvider = ContentProvider::create([
                'host' => $hostWithPort,
                'robots' => $this->fetchRobotsTxt($url),
                'robots_cached_at' => now(),
            ]);
            // create() returns a model that only carries the attributes
            // passed in; reload it so the column defaults (is_blocked,
            // access_interval_sec) are present instead of null for the
            // comparisons below.
            $contentProvider->refresh();
        }

        if ($contentProvider->is_blocked) {
            throw new DeniedHostException($url);
        }

        // Throttle consecutive accesses to the same host
        if ($lastAccess !== null) {
            // Signed difference: negative when $lastAccess lies in the future
            $elapsedSeconds = $lastAccess->diffInSeconds(now(), false);
            if ($elapsedSeconds < $contentProvider->access_interval_sec) {
                if ($elapsedSeconds < 0) {
                    $wait = abs($elapsedSeconds) + $contentProvider->access_interval_sec;
                } else {
                    $wait = $contentProvider->access_interval_sec - $elapsedSeconds;
                }
                sleep($wait);
            }
        }

        // Fetch robots.txt
        if ($contentProvider->robots_cached_at->diffInDays(now()) >= 7) {
            $contentProvider->update([
                'robots' => $this->fetchRobotsTxt($url),
                'robots_cached_at' => now(),
            ]);
        }

        // Check robots.txt
        $robotsParser = new \RobotsTxtParser($contentProvider->robots);
        $robotsParser->setUserAgent('TissueBot');
        $robotsDelay = $robotsParser->getDelay();
        if ($robotsDelay !== 0 && $robotsDelay >= $contentProvider->access_interval_sec) {
            $contentProvider->access_interval_sec = (int) $robotsDelay;
            $contentProvider->save();
        }
        // A URL without a path component (e.g. "https://example.com")
        // addresses the root document, so check "/" rather than null.
        $path = parse_url($url, PHP_URL_PATH) ?? '/';
        if ($robotsParser->isDisallowed($path)) {
            throw new DisallowedByProviderException($url);
        }

        DB::commit();
    } catch (DeniedHostException | DisallowedByProviderException $e) {
        // Commit anyway: the ContentProvider data updates should be kept
        DB::commit();
        throw $e;
    } catch (\Throwable $e) {
        // Throwable rather than Exception so that an \Error (e.g. a
        // TypeError) also rolls back instead of leaving the transaction open.
        DB::rollBack();
        throw $e;
    }
}
|
||||
|
||||
/**
|
||||
* メタデータをリモートサーバに問い合わせて取得します。
|
||||
* @param string $url メタデータを取得したいURL
|
||||
* @param Metadata|null $metadata キャッシュ済のメタデータ (存在する場合)
|
||||
* @return Metadata 取得できたメタデータ
|
||||
* @throws UncaughtResolverException Resolver内で例外が発生して取得できなかった場合にスロー
|
||||
* @throws ResolverCircuitBreakException 規定回数以上の解決失敗により、メタデータの取得が不能となっている場合にスロー
|
||||
*/
|
||||
private function resolve(string $url, ?Metadata $metadata): Metadata
|
||||
{
|
||||
DB::beginTransaction();
|
||||
try {
|
||||
if ($metadata === null) {
|
||||
$metadata = new Metadata(['url' => $url]);
|
||||
}
|
||||
@ -84,7 +285,6 @@ class MetadataResolveService
|
||||
$tagIds[] = $tag->id;
|
||||
}
|
||||
$metadata->tags()->sync($tagIds);
|
||||
}
|
||||
|
||||
DB::commit();
|
||||
|
||||
|
@ -33,7 +33,8 @@
|
||||
"sentry/sentry-laravel": "1.8.0",
|
||||
"staudenmeir/eloquent-eager-limit": "^1.0",
|
||||
"symfony/css-selector": "^4.3",
|
||||
"symfony/dom-crawler": "^4.3"
|
||||
"symfony/dom-crawler": "^4.3",
|
||||
"t1gor/robots-txt-parser": "^0.2.4"
|
||||
},
|
||||
"require-dev": {
|
||||
"barryvdh/laravel-debugbar": "^3.1",
|
||||
|
145
composer.lock
generated
145
composer.lock
generated
@ -4,7 +4,7 @@
|
||||
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
|
||||
"This file is @generated automatically"
|
||||
],
|
||||
"content-hash": "1bba68b609be6a0dcdaf05d72e8eb759",
|
||||
"content-hash": "bbb184ff943ae3a938a8370d94b6afb2",
|
||||
"packages": [
|
||||
{
|
||||
"name": "anhskohbo/no-captcha",
|
||||
@ -547,20 +547,6 @@
|
||||
"uppercase",
|
||||
"words"
|
||||
],
|
||||
"funding": [
|
||||
{
|
||||
"url": "https://www.doctrine-project.org/sponsorship.html",
|
||||
"type": "custom"
|
||||
},
|
||||
{
|
||||
"url": "https://www.patreon.com/phpdoctrine",
|
||||
"type": "patreon"
|
||||
},
|
||||
{
|
||||
"url": "https://tidelift.com/funding/github/packagist/doctrine%2Finflector",
|
||||
"type": "tidelift"
|
||||
}
|
||||
],
|
||||
"time": "2020-05-29T15:13:26+00:00"
|
||||
},
|
||||
{
|
||||
@ -623,20 +609,6 @@
|
||||
"parser",
|
||||
"php"
|
||||
],
|
||||
"funding": [
|
||||
{
|
||||
"url": "https://www.doctrine-project.org/sponsorship.html",
|
||||
"type": "custom"
|
||||
},
|
||||
{
|
||||
"url": "https://www.patreon.com/phpdoctrine",
|
||||
"type": "patreon"
|
||||
},
|
||||
{
|
||||
"url": "https://tidelift.com/funding/github/packagist/doctrine%2Flexer",
|
||||
"type": "tidelift"
|
||||
}
|
||||
],
|
||||
"time": "2020-05-25T17:44:05+00:00"
|
||||
},
|
||||
{
|
||||
@ -5649,6 +5621,63 @@
|
||||
],
|
||||
"time": "2020-05-30T20:06:45+00:00"
|
||||
},
|
||||
{
|
||||
"name": "t1gor/robots-txt-parser",
|
||||
"version": "v0.2.4",
|
||||
"source": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/t1gor/Robots.txt-Parser-Class.git",
|
||||
"reference": "7ff08da5625fb4f72d17b1528c60aadb184e9e68"
|
||||
},
|
||||
"dist": {
|
||||
"type": "zip",
|
||||
"url": "https://api.github.com/repos/t1gor/Robots.txt-Parser-Class/zipball/7ff08da5625fb4f72d17b1528c60aadb184e9e68",
|
||||
"reference": "7ff08da5625fb4f72d17b1528c60aadb184e9e68",
|
||||
"shasum": ""
|
||||
},
|
||||
"require": {
|
||||
"ext-mbstring": "*",
|
||||
"php": ">=5.5.0",
|
||||
"vipnytt/useragentparser": "^1.0"
|
||||
},
|
||||
"require-dev": {
|
||||
"codeclimate/php-test-reporter": ">=0.2",
|
||||
"phpunit/phpunit": "~3.7"
|
||||
},
|
||||
"type": "library",
|
||||
"autoload": {
|
||||
"classmap": [
|
||||
"source/robotstxtparser.php"
|
||||
]
|
||||
},
|
||||
"notification-url": "https://packagist.org/downloads/",
|
||||
"license": [
|
||||
"MIT"
|
||||
],
|
||||
"authors": [
|
||||
{
|
||||
"name": "Igor Timoshenkov",
|
||||
"email": "igor.timoshenkov@gmail.com",
|
||||
"role": "creator"
|
||||
},
|
||||
{
|
||||
"name": "Jan-Petter Gundersen",
|
||||
"email": "jpg@vipnytt.no",
|
||||
"role": "contributor"
|
||||
}
|
||||
],
|
||||
"description": "PHP class to parse robots.txt rules according to Google, Yandex, W3C and The Web Robots Pages specifications.",
|
||||
"homepage": "https://github.com/t1gor/Robots.txt-Parser-Class",
|
||||
"keywords": [
|
||||
"The Web Robots Pages",
|
||||
"W3C",
|
||||
"google",
|
||||
"parser",
|
||||
"robots.txt",
|
||||
"yandex"
|
||||
],
|
||||
"time": "2018-07-21T20:01:19+00:00"
|
||||
},
|
||||
{
|
||||
"name": "tijsverkoyen/css-to-inline-styles",
|
||||
"version": "2.2.3",
|
||||
@ -5698,6 +5727,64 @@
|
||||
"homepage": "https://github.com/tijsverkoyen/CssToInlineStyles",
|
||||
"time": "2020-07-13T06:12:54+00:00"
|
||||
},
|
||||
{
|
||||
"name": "vipnytt/useragentparser",
|
||||
"version": "v1.0.4",
|
||||
"source": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/VIPnytt/UserAgentParser.git",
|
||||
"reference": "c5a6718a57088e0d45c2e36f09efabc4e008bd8c"
|
||||
},
|
||||
"dist": {
|
||||
"type": "zip",
|
||||
"url": "https://api.github.com/repos/VIPnytt/UserAgentParser/zipball/c5a6718a57088e0d45c2e36f09efabc4e008bd8c",
|
||||
"reference": "c5a6718a57088e0d45c2e36f09efabc4e008bd8c",
|
||||
"shasum": ""
|
||||
},
|
||||
"require": {
|
||||
"php": "^5.5 || ^7.0"
|
||||
},
|
||||
"require-dev": {
|
||||
"phpunit/phpunit": "^4.8.35 || ^5.7 || ^6.5"
|
||||
},
|
||||
"type": "library",
|
||||
"autoload": {
|
||||
"psr-4": {
|
||||
"vipnytt\\": "src/"
|
||||
}
|
||||
},
|
||||
"notification-url": "https://packagist.org/downloads/",
|
||||
"license": [
|
||||
"MIT"
|
||||
],
|
||||
"authors": [
|
||||
{
|
||||
"name": "VIP nytt AS",
|
||||
"email": "support@vipnytt.no",
|
||||
"role": "Owner"
|
||||
},
|
||||
{
|
||||
"name": "Jan-Petter Gundersen",
|
||||
"email": "jpg@vipnytt.no",
|
||||
"role": "Developer"
|
||||
}
|
||||
],
|
||||
"description": "User-Agent parser for robot rule sets",
|
||||
"homepage": "https://github.com/VIPnytt/UserAgentParser",
|
||||
"keywords": [
|
||||
"REP",
|
||||
"Robots Exclusion Protocol",
|
||||
"Robots meta tag",
|
||||
"crawler",
|
||||
"robot",
|
||||
"robots.txt",
|
||||
"spider",
|
||||
"user-agent",
|
||||
"useragent",
|
||||
"x-robots-tag"
|
||||
],
|
||||
"time": "2017-12-17T14:23:27+00:00"
|
||||
},
|
||||
{
|
||||
"name": "vlucas/phpdotenv",
|
||||
"version": "v3.6.7",
|
||||
|
14
database/factories/ContentProviderFactory.php
Normal file
14
database/factories/ContentProviderFactory.php
Normal file
@ -0,0 +1,14 @@
|
||||
<?php
|
||||
|
||||
/** @var \Illuminate\Database\Eloquent\Factory $factory */
|
||||
|
||||
use App\ContentProvider;
|
||||
use Faker\Generator as Faker;
|
||||
|
||||
// Default attributes for ContentProvider test records: a fixed host with no
// cached robots.txt body and a cache time of "now".
$factory->define(ContentProvider::class, function (Faker $faker) {
    $defaults = [
        'host' => 'example.com',
        'robots' => null,
        'robots_cached_at' => now(),
    ];

    return $defaults;
});
|
@ -0,0 +1,37 @@
|
||||
<?php
|
||||
|
||||
use Illuminate\Database\Migrations\Migration;
|
||||
use Illuminate\Database\Schema\Blueprint;
|
||||
use Illuminate\Support\Facades\Schema;
|
||||
|
||||
/**
 * Creates the `content_providers` table, keyed by the `host` string.
 */
class CreateContentProvidersTable extends Migration
{
    /**
     * Run the migrations: create the table and declare `host` as its primary key.
     *
     * @return void
     */
    public function up()
    {
        Schema::create('content_providers', function (Blueprint $blueprint) {
            $blueprint->string('host');
            $blueprint->primary('host');

            $blueprint->text('robots')->nullable();
            $blueprint->timestamp('robots_cached_at');
            $blueprint->boolean('is_blocked')->default(false);
            $blueprint->integer('access_interval_sec')->default(5);
            $blueprint->timestamps();
        });
    }

    /**
     * Reverse the migrations: drop the table if it exists.
     *
     * @return void
     */
    public function down()
    {
        Schema::dropIfExists('content_providers');
    }
}
|
2
storage/content_providers_lock/.gitignore
vendored
Normal file
2
storage/content_providers_lock/.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
*
|
||||
!.gitignore
|
@ -2,6 +2,7 @@
|
||||
|
||||
namespace Tests\Unit\Services;
|
||||
|
||||
use App\ContentProvider;
|
||||
use App\MetadataResolver\MetadataResolver;
|
||||
use App\MetadataResolver\ResolverCircuitBreakException;
|
||||
use App\MetadataResolver\UncaughtResolverException;
|
||||
@ -26,6 +27,8 @@ class MetadataResolverServiceTest extends TestCase
|
||||
parent::setUp();
|
||||
$this->seed();
|
||||
Carbon::setTestNow('2020-07-21 19:19:19');
|
||||
// FIXME: 今書かれてるテストはresolveのHTTPリクエストのみを考慮しているので、ContentProviderにデータがないとリクエスト回数がずれる
|
||||
factory(ContentProvider::class)->create();
|
||||
}
|
||||
|
||||
protected function tearDown(): void
|
||||
|
Loading…
Reference in New Issue
Block a user