diff --git a/app/MetadataResolver/DisallowedByProviderException.php b/app/MetadataResolver/DisallowedByProviderException.php
new file mode 100644
index 0000000..41d42f1
--- /dev/null
+++ b/app/MetadataResolver/DisallowedByProviderException.php
@@ -0,0 +1,30 @@
+<?php
+
+namespace App\MetadataResolver;
+
+/**
+ * Thrown when fetching the URL is disallowed by the content provider's robots.txt.
+ */
+class DisallowedByProviderException extends \RuntimeException
+{
+    /**
+     * @var string
+     */
+    private $url;
+
+    public function __construct(string $url, \Throwable $previous = null)
+    {
+        parent::__construct("Access denied by robots.txt: {$url}", 0, $previous);
+        $this->url = $url;
+    }
+
+    public function getUrl(): string
+    {
+        return $this->url;
+    }
+
+    public function getHost(): string
+    {
+        return parse_url($this->url, PHP_URL_HOST);
+    }
+}
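For context, a minimal sketch of how a caller might handle the new exception; this is not part of the diff, and the service entry point (`execute()`) and surrounding variables are assumptions:

```php
<?php
// Hypothetical caller sketch — names outside this diff are assumptions.
use App\MetadataResolver\DisallowedByProviderException;
use Illuminate\Support\Facades\Log;

try {
    $metadata = $metadataResolveService->execute($url); // assumed public entry point
} catch (DisallowedByProviderException $e) {
    // The provider's robots.txt refused us; skip the card instead of retrying.
    Log::info("Metadata fetch disallowed by robots.txt of {$e->getHost()}: {$e->getUrl()}");
    $metadata = null;
}
```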
Can't unlock file."); + } + } + } finally { + if (!fclose($fp)) { + throw new \RuntimeException("Unlock failed! Can't close lock file."); + } + } + } + + /** + * 指定したメタデータURLのホストが持つrobots.txtをダウンロードします。 + * @param string $url メタデータのURL + * @return string + */ + private function fetchRobotsTxt(string $url): ?string + { + $parts = parse_url($url); + $robotsUrl = http_build_url([ + 'scheme' => $parts['scheme'], + 'host' => $parts['host'], + 'port' => $parts['port'] ?? null, + 'path' => '/robots.txt' + ]); + + $client = app(Client::class); + try { + $res = $client->get($robotsUrl); + + return (string) $res->getBody(); + } catch (\Exception $e) { + Log::error("robots.txtの取得に失敗: {$e}"); + + return null; + } + } + + /** + * ContentProviderポリシー情報との照合を行い、アクセス可能かチェックします。アクセスできない場合は例外をスローします。 + * @param string $url メタデータを取得したいURL + * @param CarbonInterface|null $lastAccess アクセス先ホストへの最終アクセス日時 (記録がある場合) + * @throws DeniedHostException アクセス先がTissue内のブラックリストに入っている場合にスロー + * @throws DisallowedByProviderException アクセス先のrobots.txtによって拒否されている場合にスロー + */ + private function checkProviderPolicy(string $url, ?CarbonInterface $lastAccess): void + { DB::beginTransaction(); try { - $metadata = Metadata::find($url); - - // 無かったら取得 - // TODO: ある程度古かったら再取得とかありだと思う - if ($metadata == null || $metadata->needRefresh()) { - if ($metadata === null) { - $metadata = new Metadata(['url' => $url]); - } - - if ($metadata->error_count >= self::CIRCUIT_BREAK_COUNT) { - throw new ResolverCircuitBreakException($metadata->error_count, $url); - } - - try { - $resolved = $this->resolver->resolve($url); - } catch (\Exception $e) { - $metadata->storeException(now(), $e); - $metadata->save(); - throw new UncaughtResolverException(implode(': ', [ - $metadata->error_count . '回目のメタデータ取得失敗', get_class($e), $e->getMessage() - ]), 0, $e); - } - - $metadata->fill([ - 'title' => $resolved->title, - 'description' => $resolved->description, - 'image' => $resolved->image, - 'expires_at' => $resolved->expires_at + $hostWithPort = $this->getHostWithPortFromUrl($url); + $contentProvider = ContentProvider::sharedLock()->find($hostWithPort); + if ($contentProvider === null) { + $contentProvider = ContentProvider::create([ + 'host' => $hostWithPort, + 'robots' => $this->fetchRobotsTxt($url), + 'robots_cached_at' => now(), ]); - $metadata->clearError(); - $metadata->save(); - - $tagIds = []; - foreach ($resolved->normalizedTags() as $tagName) { - $tag = Tag::firstOrCreate(['name' => $tagName]); - $tagIds[] = $tag->id; - } - $metadata->tags()->sync($tagIds); } + if ($contentProvider->is_blocked) { + throw new DeniedHostException($url); + } + + // 連続アクセス制限 + if ($lastAccess !== null) { + $elapsedSeconds = $lastAccess->diffInSeconds(now(), false); + if ($elapsedSeconds < $contentProvider->access_interval_sec) { + if ($elapsedSeconds < 0) { + $wait = abs($elapsedSeconds) + $contentProvider->access_interval_sec; + } else { + $wait = $contentProvider->access_interval_sec - $elapsedSeconds; + } + sleep($wait); + } + } + + // Fetch robots.txt + if ($contentProvider->robots_cached_at->diffInDays(now()) >= 7) { + $contentProvider->update([ + 'robots' => $this->fetchRobotsTxt($url), + 'robots_cached_at' => now(), + ]); + } + + // Check robots.txt + $robotsParser = new \RobotsTxtParser($contentProvider->robots); + $robotsParser->setUserAgent('TissueBot'); + $robotsDelay = $robotsParser->getDelay(); + if ($robotsDelay !== 0 && $robotsDelay >= $contentProvider->access_interval_sec) { + $contentProvider->access_interval_sec = (int) $robotsDelay; + 
diff --git a/composer.json b/composer.json
index f5a796c..5f21fc8 100644
--- a/composer.json
+++ b/composer.json
@@ -33,7 +33,8 @@
         "sentry/sentry-laravel": "1.8.0",
         "staudenmeir/eloquent-eager-limit": "^1.0",
         "symfony/css-selector": "^4.3",
-        "symfony/dom-crawler": "^4.3"
+        "symfony/dom-crawler": "^4.3",
+        "t1gor/robots-txt-parser": "^0.2.4"
     },
     "require-dev": {
         "barryvdh/laravel-debugbar": "^3.1",
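The newly required t1gor/robots-txt-parser package provides the RobotsTxtParser class used in checkProviderPolicy(). A standalone sketch of the three calls the service relies on; the sample robots.txt content is invented, and getDelay() appears to return 0 when no Crawl-delay is present, which would explain the `!== 0` guard above:

```php
<?php

require __DIR__ . '/vendor/autoload.php';

// Invented robots.txt payload for illustration.
$robots = <<<ROBOTS
User-agent: TissueBot
Crawl-delay: 10
Disallow: /private/
ROBOTS;

$parser = new RobotsTxtParser($robots);
$parser->setUserAgent('TissueBot');

echo $parser->getDelay();                       // 10 — may raise access_interval_sec
var_dump($parser->isDisallowed('/private/x'));  // true  → DisallowedByProviderException
var_dump($parser->isDisallowed('/works/123'));  // false → proceed to resolve()
```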
"reference": "7ff08da5625fb4f72d17b1528c60aadb184e9e68" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/t1gor/Robots.txt-Parser-Class/zipball/7ff08da5625fb4f72d17b1528c60aadb184e9e68", + "reference": "7ff08da5625fb4f72d17b1528c60aadb184e9e68", + "shasum": "" + }, + "require": { + "ext-mbstring": "*", + "php": ">=5.5.0", + "vipnytt/useragentparser": "^1.0" + }, + "require-dev": { + "codeclimate/php-test-reporter": ">=0.2", + "phpunit/phpunit": "~3.7" + }, + "type": "library", + "autoload": { + "classmap": [ + "source/robotstxtparser.php" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Igor Timoshenkov", + "email": "igor.timoshenkov@gmail.com", + "role": "creator" + }, + { + "name": "Jan-Petter Gundersen", + "email": "jpg@vipnytt.no", + "role": "contributor" + } + ], + "description": "PHP class to parse robots.txt rules according to Google, Yandex, W3C and The Web Robots Pages specifications.", + "homepage": "https://github.com/t1gor/Robots.txt-Parser-Class", + "keywords": [ + "The Web Robots Pages", + "W3C", + "google", + "parser", + "robots.txt", + "yandex" + ], + "time": "2018-07-21T20:01:19+00:00" + }, { "name": "tijsverkoyen/css-to-inline-styles", "version": "2.2.3", @@ -5698,6 +5727,64 @@ "homepage": "https://github.com/tijsverkoyen/CssToInlineStyles", "time": "2020-07-13T06:12:54+00:00" }, + { + "name": "vipnytt/useragentparser", + "version": "v1.0.4", + "source": { + "type": "git", + "url": "https://github.com/VIPnytt/UserAgentParser.git", + "reference": "c5a6718a57088e0d45c2e36f09efabc4e008bd8c" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/VIPnytt/UserAgentParser/zipball/c5a6718a57088e0d45c2e36f09efabc4e008bd8c", + "reference": "c5a6718a57088e0d45c2e36f09efabc4e008bd8c", + "shasum": "" + }, + "require": { + "php": "^5.5 || ^7.0" + }, + "require-dev": { + "phpunit/phpunit": "^4.8.35 || ^5.7 || ^6.5" + }, + "type": "library", + "autoload": { + "psr-4": { + "vipnytt\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "VIP nytt AS", + "email": "support@vipnytt.no", + "role": "Owner" + }, + { + "name": "Jan-Petter Gundersen", + "email": "jpg@vipnytt.no", + "role": "Developer" + } + ], + "description": "User-Agent parser for robot rule sets", + "homepage": "https://github.com/VIPnytt/UserAgentParser", + "keywords": [ + "REP", + "Robots Exclusion Protocol", + "Robots meta tag", + "crawler", + "robot", + "robots.txt", + "spider", + "user-agent", + "useragent", + "x-robots-tag" + ], + "time": "2017-12-17T14:23:27+00:00" + }, { "name": "vlucas/phpdotenv", "version": "v3.6.7", diff --git a/storage/content_providers_lock/.gitignore b/storage/content_providers_lock/.gitignore new file mode 100644 index 0000000..d6b7ef3 --- /dev/null +++ b/storage/content_providers_lock/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore