Per-remote-host concurrent access control and metadata fetch policy control

shibafu 2020-08-11 00:17:10 +09:00
parent f715e7feee
commit b71b7e5cb2
5 changed files with 387 additions and 67 deletions

app/MetadataResolver/DisallowedByProviderException.php

@@ -0,0 +1,30 @@
<?php
namespace App\MetadataResolver;
use RuntimeException;
use Throwable;
/**
* Thrown when crawling is denied by the robots.txt provided by the ContentProvider.
*/
class DisallowedByProviderException extends RuntimeException
{
private $url;
public function __construct(string $url, Throwable $previous = null)
{
parent::__construct("Access denied by robots.txt: $url", 0, $previous);
$this->url = $url;
}
public function getUrl(): string
{
return $this->url;
}
public function getHost(): string
{
return parse_url($this->url, PHP_URL_HOST);
}
}
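
For illustration, a caller might react to this exception as follows (a minimal sketch; resolveUrl() is a hypothetical entry point, not part of this commit, and the Log call is illustrative):

    try {
        $metadata = resolveUrl($url); // hypothetical: any code path that runs the resolver
    } catch (DisallowedByProviderException $e) {
        // Skip this URL; getHost() identifies the provider that denied the crawl
        Log::info("Metadata fetch denied by robots.txt on {$e->getHost()}: {$e->getUrl()}");
    }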

app/Services/MetadataResolveService.php

@@ -2,14 +2,20 @@
namespace App\Services;
use App\ContentProvider;
use App\Metadata;
use App\MetadataResolver\DeniedHostException;
use App\MetadataResolver\DisallowedByProviderException;
use App\MetadataResolver\MetadataResolver;
use App\MetadataResolver\ResolverCircuitBreakException;
use App\MetadataResolver\UncaughtResolverException;
use App\Tag;
use App\Utilities\Formatter;
use Carbon\Carbon;
use Carbon\CarbonInterface;
use GuzzleHttp\Client;
use Illuminate\Support\Facades\DB;
use Illuminate\Support\Facades\Log;
class MetadataResolveService
{
@@ -44,48 +50,242 @@ class MetadataResolveService
throw new DeniedHostException($url);
}
$metadata = Metadata::find($url);
// Fetch if we don't have it yet
// TODO: refetching when the cached copy is somewhat stale would also make sense
if ($metadata == null || $metadata->needRefresh()) {
$hostWithPort = $this->getHostWithPortFromUrl($url);
$metadata = $this->hostLock($hostWithPort, function (?CarbonInterface $lastAccess) use ($url) {
// While we were waiting for the host lock, another process may have finished fetching it
$metadata = Metadata::find($url);
if ($metadata !== null && !$metadata->needRefresh()) {
return $metadata;
}
$this->checkProviderPolicy($url, $lastAccess);
return $this->resolve($url, $metadata);
});
}
return $metadata;
}
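
Note: the second Metadata::find() inside the hostLock callback is double-checked locking. Between the first cache check and acquiring the host lock, another process may have resolved the same URL, so the cache is consulted again before doing any network work.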
/**
* Extracts the host (and port, if present) from a URL
* @param string $url
* @return string
*/
private function getHostWithPortFromUrl(string $url): string
{
$parts = parse_url($url);
$host = $parts['host'];
if (isset($parts['port'])) {
$host .= ':' . $parts['port'];
}
return $host;
}
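
For illustration (not part of the commit), the expected behavior:

    // getHostWithPortFromUrl('https://example.com/post/1')     => 'example.com'
    // getHostWithPortFromUrl('http://example.com:8080/post/1') => 'example.com:8080'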
/**
* Runs the given callback while holding an exclusive lock on the target host
* @param string $host
* @param callable $fn
* @return mixed return of $fn
* @throws \RuntimeException on any of various lock and I/O failures
*/
private function hostLock(string $host, callable $fn)
{
$lockDir = storage_path('content_providers_lock');
if (!file_exists($lockDir)) {
if (!mkdir($lockDir)) {
throw new \RuntimeException("Lock failed! Can't create lock directory.");
}
}
$lockFile = $lockDir . DIRECTORY_SEPARATOR . $host;
$fp = fopen($lockFile, 'c+b');
if ($fp === false) {
throw new \RuntimeException("Lock failed! Can't open lock file.");
}
try {
if (!flock($fp, LOCK_EX)) {
throw new \RuntimeException("Lock failed! Can't lock file.");
}
try {
$accessInfoText = stream_get_contents($fp);
if ($accessInfoText !== false) {
$accessInfo = json_decode($accessInfoText, true);
}
$result = $fn(isset($accessInfo['time']) ? new Carbon($accessInfo['time']) : null);
$accessInfo = [
'time' => now()->toIso8601String()
];
fseek($fp, 0);
if (fwrite($fp, json_encode($accessInfo)) === false) {
throw new \RuntimeException("I/O Error! Can't write to lock file.");
}
return $result;
} finally {
if (!flock($fp, LOCK_UN)) {
throw new \RuntimeException("Unlock failed! Can't unlock file.");
}
}
} finally {
if (!fclose($fp)) {
throw new \RuntimeException("Unlock failed! Can't close lock file.");
}
}
}
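
The essence of the pattern above, as a self-contained sketch (assuming a standalone script; the lock file path is illustrative). flock() blocks until the current holder releases the lock, so concurrent processes targeting the same host serialize here:

    $fp = fopen('/tmp/example-host.lock', 'c+b'); // 'c+b': create if missing, never truncate
    if ($fp === false) {
        throw new \RuntimeException("Can't open lock file");
    }
    try {
        if (flock($fp, LOCK_EX)) { // blocks until any other holder releases the lock
            // Critical section: at most one process per lock file executes here at a time.
            flock($fp, LOCK_UN);
        }
    } finally {
        fclose($fp);
    }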
/**
* Downloads the robots.txt served by the host of the given metadata URL.
* @param string $url the metadata URL
* @return string|null the robots.txt body, or null if it could not be fetched
*/
private function fetchRobotsTxt(string $url): ?string
{
$parts = parse_url($url);
$robotsUrl = http_build_url([
'scheme' => $parts['scheme'],
'host' => $parts['host'],
'port' => $parts['port'] ?? null,
'path' => '/robots.txt'
]);
$client = app(Client::class);
try {
$res = $client->get($robotsUrl);
return (string) $res->getBody();
} catch (\Exception $e) {
Log::error("robots.txtの取得に失敗: {$e}");
return null;
}
}
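
Note that http_build_url() is not part of core PHP; presumably it comes from the pecl_http extension or a userland polyfill such as jakeasmith/http_build_url. A core-only equivalent, as a sketch:

    $parts = parse_url($url);
    $robotsUrl = $parts['scheme'] . '://' . $parts['host']
        . (isset($parts['port']) ? ':' . $parts['port'] : '')
        . '/robots.txt';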
/**
* Checks the URL against the stored ContentProvider policy and throws if access is not allowed.
* @param string $url URL whose metadata is to be fetched
* @param CarbonInterface|null $lastAccess time of the last access to the target host (if recorded)
* @throws DeniedHostException if the target host is on Tissue's internal blocklist
* @throws DisallowedByProviderException if the target's robots.txt disallows access
*/
private function checkProviderPolicy(string $url, ?CarbonInterface $lastAccess): void
{
DB::beginTransaction();
try {
$hostWithPort = $this->getHostWithPortFromUrl($url);
$contentProvider = ContentProvider::sharedLock()->find($hostWithPort);
if ($contentProvider === null) {
$contentProvider = ContentProvider::create([
'host' => $hostWithPort,
'robots' => $this->fetchRobotsTxt($url),
'robots_cached_at' => now(),
]);
}
if ($contentProvider->is_blocked) {
throw new DeniedHostException($url);
}
// Throttle consecutive accesses to the same host
if ($lastAccess !== null) {
$elapsedSeconds = $lastAccess->diffInSeconds(now(), false);
if ($elapsedSeconds < $contentProvider->access_interval_sec) {
if ($elapsedSeconds < 0) {
// $lastAccess lies in the future (clock skew): wait out the skew plus the interval
$wait = abs($elapsedSeconds) + $contentProvider->access_interval_sec;
} else {
$wait = $contentProvider->access_interval_sec - $elapsedSeconds;
}
sleep($wait);
}
}
// Refresh the cached robots.txt if it is a week old or more
if ($contentProvider->robots_cached_at->diffInDays(now()) >= 7) {
$contentProvider->update([
'robots' => $this->fetchRobotsTxt($url),
'robots_cached_at' => now(),
]);
}
// Check robots.txt
$robotsParser = new \RobotsTxtParser($contentProvider->robots);
$robotsParser->setUserAgent('TissueBot');
$robotsDelay = $robotsParser->getDelay();
if ($robotsDelay !== 0 && $robotsDelay >= $contentProvider->access_interval_sec) {
$contentProvider->access_interval_sec = (int) $robotsDelay;
$contentProvider->save();
}
if ($robotsParser->isDisallowed(parse_url($url, PHP_URL_PATH))) {
throw new DisallowedByProviderException($url);
}
DB::commit();
} catch (DeniedHostException | DisallowedByProviderException $e) {
// Commit anyway so that the ContentProvider data updates are persisted
DB::commit();
throw $e;
} catch (\Exception $e) {
DB::rollBack();
throw $e;
}
}
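
For reference, a standalone sketch of the t1gor/robots-txt-parser calls used above, against an illustrative robots.txt (the expected results in the comments are assumptions about this input, not output from this commit):

    $robots = "User-agent: TissueBot\nDisallow: /private/\nCrawl-delay: 10\n";
    $parser = new \RobotsTxtParser($robots);
    $parser->setUserAgent('TissueBot');
    $parser->isDisallowed('/private/page'); // expected: true
    $parser->isDisallowed('/public/page');  // expected: false
    $parser->getDelay();                    // expected: 10 (seconds, from Crawl-delay)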
/**
* Fetches metadata for a URL from the remote server.
* @param string $url URL whose metadata is to be fetched
* @param Metadata|null $metadata cached metadata (if any)
* @return Metadata the resolved metadata
* @throws UncaughtResolverException if an exception escaped the Resolver and the fetch failed
* @throws ResolverCircuitBreakException if resolution is suspended after too many failures
*/
private function resolve(string $url, ?Metadata $metadata): Metadata
{
DB::beginTransaction();
try {
if ($metadata === null) {
$metadata = new Metadata(['url' => $url]);
}
if ($metadata->error_count >= self::CIRCUIT_BREAK_COUNT) {
throw new ResolverCircuitBreakException($metadata->error_count, $url);
}
try {
$resolved = $this->resolver->resolve($url);
} catch (\Exception $e) {
$metadata->storeException(now(), $e);
$metadata->save();
throw new UncaughtResolverException(implode(': ', [
"Metadata fetch failure #{$metadata->error_count}", get_class($e), $e->getMessage()
]), 0, $e);
}
$metadata->fill([
'title' => $resolved->title,
'description' => $resolved->description,
'image' => $resolved->image,
'expires_at' => $resolved->expires_at
]);
$metadata->clearError();
$metadata->save();
$tagIds = [];
foreach ($resolved->normalizedTags() as $tagName) {
$tag = Tag::firstOrCreate(['name' => $tagName]);
$tagIds[] = $tag->id;
}
$metadata->tags()->sync($tagIds);
DB::commit();
return $metadata;
} catch (\Exception $e) {
// Presumed ending (truncated in this view): roll back and rethrow, mirroring checkProviderPolicy
DB::rollBack();
throw $e;
}
}

composer.json

@@ -33,7 +33,8 @@
"sentry/sentry-laravel": "1.8.0",
"staudenmeir/eloquent-eager-limit": "^1.0",
"symfony/css-selector": "^4.3",
"symfony/dom-crawler": "^4.3"
"symfony/dom-crawler": "^4.3",
"t1gor/robots-txt-parser": "^0.2.4"
},
"require-dev": {
"barryvdh/laravel-debugbar": "^3.1",

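The new dependency can be pulled in with `composer require t1gor/robots-txt-parser:^0.2.4`, which also records its transitive dependency vipnytt/useragentparser in the composer.lock below.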
composer.lock (generated)

@@ -4,7 +4,7 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically"
],
"content-hash": "1bba68b609be6a0dcdaf05d72e8eb759",
"content-hash": "bbb184ff943ae3a938a8370d94b6afb2",
"packages": [
{
"name": "anhskohbo/no-captcha",
@@ -547,20 +547,6 @@
"uppercase",
"words"
],
"funding": [
{
"url": "https://www.doctrine-project.org/sponsorship.html",
"type": "custom"
},
{
"url": "https://www.patreon.com/phpdoctrine",
"type": "patreon"
},
{
"url": "https://tidelift.com/funding/github/packagist/doctrine%2Finflector",
"type": "tidelift"
}
],
"time": "2020-05-29T15:13:26+00:00"
},
{
@@ -623,20 +609,6 @@
"parser",
"php"
],
"funding": [
{
"url": "https://www.doctrine-project.org/sponsorship.html",
"type": "custom"
},
{
"url": "https://www.patreon.com/phpdoctrine",
"type": "patreon"
},
{
"url": "https://tidelift.com/funding/github/packagist/doctrine%2Flexer",
"type": "tidelift"
}
],
"time": "2020-05-25T17:44:05+00:00"
},
{
@@ -5649,6 +5621,63 @@
],
"time": "2020-05-30T20:06:45+00:00"
},
{
"name": "t1gor/robots-txt-parser",
"version": "v0.2.4",
"source": {
"type": "git",
"url": "https://github.com/t1gor/Robots.txt-Parser-Class.git",
"reference": "7ff08da5625fb4f72d17b1528c60aadb184e9e68"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/t1gor/Robots.txt-Parser-Class/zipball/7ff08da5625fb4f72d17b1528c60aadb184e9e68",
"reference": "7ff08da5625fb4f72d17b1528c60aadb184e9e68",
"shasum": ""
},
"require": {
"ext-mbstring": "*",
"php": ">=5.5.0",
"vipnytt/useragentparser": "^1.0"
},
"require-dev": {
"codeclimate/php-test-reporter": ">=0.2",
"phpunit/phpunit": "~3.7"
},
"type": "library",
"autoload": {
"classmap": [
"source/robotstxtparser.php"
]
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Igor Timoshenkov",
"email": "igor.timoshenkov@gmail.com",
"role": "creator"
},
{
"name": "Jan-Petter Gundersen",
"email": "jpg@vipnytt.no",
"role": "contributor"
}
],
"description": "PHP class to parse robots.txt rules according to Google, Yandex, W3C and The Web Robots Pages specifications.",
"homepage": "https://github.com/t1gor/Robots.txt-Parser-Class",
"keywords": [
"The Web Robots Pages",
"W3C",
"google",
"parser",
"robots.txt",
"yandex"
],
"time": "2018-07-21T20:01:19+00:00"
},
{
"name": "tijsverkoyen/css-to-inline-styles",
"version": "2.2.3",
@@ -5698,6 +5727,64 @@
"homepage": "https://github.com/tijsverkoyen/CssToInlineStyles",
"time": "2020-07-13T06:12:54+00:00"
},
{
"name": "vipnytt/useragentparser",
"version": "v1.0.4",
"source": {
"type": "git",
"url": "https://github.com/VIPnytt/UserAgentParser.git",
"reference": "c5a6718a57088e0d45c2e36f09efabc4e008bd8c"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/VIPnytt/UserAgentParser/zipball/c5a6718a57088e0d45c2e36f09efabc4e008bd8c",
"reference": "c5a6718a57088e0d45c2e36f09efabc4e008bd8c",
"shasum": ""
},
"require": {
"php": "^5.5 || ^7.0"
},
"require-dev": {
"phpunit/phpunit": "^4.8.35 || ^5.7 || ^6.5"
},
"type": "library",
"autoload": {
"psr-4": {
"vipnytt\\": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "VIP nytt AS",
"email": "support@vipnytt.no",
"role": "Owner"
},
{
"name": "Jan-Petter Gundersen",
"email": "jpg@vipnytt.no",
"role": "Developer"
}
],
"description": "User-Agent parser for robot rule sets",
"homepage": "https://github.com/VIPnytt/UserAgentParser",
"keywords": [
"REP",
"Robots Exclusion Protocol",
"Robots meta tag",
"crawler",
"robot",
"robots.txt",
"spider",
"user-agent",
"useragent",
"x-robots-tag"
],
"time": "2017-12-17T14:23:27+00:00"
},
{
"name": "vlucas/phpdotenv",
"version": "v3.6.7",

storage/content_providers_lock/.gitignore

@@ -0,0 +1,2 @@
*
!.gitignore
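
This `*` plus `!.gitignore` pair is the usual trick for keeping an otherwise-empty directory in the repository while ignoring everything created in it at runtime; here, presumably the per-host lock files written by hostLock().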