PixivResolver: meta[name=keywords] からタグっぽい情報を抽出して保存

This commit is contained in:
shibafu 2019-04-29 16:06:40 +09:00
parent 085afd3318
commit c1330ff6e3

View File

@ -49,6 +49,43 @@ class PixivResolver implements Resolver
return str_replace('i.pximg.net', 'i.pixiv.cat', $pixivUrl); return str_replace('i.pximg.net', 'i.pixiv.cat', $pixivUrl);
} }
/**
* HTMLからタグとして利用可能な情報を抽出する
* @param string $html ページ HTML
* @return string[] タグ
*/
public function extractTags(string $html): array
{
$dom = new \DOMDocument();
@$dom->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
$xpath = new \DOMXPath($dom);
$nodes = $xpath->query("//meta[@name='keywords']");
if ($nodes->length === 0) {
return [];
}
$keywords = $nodes->item(0)->getAttribute('content');
$tags = [];
foreach (mb_split(',', $keywords) as $keyword) {
$keyword = trim($keyword);
if (empty($keyword)) {
continue;
}
// 一部の固定キーワードは無視
if (array_search($keyword, ['R-18', 'イラスト', 'pixiv', 'ピクシブ'], true)) {
continue;
}
$tags[] = preg_replace('/\s/', '_', $keyword);
}
return $tags;
}
public function resolve(string $url): Metadata public function resolve(string $url): Metadata
{ {
parse_str(parse_url($url, PHP_URL_QUERY), $params); parse_str(parse_url($url, PHP_URL_QUERY), $params);
@ -78,6 +115,8 @@ class PixivResolver implements Resolver
$metadata->image = $this->proxize($illustUrl); $metadata->image = $this->proxize($illustUrl);
$metadata->tags = $this->extractTags($res->getBody());
return $metadata; return $metadata;
} else { } else {
throw new \RuntimeException("{$res->getStatusCode()}: $url"); throw new \RuntimeException("{$res->getStatusCode()}: $url");