From 8772facadfdb991883e1882549c8d7a68b0f5ab1 Mon Sep 17 00:00:00 2001 From: shibafu Date: Sun, 22 Sep 2019 01:44:01 +0900 Subject: [PATCH 1/2] =?UTF-8?q?JSON=E3=83=87=E3=82=B3=E3=83=BC=E3=83=89?= =?UTF-8?q?=E5=BE=8C=E3=81=ABHTML=E3=82=A8=E3=83=B3=E3=83=86=E3=82=A3?= =?UTF-8?q?=E3=83=86=E3=82=A3=E3=81=AE=E3=83=87=E3=82=B3=E3=83=BC=E3=83=89?= =?UTF-8?q?=E3=82=92=E8=A1=8C=E3=81=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit JSONデコード前にHTMLエンティティのデコードを行ってしまうと &quot; なども解除されてしまい、JSONとして不正な入力になる。 html_entity_decodeのオプションで除外しても良いが、改行以外は一応JSONとして正当なはずなので、安全に倒してJSONとしてのデコードを済ませてから処理する。 --- app/MetadataResolver/NijieResolver.php | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/app/MetadataResolver/NijieResolver.php b/app/MetadataResolver/NijieResolver.php index 8ce9315..4bbe5ca 100644 --- a/app/MetadataResolver/NijieResolver.php +++ b/app/MetadataResolver/NijieResolver.php @@ -36,12 +36,16 @@ class NijieResolver implements Resolver $metadata = $this->ogpResolver->parse($html); $crawler = new Crawler($html); - // DomCrawler内でjson内の日本語がHTMLエンティティに変換されるのでhtml_entity_decode - $json = html_entity_decode($crawler->filter('script[type="application/ld+json"]')->first()->text()); + $json = $crawler->filter('script[type="application/ld+json"]')->first()->text(); // 改行がそのまま入っていることがあるのでデコード前にエスケープが必要 $data = json_decode(preg_replace('/\r?\n/', '\n', $json), true); + // DomCrawler内でjson内の日本語がHTMLエンティティに変換されるので、全要素に対してhtml_entity_decode + array_walk_recursive($data, function (&$v) { + $v = html_entity_decode($v); + }); + $metadata->title = $data['name']; $metadata->description = '投稿者: ' . $data['author']['name'] . PHP_EOL . 
$data['description']; if ( From b3c98613e7b23c6aeddef9bdf179f1da614442b7 Mon Sep 17 00:00:00 2001 From: shibafu Date: Thu, 3 Oct 2019 23:34:57 +0900 Subject: [PATCH 2/2] =?UTF-8?q?=E3=83=86=E3=82=B9=E3=83=88=E3=81=AE?= =?UTF-8?q?=E8=BF=BD=E5=8A=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../MetadataResolver/NijieResolverTest.php | 19 +++ .../testHasHtmlInAuthorProfileResponse.html | 114 ++++++++++++++++++ 2 files changed, 133 insertions(+) create mode 100644 tests/fixture/Nijie/testHasHtmlInAuthorProfileResponse.html diff --git a/tests/Unit/MetadataResolver/NijieResolverTest.php b/tests/Unit/MetadataResolver/NijieResolverTest.php index 417a853..f0e8d32 100644 --- a/tests/Unit/MetadataResolver/NijieResolverTest.php +++ b/tests/Unit/MetadataResolver/NijieResolverTest.php @@ -129,4 +129,23 @@ class NijieResolverTest extends TestCase $this->assertSame('https://nijie.info/view.php?id=66384', (string) $this->handler->getLastRequest()->getUri()); } } + + public function testHasHtmlInAuthorProfile() + { + $responseText = file_get_contents(__DIR__ . '/../../fixture/Nijie/testHasHtmlInAuthorProfileResponse.html'); + + $this->createResolver(NijieResolver::class, $responseText); + + $metadata = $this->resolver->resolve('https://nijie.info/view.php?id=285698'); + $this->assertSame('JK文化祭コスプレ喫茶', $metadata->title); + $this->assertSame('投稿者: ままままま' . PHP_EOL . + 'https://www.pixiv.net/fanbox/creator/32045169' . PHP_EOL . + 'ピクシブのファンボックスでこっちに上げてた一次創作のノリでえっちなやつ描いてます' . PHP_EOL . + '二次創作のえっちなやつは相変わらずこっち' . PHP_EOL . 
'健全目なのはついったー', $metadata->description); + $this->assertSame('https://pic.nijie.net/02/nijie_picture/540086_20181028112046_0.png', $metadata->image); + $this->assertSame(['バニーガール'], $metadata->tags); + if ($this->shouldUseMock()) { + $this->assertSame('https://nijie.info/view.php?id=285698', (string) $this->handler->getLastRequest()->getUri()); + } + } } diff --git a/tests/fixture/Nijie/testHasHtmlInAuthorProfileResponse.html b/tests/fixture/Nijie/testHasHtmlInAuthorProfileResponse.html new file mode 100644 index 0000000..2cd5313 --- /dev/null +++ b/tests/fixture/Nijie/testHasHtmlInAuthorProfileResponse.html @@ -0,0 +1,114 @@ +JK文化祭コスプレ喫茶 | ままままま | ニジエ +

JK文化祭コスプレ喫茶 | ままままま

+ +
+ + + + +
+