Skip to content

Commit e8de7cd

Browse files
author
Christoph Singer
committed
Protect closing tags in html strings within scripts
1 parent 40c7643 commit e8de7cd

File tree

3 files changed

+37
-23
lines changed

3 files changed

+37
-23
lines changed

Tests/HtmlPageTest.php

+23
Original file line numberDiff line numberDiff line change
@@ -344,4 +344,27 @@ public function testSaveOnFileName()
344344
$hp->save(vfsStream::url('root/save.html'));
345345
$this->assertFileExists(vfsStream::url('root/save.html'));
346346
}
347+
348+
public function testEmbeddedScriptWithHtml()
349+
{
350+
// PHP's DOMDocument::loadHTML() tends to "eat" closing tags of HTML strings inside <script> elements
351+
// see https://stackoverflow.com/questions/24575136/domdocument-removes-html-tags-in-javascript-string
352+
$html = <<<END
353+
<!DOCTYPE html>
354+
<html lang="de">
355+
<head>
356+
<title>test</title>
357+
</head>
358+
<body>
359+
<div>
360+
<script>
361+
var html = '<b>Status</b><div>' + it_status_text + '</div>';
362+
</script>
363+
</div>
364+
</body>
365+
</html>
366+
END;
367+
$hp = new HtmlPage($html);
368+
$this->assertEquals($html . "\n", $hp->save());
369+
}
347370
}

src/Helpers.php

+13-3
Original file line numberDiff line numberDiff line change
@@ -74,8 +74,15 @@ public static function cssArrayToString($array)
7474
*/
7575
public static function getBodyNodeFromHtmlFragment($html, $charset = 'UTF-8')
7676
{
77-
$unsafeLibXml = \LIBXML_VERSION < 20900;
77+
7878
$html = '<html><body>' . $html . '</body></html>';
79+
$d = self::loadHtml($html, $charset);
80+
return $d->getElementsByTagName('body')->item(0);
81+
}
82+
83+
public static function loadHtml(string $html, $charset = 'UTF-8'): \DOMDocument
84+
{
85+
$unsafeLibXml = \LIBXML_VERSION < 20900;
7986
$current = libxml_use_internal_errors(true);
8087
if($unsafeLibXml) {
8188
$disableEntities = libxml_disable_entity_loader(true);
@@ -89,11 +96,14 @@ public static function getBodyNodeFromHtmlFragment($html, $charset = 'UTF-8')
8996
) {
9097
$html = mb_convert_encoding($html, 'HTML-ENTITIES', $charset);
9198
}
92-
@$d->loadHTML($html);
99+
// PHP's DOMDocument::loadHTML() tends to "eat" closing tags of HTML strings inside <script> elements
100+
// Passing the LIBXML_SCHEMA_CREATE option to loadHTML() seems to prevent this
101+
// see https://stackoverflow.com/questions/24575136/domdocument-removes-html-tags-in-javascript-string
102+
@$d->loadHTML($html, \LIBXML_SCHEMA_CREATE);
93103
libxml_use_internal_errors($current);
94104
if($unsafeLibXml) {
95105
libxml_disable_entity_loader($disableEntities);
96106
}
97-
return $d->getElementsByTagName('body')->item(0);
107+
return $d;
98108
}
99109
}

src/HtmlPage.php

+1-20
Original file line numberDiff line numberDiff line change
@@ -41,31 +41,12 @@ class HtmlPage
4141

4242
public function __construct($content = '', $url = '', $charset = 'UTF-8')
4343
{
44-
$unsafeLibXml = \LIBXML_VERSION < 20900;
4544
$this->charset = $charset;
4645
$this->url = $url;
4746
if ($content == '') {
4847
$content = '<!DOCTYPE html><html><head><title></title></head><body></body></html>';
4948
}
50-
$current = libxml_use_internal_errors(true);
51-
if($unsafeLibXml) {
52-
$disableEntities = libxml_disable_entity_loader(true);
53-
}
54-
55-
$this->dom = new \DOMDocument('1.0', $charset);
56-
$this->dom->validateOnParse = true;
57-
58-
59-
if (function_exists('mb_convert_encoding') && in_array(strtolower($charset), array_map('strtolower', mb_list_encodings()))) {
60-
$content = mb_convert_encoding($content, 'HTML-ENTITIES', $charset);
61-
}
62-
63-
@$this->dom->loadHTML($content);
64-
65-
libxml_use_internal_errors($current);
66-
if($unsafeLibXml) {
67-
libxml_disable_entity_loader($disableEntities);
68-
}
49+
$this->dom = Helpers::loadHtml($content, $charset);
6950
$this->crawler = new HtmlPageCrawler($this->dom);
7051
}
7152

0 commit comments

Comments
 (0)