Skip to content

Commit 49db1c7

Browse files
committed
Use the HTML parsing method from Symfony 5.4.
This should improve compatibility with PHP 8.3 because it no longer calls mb_convert_encoding() with the deprecated 'HTML-ENTITIES' encoding.
1 parent 5147809 commit 49db1c7

File tree

2 files changed

+52
-19
lines changed

2 files changed

+52
-19
lines changed

.github/workflows/tests.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ jobs:
1414

1515
strategy:
1616
matrix:
17-
php: [7.4, 8.0, 8.1, 8.2]
17+
php: [7.4, 8.0, 8.1, 8.2, 8.3]
1818
dependency-version: [prefer-lowest, prefer-stable]
1919

2020
steps:

src/Helpers.php

+51-18
Original file line numberDiff line numberDiff line change
@@ -82,28 +82,61 @@ public static function getBodyNodeFromHtmlFragment($html, $charset = 'UTF-8')
8282

8383
public static function loadHtml(string $html, $charset = 'UTF-8'): \DOMDocument
8484
{
85-
$unsafeLibXml = \LIBXML_VERSION < 20900;
86-
$current = libxml_use_internal_errors(true);
87-
if($unsafeLibXml) {
85+
return self::parseXhtml($html, $charset);
86+
}
87+
/**
88+
* Function originally taken from Symfony\Component\DomCrawler\Crawler
89+
* (c) Fabien Potencier <fabien@symfony.com>
90+
* License: MIT
91+
*/
92+
private static function parseXhtml(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
93+
{
94+
$htmlContent = self::convertToHtmlEntities($htmlContent, $charset);
95+
96+
$internalErrors = libxml_use_internal_errors(true);
97+
if (\LIBXML_VERSION < 20900) {
8898
$disableEntities = libxml_disable_entity_loader(true);
8999
}
90-
$d = new \DOMDocument('1.0', $charset);
91-
$d->validateOnParse = true;
92-
if (function_exists('mb_convert_encoding') && in_array(
93-
strtolower($charset),
94-
array_map('strtolower', mb_list_encodings())
95-
)
96-
) {
97-
$html = mb_convert_encoding($html, 'HTML-ENTITIES', $charset);
100+
101+
$dom = new \DOMDocument('1.0', $charset);
102+
$dom->validateOnParse = true;
103+
104+
if ('' !== trim($htmlContent)) {
105+
// PHP DOMDocument->loadHTML method tends to "eat" closing tags in html strings within script elements
106+
// Option LIBXML_SCHEMA_CREATE seems to prevent this
107+
// see https://stackoverflow.com/questions/24575136/domdocument-removes-html-tags-in-javascript-string
108+
@$dom->loadHTML($htmlContent, \LIBXML_SCHEMA_CREATE);
98109
}
99-
// PHP DOMDocument->loadHTML method tends to "eat" closing tags in html strings within script elements
100-
// Option LIBXML_SCHEMA_CREATE seems to prevent this
101-
// see https://stackoverflow.com/questions/24575136/domdocument-removes-html-tags-in-javascript-string
102-
@$d->loadHTML($html, \LIBXML_SCHEMA_CREATE);
103-
libxml_use_internal_errors($current);
104-
if($unsafeLibXml) {
110+
111+
libxml_use_internal_errors($internalErrors);
112+
if (\LIBXML_VERSION < 20900) {
105113
libxml_disable_entity_loader($disableEntities);
106114
}
107-
return $d;
115+
116+
return $dom;
117+
}
118+
119+
/**
120+
* Converts characters outside the ASCII range to numeric HTML entities to ensure valid parsing.
121+
* Function taken from Symfony\Component\DomCrawler\Crawler
122+
* (c) Fabien Potencier <fabien@symfony.com>
123+
* License: MIT
124+
*/
125+
private static function convertToHtmlEntities(string $htmlContent, string $charset = 'UTF-8'): string
126+
{
127+
set_error_handler(function () { throw new \Exception(); });
128+
129+
try {
130+
return mb_encode_numericentity($htmlContent, [0x80, 0x10FFFF, 0, 0x1FFFFF], $charset);
131+
} catch (\Exception|\ValueError $e) {
132+
try {
133+
$htmlContent = iconv($charset, 'UTF-8', $htmlContent);
134+
$htmlContent = mb_encode_numericentity($htmlContent, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');
135+
} catch (\Exception|\ValueError $e) {
136+
}
137+
return $htmlContent;
138+
} finally {
139+
restore_error_handler();
140+
}
108141
}
109142
}

0 commit comments

Comments
 (0)