@@ -82,28 +82,61 @@ public static function getBodyNodeFromHtmlFragment($html, $charset = 'UTF-8')
82
82
83
83
/**
 * Parses an HTML string into a DOMDocument.
 *
 * Thin public entry point that delegates the actual work to parseXhtml().
 *
 * @param string      $html    HTML markup to parse
 * @param string|null $charset Character set of $html; null is treated as 'UTF-8'
 *
 * @return \DOMDocument
 */
public static function loadHtml(string $html, $charset = 'UTF-8'): \DOMDocument
{
    // $charset is untyped in this public signature, but parseXhtml() declares
    // it as string. Normalise it here so a null charset selects the default
    // instead of triggering a TypeError in the helper.
    return self::parseXhtml($html, (string) ($charset ?? 'UTF-8'));
}
87
/**
 * Parses HTML markup into a DOMDocument.
 *
 * Non-ASCII characters are first rewritten as numeric entities by
 * convertToHtmlEntities(). Markup that is empty after trimming is not handed
 * to the parser at all, so an empty document is returned for it.
 *
 * Function originally taken from Symfony\Component\DomCrawler\Crawler
 * (c) Fabien Potencier <fabien@symfony.com>
 * License: MIT
 *
 * @param string $htmlContent HTML markup to parse
 * @param string $charset     Character set of $htmlContent; also passed to the DOMDocument constructor
 *
 * @return \DOMDocument
 */
private static function parseXhtml(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
{
    $htmlContent = self::convertToHtmlEntities($htmlContent, $charset);

    // Both settings below are process-global; remember the previous values so
    // they can be put back once parsing is finished.
    $internalErrors = libxml_use_internal_errors(true);
    $legacyLibxml = \LIBXML_VERSION < 20900;
    if ($legacyLibxml) {
        // Only done for libxml older than 2.9.0.
        $disableEntities = libxml_disable_entity_loader(true);
    }

    try {
        $dom = new \DOMDocument('1.0', $charset);
        $dom->validateOnParse = true;

        if ('' !== trim($htmlContent)) {
            // PHP DOMDocument->loadHTML method tends to "eat" closing tags in html strings within script elements
            // Option LIBXML_SCHEMA_CREATE seems to prevent this
            // see https://stackoverflow.com/questions/24575136/domdocument-removes-html-tags-in-javascript-string
            @$dom->loadHTML($htmlContent, \LIBXML_SCHEMA_CREATE);
        }

        return $dom;
    } finally {
        // Restore the global libxml state even when parsing throws, so a
        // failure here cannot leak altered settings into unrelated code.
        libxml_use_internal_errors($internalErrors);
        if ($legacyLibxml) {
            libxml_disable_entity_loader($disableEntities);
        }
    }
}
118
+
119
/**
 * Rewrites every non-ASCII character of the markup as a numeric HTML entity
 * so that the result parses the same regardless of its original charset.
 *
 * The conversion is first attempted directly in $charset. If that fails, the
 * content is transcoded to UTF-8 with iconv and encoded again. If the
 * fallback fails too, the content is returned in whatever state it reached.
 *
 * Function taken from Symfony\Component\DomCrawler\Crawler
 * (c) Fabien Potencier <fabien@symfony.com>
 * License: MIT
 *
 * @param string $htmlContent Markup to convert
 * @param string $charset     Character set $htmlContent is encoded in
 *
 * @return string The markup with non-ASCII characters replaced by numeric entities where conversion succeeded
 */
private static function convertToHtmlEntities(string $htmlContent, string $charset = 'UTF-8'): string
{
    // Conversion map: code points 0x80 through 0x10FFFF, no offset, full mask.
    $nonAsciiMap = [0x80, 0x10FFFF, 0, 0x1FFFFF];

    // Turn PHP errors raised by the conversion functions into exceptions so
    // they can be handled by the catch blocks below.
    set_error_handler(function () {
        throw new \Exception();
    });

    try {
        try {
            return mb_encode_numericentity($htmlContent, $nonAsciiMap, $charset);
        } catch (\Exception|\ValueError $directFailure) {
            // Direct conversion failed; continue with the UTF-8 fallback.
        }

        try {
            $htmlContent = iconv($charset, 'UTF-8', $htmlContent);
            $htmlContent = mb_encode_numericentity($htmlContent, $nonAsciiMap, 'UTF-8');
        } catch (\Exception|\ValueError $fallbackFailure) {
            // Best effort only: keep whatever content we have at this point.
        }

        return $htmlContent;
    } finally {
        restore_error_handler();
    }
}
109
142
}
0 commit comments