diff --git a/Tests/HtmlPageTest.php b/Tests/HtmlPageTest.php
index 1dbd50a..e74f6db 100644
--- a/Tests/HtmlPageTest.php
+++ b/Tests/HtmlPageTest.php
@@ -344,4 +344,27 @@ public function testSaveOnFileName()
$hp->save(vfsStream::url('root/save.html'));
$this->assertFileExists(vfsStream::url('root/save.html'));
}
+
+ public function testEmbeddedScriptWithHtml()
+ {
+ // Regression test: the document must round-trip through save() unchanged. DOMDocument::loadHTML() tends to drop closing tags from HTML strings inside script elements,
+ // see https://stackoverflow.com/questions/24575136/domdocument-removes-html-tags-in-javascript-string
+ $html = <<
+
+
+ test
+
+
+
+
+
+
+
+END;
+ $hp = new HtmlPage($html);
+ $this->assertEquals($html . "\n", $hp->save());
+ }
}
diff --git a/src/Helpers.php b/src/Helpers.php
index f481168..af2c3c7 100644
--- a/src/Helpers.php
+++ b/src/Helpers.php
@@ -74,8 +74,15 @@ public static function cssArrayToString($array)
*/
public static function getBodyNodeFromHtmlFragment($html, $charset = 'UTF-8')
{
- $unsafeLibXml = \LIBXML_VERSION < 20900;
+
$html = '' . $html . '';
+ $d = self::loadHtml($html, $charset);
+ return $d->getElementsByTagName('body')->item(0);
+ }
+
+ public static function loadHtml(string $html, $charset = 'UTF-8'): \DOMDocument
+ {
+ $unsafeLibXml = \LIBXML_VERSION < 20900;
$current = libxml_use_internal_errors(true);
if($unsafeLibXml) {
$disableEntities = libxml_disable_entity_loader(true);
@@ -89,11 +96,14 @@ public static function getBodyNodeFromHtmlFragment($html, $charset = 'UTF-8')
) {
$html = mb_convert_encoding($html, 'HTML-ENTITIES', $charset);
}
- @$d->loadHTML($html);
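+ // DOMDocument::loadHTML() tends to drop closing tags from HTML strings inside script elements.
+ // Passing LIBXML_SCHEMA_CREATE seems to prevent this. NOTE(review): that constant is documented as an XSD schema-validation flag, so this is an empirical workaround - re-verify on libxml upgrades,
+ // see https://stackoverflow.com/questions/24575136/domdocument-removes-html-tags-in-javascript-string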
+ @$d->loadHTML($html, \LIBXML_SCHEMA_CREATE);
libxml_use_internal_errors($current);
if($unsafeLibXml) {
libxml_disable_entity_loader($disableEntities);
}
- return $d->getElementsByTagName('body')->item(0);
+ return $d;
}
}
diff --git a/src/HtmlPage.php b/src/HtmlPage.php
index 624c162..a1b6951 100644
--- a/src/HtmlPage.php
+++ b/src/HtmlPage.php
@@ -41,31 +41,12 @@ class HtmlPage
public function __construct($content = '', $url = '', $charset = 'UTF-8')
{
- $unsafeLibXml = \LIBXML_VERSION < 20900;
$this->charset = $charset;
$this->url = $url;
if ($content == '') {
$content = '';
}
- $current = libxml_use_internal_errors(true);
- if($unsafeLibXml) {
- $disableEntities = libxml_disable_entity_loader(true);
- }
-
- $this->dom = new \DOMDocument('1.0', $charset);
- $this->dom->validateOnParse = true;
-
-
- if (function_exists('mb_convert_encoding') && in_array(strtolower($charset), array_map('strtolower', mb_list_encodings()))) {
- $content = mb_convert_encoding($content, 'HTML-ENTITIES', $charset);
- }
-
- @$this->dom->loadHTML($content);
-
- libxml_use_internal_errors($current);
- if($unsafeLibXml) {
- libxml_disable_entity_loader($disableEntities);
- }
+ $this->dom = Helpers::loadHtml($content, $charset);
$this->crawler = new HtmlPageCrawler($this->dom);
}