作者 lyh

gx

  1 +<?php
  2 +/**
  3 + * Website: http://sourceforge.net/projects/simplehtmldom/
  4 + * Additional projects: http://sourceforge.net/projects/debugobject/
  5 + * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
  6 + *
  7 + * Licensed under The MIT License
  8 + * See the LICENSE file in the project root for more information.
  9 + *
  10 + * Authors:
  11 + * S.C. Chen
  12 + * John Schlick
  13 + * Rus Carroll
  14 + * logmanoriginal
  15 + *
  16 + * Contributors:
  17 + * Yousuke Kumakura
  18 + * Vadim Voituk
  19 + * Antcs
  20 + *
  21 + * Version Rev. 1.9.1 (291)
  22 + */
  23 +
  24 +define('HDOM_TYPE_ELEMENT', 1);
  25 +define('HDOM_TYPE_COMMENT', 2);
  26 +define('HDOM_TYPE_TEXT', 3);
  27 +define('HDOM_TYPE_ENDTAG', 4);
  28 +define('HDOM_TYPE_ROOT', 5);
  29 +define('HDOM_TYPE_UNKNOWN', 6);
  30 +define('HDOM_QUOTE_DOUBLE', 0);
  31 +define('HDOM_QUOTE_SINGLE', 1);
  32 +define('HDOM_QUOTE_NO', 3);
  33 +define('HDOM_INFO_BEGIN', 0);
  34 +define('HDOM_INFO_END', 1);
  35 +define('HDOM_INFO_QUOTE', 2);
  36 +define('HDOM_INFO_SPACE', 3);
  37 +define('HDOM_INFO_TEXT', 4);
  38 +define('HDOM_INFO_INNER', 5);
  39 +define('HDOM_INFO_OUTER', 6);
  40 +define('HDOM_INFO_ENDSPACE', 7);
  41 +
  42 +defined('DEFAULT_TARGET_CHARSET') || define('DEFAULT_TARGET_CHARSET', 'UTF-8');
  43 +defined('DEFAULT_BR_TEXT') || define('DEFAULT_BR_TEXT', "\r\n");
  44 +defined('DEFAULT_SPAN_TEXT') || define('DEFAULT_SPAN_TEXT', ' ');
  45 +defined('MAX_FILE_SIZE') || define('MAX_FILE_SIZE', 600000);
  46 +define('HDOM_SMARTY_AS_TEXT', 1);
  47 +
  48 +function file_get_html(
  49 + $url,
  50 + $use_include_path = false,
  51 + $context = null,
  52 + $offset = 0,
  53 + $maxLen = -1,
  54 + $lowercase = true,
  55 + $forceTagsClosed = true,
  56 + $target_charset = DEFAULT_TARGET_CHARSET,
  57 + $stripRN = true,
  58 + $defaultBRText = DEFAULT_BR_TEXT,
  59 + $defaultSpanText = DEFAULT_SPAN_TEXT)
  60 +{
  61 + if($maxLen <= 0) { $maxLen = MAX_FILE_SIZE; }
  62 +
  63 + $dom = new simple_html_dom(
  64 + null,
  65 + $lowercase,
  66 + $forceTagsClosed,
  67 + $target_charset,
  68 + $stripRN,
  69 + $defaultBRText,
  70 + $defaultSpanText
  71 + );
  72 +
  73 + /**
  74 + * For sourceforge users: uncomment the next line and comment the
  75 + * retrieve_url_contents line 2 lines down if it is not already done.
  76 + */
  77 + $arrContextOptions = [
  78 + 'ssl' => [
  79 + 'verify_peer' => false,
  80 + 'verify_peer_name' => false,
  81 + ]
  82 + ];
  83 +
  84 + $context = stream_context_create($arrContextOptions);
  85 + $contents = file_get_contents(
  86 + $url,
  87 + $use_include_path,
  88 + $context,
  89 + $offset,
  90 + $maxLen
  91 + );
  92 + // $contents = retrieve_url_contents($url);
  93 +
  94 + if (empty($contents) || strlen($contents) > $maxLen) {
  95 + $dom->clear();
  96 + return false;
  97 + }
  98 +
  99 + return $dom->load($contents, $lowercase, $stripRN);
  100 +}
  101 +
  102 +function str_get_html(
  103 + $str,
  104 + $lowercase = true,
  105 + $forceTagsClosed = true,
  106 + $target_charset = DEFAULT_TARGET_CHARSET,
  107 + $stripRN = true,
  108 + $defaultBRText = DEFAULT_BR_TEXT,
  109 + $defaultSpanText = DEFAULT_SPAN_TEXT)
  110 +{
  111 + $dom = new simple_html_dom(
  112 + null,
  113 + $lowercase,
  114 + $forceTagsClosed,
  115 + $target_charset,
  116 + $stripRN,
  117 + $defaultBRText,
  118 + $defaultSpanText
  119 + );
  120 +
  121 + if (empty($str) || strlen($str) > MAX_FILE_SIZE) {
  122 + $dom->clear();
  123 + return false;
  124 + }
  125 +
  126 + return $dom->load($str, $lowercase, $stripRN);
  127 +}
  128 +
  129 +function dump_html_tree($node, $show_attr = true, $deep = 0)
  130 +{
  131 + $node->dump($node);
  132 +}
  133 +
  134 +class simple_html_dom_node
  135 +{
  136 + public $nodetype = HDOM_TYPE_TEXT;
  137 + public $tag = 'text';
  138 + public $attr = array();
  139 + public $children = array();
  140 + public $nodes = array();
  141 + public $parent = null;
  142 + public $_ = array();
  143 + public $tag_start = 0;
  144 + private $dom = null;
  145 +
  146 + function __construct($dom)
  147 + {
  148 + $this->dom = $dom;
  149 + $dom->nodes[] = $this;
  150 + }
  151 +
  152 + function __destruct()
  153 + {
  154 + $this->clear();
  155 + }
  156 +
  157 + function __toString()
  158 + {
  159 + return $this->outertext();
  160 + }
  161 +
  162 + function clear()
  163 + {
  164 + $this->dom = null;
  165 + $this->nodes = null;
  166 + $this->parent = null;
  167 + $this->children = null;
  168 + }
  169 +
  170 + function dump($show_attr = true, $depth = 0)
  171 + {
  172 + echo str_repeat("\t", $depth) . $this->tag;
  173 +
  174 + if ($show_attr && count($this->attr) > 0) {
  175 + echo '(';
  176 + foreach ($this->attr as $k => $v) {
  177 + echo "[$k]=>\"$v\", ";
  178 + }
  179 + echo ')';
  180 + }
  181 +
  182 + echo "\n";
  183 +
  184 + if ($this->nodes) {
  185 + foreach ($this->nodes as $node) {
  186 + $node->dump($show_attr, $depth + 1);
  187 + }
  188 + }
  189 + }
  190 +
  191 + function dump_node($echo = true)
  192 + {
  193 + $string = $this->tag;
  194 +
  195 + if (count($this->attr) > 0) {
  196 + $string .= '(';
  197 + foreach ($this->attr as $k => $v) {
  198 + $string .= "[$k]=>\"$v\", ";
  199 + }
  200 + $string .= ')';
  201 + }
  202 +
  203 + if (count($this->_) > 0) {
  204 + $string .= ' $_ (';
  205 + foreach ($this->_ as $k => $v) {
  206 + if (is_array($v)) {
  207 + $string .= "[$k]=>(";
  208 + foreach ($v as $k2 => $v2) {
  209 + $string .= "[$k2]=>\"$v2\", ";
  210 + }
  211 + $string .= ')';
  212 + } else {
  213 + $string .= "[$k]=>\"$v\", ";
  214 + }
  215 + }
  216 + $string .= ')';
  217 + }
  218 +
  219 + if (isset($this->text)) {
  220 + $string .= " text: ({$this->text})";
  221 + }
  222 +
  223 + $string .= ' HDOM_INNER_INFO: ';
  224 +
  225 + if (isset($node->_[HDOM_INFO_INNER])) {
  226 + $string .= "'" . $node->_[HDOM_INFO_INNER] . "'";
  227 + } else {
  228 + $string .= ' NULL ';
  229 + }
  230 +
  231 + $string .= ' children: ' . count($this->children);
  232 + $string .= ' nodes: ' . count($this->nodes);
  233 + $string .= ' tag_start: ' . $this->tag_start;
  234 + $string .= "\n";
  235 +
  236 + if ($echo) {
  237 + echo $string;
  238 + return;
  239 + } else {
  240 + return $string;
  241 + }
  242 + }
  243 +
  244 + function parent($parent = null)
  245 + {
  246 + // I am SURE that this doesn't work properly.
  247 + // It fails to unset the current node from it's current parents nodes or
  248 + // children list first.
  249 + if ($parent !== null) {
  250 + $this->parent = $parent;
  251 + $this->parent->nodes[] = $this;
  252 + $this->parent->children[] = $this;
  253 + }
  254 +
  255 + return $this->parent;
  256 + }
  257 +
  258 + function has_child()
  259 + {
  260 + return !empty($this->children);
  261 + }
  262 +
  263 + function children($idx = -1)
  264 + {
  265 + if ($idx === -1) {
  266 + return $this->children;
  267 + }
  268 +
  269 + if (isset($this->children[$idx])) {
  270 + return $this->children[$idx];
  271 + }
  272 +
  273 + return null;
  274 + }
  275 +
  276 + function first_child()
  277 + {
  278 + if (count($this->children) > 0) {
  279 + return $this->children[0];
  280 + }
  281 + return null;
  282 + }
  283 +
  284 + function last_child()
  285 + {
  286 + if (count($this->children) > 0) {
  287 + return end($this->children);
  288 + }
  289 + return null;
  290 + }
  291 +
  292 + function next_sibling()
  293 + {
  294 + if ($this->parent === null) {
  295 + return null;
  296 + }
  297 +
  298 + $idx = array_search($this, $this->parent->children, true);
  299 +
  300 + if ($idx !== false && isset($this->parent->children[$idx + 1])) {
  301 + return $this->parent->children[$idx + 1];
  302 + }
  303 +
  304 + return null;
  305 + }
  306 +
  307 + function prev_sibling()
  308 + {
  309 + if ($this->parent === null) {
  310 + return null;
  311 + }
  312 +
  313 + $idx = array_search($this, $this->parent->children, true);
  314 +
  315 + if ($idx !== false && $idx > 0) {
  316 + return $this->parent->children[$idx - 1];
  317 + }
  318 +
  319 + return null;
  320 + }
  321 +
  322 + function find_ancestor_tag($tag)
  323 + {
  324 + global $debug_object;
  325 + if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
  326 +
  327 + if ($this->parent === null) {
  328 + return null;
  329 + }
  330 +
  331 + $ancestor = $this->parent;
  332 +
  333 + while (!is_null($ancestor)) {
  334 + if (is_object($debug_object)) {
  335 + $debug_object->debug_log(2, 'Current tag is: ' . $ancestor->tag);
  336 + }
  337 +
  338 + if ($ancestor->tag === $tag) {
  339 + break;
  340 + }
  341 +
  342 + $ancestor = $ancestor->parent;
  343 + }
  344 +
  345 + return $ancestor;
  346 + }
  347 +
  348 + function innertext()
  349 + {
  350 + if (isset($this->_[HDOM_INFO_INNER])) {
  351 + return $this->_[HDOM_INFO_INNER];
  352 + }
  353 +
  354 + if (isset($this->_[HDOM_INFO_TEXT])) {
  355 + return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
  356 + }
  357 +
  358 + $ret = '';
  359 +
  360 + foreach ($this->nodes as $n) {
  361 + $ret .= $n->outertext();
  362 + }
  363 +
  364 + return $ret;
  365 + }
  366 +
  367 + function outertext()
  368 + {
  369 + global $debug_object;
  370 +
  371 + if (is_object($debug_object)) {
  372 + $text = '';
  373 +
  374 + if ($this->tag === 'text') {
  375 + if (!empty($this->text)) {
  376 + $text = ' with text: ' . $this->text;
  377 + }
  378 + }
  379 +
  380 + $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text);
  381 + }
  382 +
  383 + if ($this->tag === 'root') {
  384 + return $this->innertext();
  385 + }
  386 +
  387 + // todo: What is the use of this callback? Remove?
  388 + if ($this->dom && $this->dom->callback !== null) {
  389 + call_user_func_array($this->dom->callback, array($this));
  390 + }
  391 +
  392 + if (isset($this->_[HDOM_INFO_OUTER])) {
  393 + return $this->_[HDOM_INFO_OUTER];
  394 + }
  395 +
  396 + if (isset($this->_[HDOM_INFO_TEXT])) {
  397 + return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
  398 + }
  399 +
  400 + $ret = '';
  401 +
  402 + if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) {
  403 + $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
  404 + }
  405 +
  406 + if (isset($this->_[HDOM_INFO_INNER])) {
  407 + // todo: <br> should either never have HDOM_INFO_INNER or always
  408 + if ($this->tag !== 'br') {
  409 + $ret .= $this->_[HDOM_INFO_INNER];
  410 + }
  411 + } elseif ($this->nodes) {
  412 + foreach ($this->nodes as $n) {
  413 + $ret .= $this->convert_text($n->outertext());
  414 + }
  415 + }
  416 +
  417 + if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0) {
  418 + $ret .= '</' . $this->tag . '>';
  419 + }
  420 +
  421 + return $ret;
  422 + }
  423 +
  424 + function text()
  425 + {
  426 + if (isset($this->_[HDOM_INFO_INNER])) {
  427 + return $this->_[HDOM_INFO_INNER];
  428 + }
  429 +
  430 + switch ($this->nodetype) {
  431 + case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
  432 + case HDOM_TYPE_COMMENT: return '';
  433 + case HDOM_TYPE_UNKNOWN: return '';
  434 + }
  435 +
  436 + if (strcasecmp($this->tag, 'script') === 0) { return ''; }
  437 + if (strcasecmp($this->tag, 'style') === 0) { return ''; }
  438 +
  439 + $ret = '';
  440 +
  441 + // In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed
  442 + // for some span tags, and some p tags) $this->nodes is set to NULL.
  443 + // NOTE: This indicates that there is a problem where it's set to NULL
  444 + // without a clear happening.
  445 + // WHY is this happening?
  446 + if (!is_null($this->nodes)) {
  447 + foreach ($this->nodes as $n) {
  448 + // Start paragraph after a blank line
  449 + if ($n->tag === 'p') {
  450 + $ret = trim($ret) . "\n\n";
  451 + }
  452 +
  453 + $ret .= $this->convert_text($n->text());
  454 +
  455 + // If this node is a span... add a space at the end of it so
  456 + // multiple spans don't run into each other. This is plaintext
  457 + // after all.
  458 + if ($n->tag === 'span') {
  459 + $ret .= $this->dom->default_span_text;
  460 + }
  461 + }
  462 + }
  463 + return $ret;
  464 + }
  465 +
  466 + function xmltext()
  467 + {
  468 + $ret = $this->innertext();
  469 + $ret = str_ireplace('<![CDATA[', '', $ret);
  470 + $ret = str_replace(']]>', '', $ret);
  471 + return $ret;
  472 + }
  473 +
  474 + function makeup()
  475 + {
  476 + // text, comment, unknown
  477 + if (isset($this->_[HDOM_INFO_TEXT])) {
  478 + return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
  479 + }
  480 +
  481 + $ret = '<' . $this->tag;
  482 + $i = -1;
  483 +
  484 + foreach ($this->attr as $key => $val) {
  485 + ++$i;
  486 +
  487 + // skip removed attribute
  488 + if ($val === null || $val === false) { continue; }
  489 +
  490 + $ret .= $this->_[HDOM_INFO_SPACE][$i][0];
  491 +
  492 + //no value attr: nowrap, checked selected...
  493 + if ($val === true) {
  494 + $ret .= $key;
  495 + } else {
  496 + switch ($this->_[HDOM_INFO_QUOTE][$i])
  497 + {
  498 + case HDOM_QUOTE_DOUBLE: $quote = '"'; break;
  499 + case HDOM_QUOTE_SINGLE: $quote = '\''; break;
  500 + default: $quote = '';
  501 + }
  502 +
  503 + $ret .= $key
  504 + . $this->_[HDOM_INFO_SPACE][$i][1]
  505 + . '='
  506 + . $this->_[HDOM_INFO_SPACE][$i][2]
  507 + . $quote
  508 + . $val
  509 + . $quote;
  510 + }
  511 + }
  512 +
  513 + $ret = $this->dom->restore_noise($ret);
  514 + return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>';
  515 + }
  516 +
  517 + function find($selector, $idx = null, $lowercase = false)
  518 + {
  519 + $selectors = $this->parse_selector($selector);
  520 + if (($count = count($selectors)) === 0) { return array(); }
  521 + $found_keys = array();
  522 +
  523 + // find each selector
  524 + for ($c = 0; $c < $count; ++$c) {
  525 + // The change on the below line was documented on the sourceforge
  526 + // code tracker id 2788009
  527 + // used to be: if (($levle=count($selectors[0]))===0) return array();
  528 + if (($levle = count($selectors[$c])) === 0) { return array(); }
  529 + if (!isset($this->_[HDOM_INFO_BEGIN])) { return array(); }
  530 +
  531 + $head = array($this->_[HDOM_INFO_BEGIN] => 1);
  532 + $cmd = ' '; // Combinator
  533 +
  534 + // handle descendant selectors, no recursive!
  535 + for ($l = 0; $l < $levle; ++$l) {
  536 + $ret = array();
  537 +
  538 + foreach ($head as $k => $v) {
  539 + $n = ($k === -1) ? $this->dom->root : $this->dom->nodes[$k];
  540 + //PaperG - Pass this optional parameter on to the seek function.
  541 + $n->seek($selectors[$c][$l], $ret, $cmd, $lowercase);
  542 + }
  543 +
  544 + $head = $ret;
  545 + $cmd = $selectors[$c][$l][4]; // Next Combinator
  546 + }
  547 +
  548 + foreach ($head as $k => $v) {
  549 + if (!isset($found_keys[$k])) {
  550 + $found_keys[$k] = 1;
  551 + }
  552 + }
  553 + }
  554 +
  555 + // sort keys
  556 + ksort($found_keys);
  557 +
  558 + $found = array();
  559 + foreach ($found_keys as $k => $v) {
  560 + $found[] = $this->dom->nodes[$k];
  561 + }
  562 +
  563 + // return nth-element or array
  564 + if (is_null($idx)) { return $found; }
  565 + elseif ($idx < 0) { $idx = count($found) + $idx; }
  566 + return (isset($found[$idx])) ? $found[$idx] : null;
  567 + }
  568 +
  569 + protected function seek($selector, &$ret, $parent_cmd, $lowercase = false)
  570 + {
  571 + global $debug_object;
  572 + if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
  573 +
  574 + list($tag, $id, $class, $attributes, $cmb) = $selector;
  575 + $nodes = array();
  576 +
  577 + if ($parent_cmd === ' ') { // Descendant Combinator
  578 + // Find parent closing tag if the current element doesn't have a closing
  579 + // tag (i.e. void element)
  580 + $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
  581 + if ($end == 0) {
  582 + $parent = $this->parent;
  583 + while (!isset($parent->_[HDOM_INFO_END]) && $parent !== null) {
  584 + $end -= 1;
  585 + $parent = $parent->parent;
  586 + }
  587 + $end += $parent->_[HDOM_INFO_END];
  588 + }
  589 +
  590 + // Get list of target nodes
  591 + $nodes_start = $this->_[HDOM_INFO_BEGIN] + 1;
  592 + $nodes_count = $end - $nodes_start;
  593 + $nodes = array_slice($this->dom->nodes, $nodes_start, $nodes_count, true);
  594 + } elseif ($parent_cmd === '>') { // Child Combinator
  595 + $nodes = $this->children;
  596 + } elseif ($parent_cmd === '+'
  597 + && $this->parent
  598 + && in_array($this, $this->parent->children)) { // Next-Sibling Combinator
  599 + $index = array_search($this, $this->parent->children, true) + 1;
  600 + if ($index < count($this->parent->children))
  601 + $nodes[] = $this->parent->children[$index];
  602 + } elseif ($parent_cmd === '~'
  603 + && $this->parent
  604 + && in_array($this, $this->parent->children)) { // Subsequent Sibling Combinator
  605 + $index = array_search($this, $this->parent->children, true);
  606 + $nodes = array_slice($this->parent->children, $index);
  607 + }
  608 +
  609 + // Go throgh each element starting at this element until the end tag
  610 + // Note: If this element is a void tag, any previous void element is
  611 + // skipped.
  612 + foreach($nodes as $node) {
  613 + $pass = true;
  614 +
  615 + // Skip root nodes
  616 + if(!$node->parent) {
  617 + $pass = false;
  618 + }
  619 +
  620 + // Handle 'text' selector
  621 + if($pass && $tag === 'text' && $node->tag === 'text') {
  622 + $ret[array_search($node, $this->dom->nodes, true)] = 1;
  623 + unset($node);
  624 + continue;
  625 + }
  626 +
  627 + // Skip if node isn't a child node (i.e. text nodes)
  628 + if($pass && !in_array($node, $node->parent->children, true)) {
  629 + $pass = false;
  630 + }
  631 +
  632 + // Skip if tag doesn't match
  633 + if ($pass && $tag !== '' && $tag !== $node->tag && $tag !== '*') {
  634 + $pass = false;
  635 + }
  636 +
  637 + // Skip if ID doesn't exist
  638 + if ($pass && $id !== '' && !isset($node->attr['id'])) {
  639 + $pass = false;
  640 + }
  641 +
  642 + // Check if ID matches
  643 + if ($pass && $id !== '' && isset($node->attr['id'])) {
  644 + // Note: Only consider the first ID (as browsers do)
  645 + $node_id = explode(' ', trim($node->attr['id']))[0];
  646 +
  647 + if($id !== $node_id) { $pass = false; }
  648 + }
  649 +
  650 + // Check if all class(es) exist
  651 + if ($pass && $class !== '' && is_array($class) && !empty($class)) {
  652 + if (isset($node->attr['class'])) {
  653 + $node_classes = explode(' ', $node->attr['class']);
  654 +
  655 + if ($lowercase) {
  656 + $node_classes = array_map('strtolower', $node_classes);
  657 + }
  658 +
  659 + foreach($class as $c) {
  660 + if(!in_array($c, $node_classes)) {
  661 + $pass = false;
  662 + break;
  663 + }
  664 + }
  665 + } else {
  666 + $pass = false;
  667 + }
  668 + }
  669 +
  670 + // Check attributes
  671 + if ($pass
  672 + && $attributes !== ''
  673 + && is_array($attributes)
  674 + && !empty($attributes)) {
  675 + foreach($attributes as $a) {
  676 + list (
  677 + $att_name,
  678 + $att_expr,
  679 + $att_val,
  680 + $att_inv,
  681 + $att_case_sensitivity
  682 + ) = $a;
  683 +
  684 + // Handle indexing attributes (i.e. "[2]")
  685 + /**
  686 + * Note: This is not supported by the CSS Standard but adds
  687 + * the ability to select items compatible to XPath (i.e.
  688 + * the 3rd element within it's parent).
  689 + *
  690 + * Note: This doesn't conflict with the CSS Standard which
  691 + * doesn't work on numeric attributes anyway.
  692 + */
  693 + if (is_numeric($att_name)
  694 + && $att_expr === ''
  695 + && $att_val === '') {
  696 + $count = 0;
  697 +
  698 + // Find index of current element in parent
  699 + foreach ($node->parent->children as $c) {
  700 + if ($c->tag === $node->tag) ++$count;
  701 + if ($c === $node) break;
  702 + }
  703 +
  704 + // If this is the correct node, continue with next
  705 + // attribute
  706 + if ($count === (int)$att_name) continue;
  707 + }
  708 +
  709 + // Check attribute availability
  710 + if ($att_inv) { // Attribute should NOT be set
  711 + if (isset($node->attr[$att_name])) {
  712 + $pass = false;
  713 + break;
  714 + }
  715 + } else { // Attribute should be set
  716 + // todo: "plaintext" is not a valid CSS selector!
  717 + if ($att_name !== 'plaintext'
  718 + && !isset($node->attr[$att_name])) {
  719 + $pass = false;
  720 + break;
  721 + }
  722 + }
  723 +
  724 + // Continue with next attribute if expression isn't defined
  725 + if ($att_expr === '') continue;
  726 +
  727 + // If they have told us that this is a "plaintext"
  728 + // search then we want the plaintext of the node - right?
  729 + // todo "plaintext" is not a valid CSS selector!
  730 + if ($att_name === 'plaintext') {
  731 + $nodeKeyValue = $node->text();
  732 + } else {
  733 + $nodeKeyValue = $node->attr[$att_name];
  734 + }
  735 +
  736 + if (is_object($debug_object)) {
  737 + $debug_object->debug_log(2,
  738 + 'testing node: '
  739 + . $node->tag
  740 + . ' for attribute: '
  741 + . $att_name
  742 + . $att_expr
  743 + . $att_val
  744 + . ' where nodes value is: '
  745 + . $nodeKeyValue
  746 + );
  747 + }
  748 +
  749 + // If lowercase is set, do a case insensitive test of
  750 + // the value of the selector.
  751 + if ($lowercase) {
  752 + $check = $this->match(
  753 + $att_expr,
  754 + strtolower($att_val),
  755 + strtolower($nodeKeyValue),
  756 + $att_case_sensitivity
  757 + );
  758 + } else {
  759 + $check = $this->match(
  760 + $att_expr,
  761 + $att_val,
  762 + $nodeKeyValue,
  763 + $att_case_sensitivity
  764 + );
  765 + }
  766 +
  767 + if (is_object($debug_object)) {
  768 + $debug_object->debug_log(2,
  769 + 'after match: '
  770 + . ($check ? 'true' : 'false')
  771 + );
  772 + }
  773 +
  774 + if (!$check) {
  775 + $pass = false;
  776 + break;
  777 + }
  778 + }
  779 + }
  780 +
  781 + // Found a match. Add to list and clear node
  782 + if ($pass) $ret[$node->_[HDOM_INFO_BEGIN]] = 1;
  783 + unset($node);
  784 + }
  785 + // It's passed by reference so this is actually what this function returns.
  786 + if (is_object($debug_object)) {
  787 + $debug_object->debug_log(1, 'EXIT - ret: ', $ret);
  788 + }
  789 + }
  790 +
  791 + protected function match($exp, $pattern, $value, $case_sensitivity)
  792 + {
  793 + global $debug_object;
  794 + if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}
  795 +
  796 + if ($case_sensitivity === 'i') {
  797 + $pattern = strtolower($pattern);
  798 + $value = strtolower($value);
  799 + }
  800 +
  801 + switch ($exp) {
  802 + case '=':
  803 + return ($value === $pattern);
  804 + case '!=':
  805 + return ($value !== $pattern);
  806 + case '^=':
  807 + return preg_match('/^' . preg_quote($pattern, '/') . '/', $value);
  808 + case '$=':
  809 + return preg_match('/' . preg_quote($pattern, '/') . '$/', $value);
  810 + case '*=':
  811 + return preg_match('/' . preg_quote($pattern, '/') . '/', $value);
  812 + case '|=':
  813 + /**
  814 + * [att|=val]
  815 + *
  816 + * Represents an element with the att attribute, its value
  817 + * either being exactly "val" or beginning with "val"
  818 + * immediately followed by "-" (U+002D).
  819 + */
  820 + return strpos($value, $pattern) === 0;
  821 + case '~=':
  822 + /**
  823 + * [att~=val]
  824 + *
  825 + * Represents an element with the att attribute whose value is a
  826 + * whitespace-separated list of words, one of which is exactly
  827 + * "val". If "val" contains whitespace, it will never represent
  828 + * anything (since the words are separated by spaces). Also if
  829 + * "val" is the empty string, it will never represent anything.
  830 + */
  831 + return in_array($pattern, explode(' ', trim($value)), true);
  832 + }
  833 + return false;
  834 + }
  835 +
  836 + protected function parse_selector($selector_string)
  837 + {
  838 + global $debug_object;
  839 + if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
  840 +
  841 + /**
  842 + * Pattern of CSS selectors, modified from mootools (https://mootools.net/)
  843 + *
  844 + * Paperg: Add the colon to the attribute, so that it properly finds
  845 + * <tag attr:ibute="something" > like google does.
  846 + *
  847 + * Note: if you try to look at this attribute, you MUST use getAttribute
  848 + * since $dom->x:y will fail the php syntax check.
  849 + *
  850 + * Notice the \[ starting the attribute? and the @? following? This
  851 + * implies that an attribute can begin with an @ sign that is not
  852 + * captured. This implies that an html attribute specifier may start
  853 + * with an @ sign that is NOT captured by the expression. Farther study
  854 + * is required to determine of this should be documented or removed.
  855 + *
  856 + * Matches selectors in this order:
  857 + *
  858 + * [0] - full match
  859 + *
  860 + * [1] - tag name
  861 + * ([\w:\*-]*)
  862 + * Matches the tag name consisting of zero or more words, colons,
  863 + * asterisks and hyphens.
  864 + *
  865 + * [2] - id name
  866 + * (?:\#([\w-]+))
  867 + * Optionally matches a id name, consisting of an "#" followed by
  868 + * the id name (one or more words and hyphens).
  869 + *
  870 + * [3] - class names (including dots)
  871 + * (?:\.([\w\.-]+))?
  872 + * Optionally matches a list of classs, consisting of an "."
  873 + * followed by the class name (one or more words and hyphens)
  874 + * where multiple classes can be chained (i.e. ".foo.bar.baz")
  875 + *
  876 + * [4] - attributes
  877 + * ((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?
  878 + * Optionally matches the attributes list
  879 + *
  880 + * [5] - separator
  881 + * ([\/, >+~]+)
  882 + * Matches the selector list separator
  883 + */
  884 + // phpcs:ignore Generic.Files.LineLength
  885 + $pattern = "/([\w:\*-]*)(?:\#([\w-]+))?(?:|\.([\w\.-]+))?((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?([\/, >+~]+)/is";
  886 +
  887 + preg_match_all(
  888 + $pattern,
  889 + trim($selector_string) . ' ', // Add final ' ' as pseudo separator
  890 + $matches,
  891 + PREG_SET_ORDER
  892 + );
  893 +
  894 + if (is_object($debug_object)) {
  895 + $debug_object->debug_log(2, 'Matches Array: ', $matches);
  896 + }
  897 +
  898 + $selectors = array();
  899 + $result = array();
  900 +
  901 + foreach ($matches as $m) {
  902 + $m[0] = trim($m[0]);
  903 +
  904 + // Skip NoOps
  905 + if ($m[0] === '' || $m[0] === '/' || $m[0] === '//') { continue; }
  906 +
  907 + // Convert to lowercase
  908 + if ($this->dom->lowercase) {
  909 + $m[1] = strtolower($m[1]);
  910 + }
  911 +
  912 + // Extract classes
  913 + if ($m[3] !== '') { $m[3] = explode('.', $m[3]); }
  914 +
  915 + /* Extract attributes (pattern based on the pattern above!)
  916 +
  917 + * [0] - full match
  918 + * [1] - attribute name
  919 + * [2] - attribute expression
  920 + * [3] - attribute value
  921 + * [4] - case sensitivity
  922 + *
  923 + * Note: Attributes can be negated with a "!" prefix to their name
  924 + */
  925 + if($m[4] !== '') {
  926 + preg_match_all(
  927 + "/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s+?([iIsS])?)?\]/is",
  928 + trim($m[4]),
  929 + $attributes,
  930 + PREG_SET_ORDER
  931 + );
  932 +
  933 + // Replace element by array
  934 + $m[4] = array();
  935 +
  936 + foreach($attributes as $att) {
  937 + // Skip empty matches
  938 + if(trim($att[0]) === '') { continue; }
  939 +
  940 + $inverted = (isset($att[1][0]) && $att[1][0] === '!');
  941 + $m[4][] = array(
  942 + $inverted ? substr($att[1], 1) : $att[1], // Name
  943 + (isset($att[2])) ? $att[2] : '', // Expression
  944 + (isset($att[3])) ? $att[3] : '', // Value
  945 + $inverted, // Inverted Flag
  946 + (isset($att[4])) ? strtolower($att[4]) : '', // Case-Sensitivity
  947 + );
  948 + }
  949 + }
  950 +
  951 + // Sanitize Separator
  952 + if ($m[5] !== '' && trim($m[5]) === '') { // Descendant Separator
  953 + $m[5] = ' ';
  954 + } else { // Other Separator
  955 + $m[5] = trim($m[5]);
  956 + }
  957 +
  958 + // Clear Separator if it's a Selector List
  959 + if ($is_list = ($m[5] === ',')) { $m[5] = ''; }
  960 +
  961 + // Remove full match before adding to results
  962 + array_shift($m);
  963 + $result[] = $m;
  964 +
  965 + if ($is_list) { // Selector List
  966 + $selectors[] = $result;
  967 + $result = array();
  968 + }
  969 + }
  970 +
  971 + if (count($result) > 0) { $selectors[] = $result; }
  972 + return $selectors;
  973 + }
  974 +
  975 + function __get($name)
  976 + {
  977 + if (isset($this->attr[$name])) {
  978 + return $this->convert_text($this->attr[$name]);
  979 + }
  980 + switch ($name) {
  981 + case 'outertext': return $this->outertext();
  982 + case 'innertext': return $this->innertext();
  983 + case 'plaintext': return $this->text();
  984 + case 'xmltext': return $this->xmltext();
  985 + default: return array_key_exists($name, $this->attr);
  986 + }
  987 + }
  988 +
  989 + function __set($name, $value)
  990 + {
  991 + global $debug_object;
  992 + if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
  993 +
  994 + switch ($name) {
  995 + case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value;
  996 + case 'innertext':
  997 + if (isset($this->_[HDOM_INFO_TEXT])) {
  998 + return $this->_[HDOM_INFO_TEXT] = $value;
  999 + }
  1000 + return $this->_[HDOM_INFO_INNER] = $value;
  1001 + }
  1002 +
  1003 + if (!isset($this->attr[$name])) {
  1004 + $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
  1005 + $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
  1006 + }
  1007 +
  1008 + $this->attr[$name] = $value;
  1009 + }
  1010 +
  1011 + function __isset($name)
  1012 + {
  1013 + switch ($name) {
  1014 + case 'outertext': return true;
  1015 + case 'innertext': return true;
  1016 + case 'plaintext': return true;
  1017 + }
  1018 + //no value attr: nowrap, checked selected...
  1019 + return (array_key_exists($name, $this->attr)) ? true : isset($this->attr[$name]);
  1020 + }
  1021 +
  1022 + function __unset($name)
  1023 + {
  1024 + if (isset($this->attr[$name])) { unset($this->attr[$name]); }
  1025 + }
  1026 +
  1027 + function convert_text($text)
  1028 + {
  1029 + global $debug_object;
  1030 + if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
  1031 +
  1032 + $converted_text = $text;
  1033 +
  1034 + $sourceCharset = '';
  1035 + $targetCharset = '';
  1036 +
  1037 + if ($this->dom) {
  1038 + $sourceCharset = strtoupper($this->dom->_charset);
  1039 + $targetCharset = strtoupper($this->dom->_target_charset);
  1040 + }
  1041 +
  1042 + if (is_object($debug_object)) {
  1043 + $debug_object->debug_log(3,
  1044 + 'source charset: '
  1045 + . $sourceCharset
  1046 + . ' target charaset: '
  1047 + . $targetCharset
  1048 + );
  1049 + }
  1050 +
  1051 + if (!empty($sourceCharset)
  1052 + && !empty($targetCharset)
  1053 + && (strcasecmp($sourceCharset, $targetCharset) != 0)) {
  1054 + // Check if the reported encoding could have been incorrect and the text is actually already UTF-8
  1055 + if ((strcasecmp($targetCharset, 'UTF-8') == 0)
  1056 + && ($this->is_utf8($text))) {
  1057 + $converted_text = $text;
  1058 + } else {
  1059 + $converted_text = iconv($sourceCharset, $targetCharset, $text);
  1060 + }
  1061 + }
  1062 +
  1063 + // Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output.
  1064 + if ($targetCharset === 'UTF-8') {
  1065 + if (substr($converted_text, 0, 3) === "\xef\xbb\xbf") {
  1066 + $converted_text = substr($converted_text, 3);
  1067 + }
  1068 +
  1069 + if (substr($converted_text, -3) === "\xef\xbb\xbf") {
  1070 + $converted_text = substr($converted_text, 0, -3);
  1071 + }
  1072 + }
  1073 +
  1074 + return $converted_text;
  1075 + }
  1076 +
  1077 + static function is_utf8($str)
  1078 + {
  1079 + $c = 0; $b = 0;
  1080 + $bits = 0;
  1081 + $len = strlen($str);
  1082 + for($i = 0; $i < $len; $i++) {
  1083 + $c = ord($str[$i]);
  1084 + if($c > 128) {
  1085 + if(($c >= 254)) { return false; }
  1086 + elseif($c >= 252) { $bits = 6; }
  1087 + elseif($c >= 248) { $bits = 5; }
  1088 + elseif($c >= 240) { $bits = 4; }
  1089 + elseif($c >= 224) { $bits = 3; }
  1090 + elseif($c >= 192) { $bits = 2; }
  1091 + else { return false; }
  1092 + if(($i + $bits) > $len) { return false; }
  1093 + while($bits > 1) {
  1094 + $i++;
  1095 + $b = ord($str[$i]);
  1096 + if($b < 128 || $b > 191) { return false; }
  1097 + $bits--;
  1098 + }
  1099 + }
  1100 + }
  1101 + return true;
  1102 + }
  1103 +
  1104 + function get_display_size()
  1105 + {
  1106 + global $debug_object;
  1107 +
  1108 + $width = -1;
  1109 + $height = -1;
  1110 +
  1111 + if ($this->tag !== 'img') {
  1112 + return false;
  1113 + }
  1114 +
  1115 + // See if there is aheight or width attribute in the tag itself.
  1116 + if (isset($this->attr['width'])) {
  1117 + $width = $this->attr['width'];
  1118 + }
  1119 +
  1120 + if (isset($this->attr['height'])) {
  1121 + $height = $this->attr['height'];
  1122 + }
  1123 +
  1124 + // Now look for an inline style.
  1125 + if (isset($this->attr['style'])) {
  1126 + // Thanks to user gnarf from stackoverflow for this regular expression.
  1127 + $attributes = array();
  1128 +
  1129 + preg_match_all(
  1130 + '/([\w-]+)\s*:\s*([^;]+)\s*;?/',
  1131 + $this->attr['style'],
  1132 + $matches,
  1133 + PREG_SET_ORDER
  1134 + );
  1135 +
  1136 + foreach ($matches as $match) {
  1137 + $attributes[$match[1]] = $match[2];
  1138 + }
  1139 +
  1140 + // If there is a width in the style attributes:
  1141 + if (isset($attributes['width']) && $width == -1) {
  1142 + // check that the last two characters are px (pixels)
  1143 + if (strtolower(substr($attributes['width'], -2)) === 'px') {
  1144 + $proposed_width = substr($attributes['width'], 0, -2);
  1145 + // Now make sure that it's an integer and not something stupid.
  1146 + if (filter_var($proposed_width, FILTER_VALIDATE_INT)) {
  1147 + $width = $proposed_width;
  1148 + }
  1149 + }
  1150 + }
  1151 +
  1152 + // If there is a width in the style attributes:
  1153 + if (isset($attributes['height']) && $height == -1) {
  1154 + // check that the last two characters are px (pixels)
  1155 + if (strtolower(substr($attributes['height'], -2)) == 'px') {
  1156 + $proposed_height = substr($attributes['height'], 0, -2);
  1157 + // Now make sure that it's an integer and not something stupid.
  1158 + if (filter_var($proposed_height, FILTER_VALIDATE_INT)) {
  1159 + $height = $proposed_height;
  1160 + }
  1161 + }
  1162 + }
  1163 +
  1164 + }
  1165 +
  1166 + // Future enhancement:
  1167 + // Look in the tag to see if there is a class or id specified that has
  1168 + // a height or width attribute to it.
  1169 +
  1170 + // Far future enhancement
  1171 + // Look at all the parent tags of this image to see if they specify a
  1172 + // class or id that has an img selector that specifies a height or width
  1173 + // Note that in this case, the class or id will have the img subselector
  1174 + // for it to apply to the image.
  1175 +
  1176 + // ridiculously far future development
  1177 + // If the class or id is specified in a SEPARATE css file thats not on
  1178 + // the page, go get it and do what we were just doing for the ones on
  1179 + // the page.
  1180 +
  1181 + $result = array(
  1182 + 'height' => $height,
  1183 + 'width' => $width
  1184 + );
  1185 +
  1186 + return $result;
  1187 + }
  1188 +
  1189 + function save($filepath = '')
  1190 + {
  1191 + $ret = $this->outertext();
  1192 +
  1193 + if ($filepath !== '') {
  1194 + file_put_contents($filepath, $ret, LOCK_EX);
  1195 + }
  1196 +
  1197 + return $ret;
  1198 + }
  1199 +
  1200 + function addClass($class)
  1201 + {
  1202 + if (is_string($class)) {
  1203 + $class = explode(' ', $class);
  1204 + }
  1205 +
  1206 + if (is_array($class)) {
  1207 + foreach($class as $c) {
  1208 + if (isset($this->class)) {
  1209 + if ($this->hasClass($c)) {
  1210 + continue;
  1211 + } else {
  1212 + $this->class .= ' ' . $c;
  1213 + }
  1214 + } else {
  1215 + $this->class = $c;
  1216 + }
  1217 + }
  1218 + } else {
  1219 + if (is_object($debug_object)) {
  1220 + $debug_object->debug_log(2, 'Invalid type: ', gettype($class));
  1221 + }
  1222 + }
  1223 + }
  1224 +
  1225 + function hasClass($class)
  1226 + {
  1227 + if (is_string($class)) {
  1228 + if (isset($this->class)) {
  1229 + return in_array($class, explode(' ', $this->class), true);
  1230 + }
  1231 + } else {
  1232 + if (is_object($debug_object)) {
  1233 + $debug_object->debug_log(2, 'Invalid type: ', gettype($class));
  1234 + }
  1235 + }
  1236 +
  1237 + return false;
  1238 + }
  1239 +
  1240 + function removeClass($class = null)
  1241 + {
  1242 + if (!isset($this->class)) {
  1243 + return;
  1244 + }
  1245 +
  1246 + if (is_null($class)) {
  1247 + $this->removeAttribute('class');
  1248 + return;
  1249 + }
  1250 +
  1251 + if (is_string($class)) {
  1252 + $class = explode(' ', $class);
  1253 + }
  1254 +
  1255 + if (is_array($class)) {
  1256 + $class = array_diff(explode(' ', $this->class), $class);
  1257 + if (empty($class)) {
  1258 + $this->removeAttribute('class');
  1259 + } else {
  1260 + $this->class = implode(' ', $class);
  1261 + }
  1262 + }
  1263 + }
  1264 +
  1265 + function getAllAttributes()
  1266 + {
  1267 + return $this->attr;
  1268 + }
  1269 +
  1270 + function getAttribute($name)
  1271 + {
  1272 + return $this->__get($name);
  1273 + }
  1274 +
  1275 + function setAttribute($name, $value)
  1276 + {
  1277 + $this->__set($name, $value);
  1278 + }
  1279 +
  1280 + function hasAttribute($name)
  1281 + {
  1282 + return $this->__isset($name);
  1283 + }
  1284 +
  1285 + function removeAttribute($name)
  1286 + {
  1287 + $this->__set($name, null);
  1288 + }
  1289 +
  1290 + function remove()
  1291 + {
  1292 + if ($this->parent) {
  1293 + $this->parent->removeChild($this);
  1294 + }
  1295 + }
  1296 +
  1297 + function removeChild($node)
  1298 + {
  1299 + $nidx = array_search($node, $this->nodes, true);
  1300 + $cidx = array_search($node, $this->children, true);
  1301 + $didx = array_search($node, $this->dom->nodes, true);
  1302 +
  1303 + if ($nidx !== false && $cidx !== false && $didx !== false) {
  1304 +
  1305 + foreach($node->children as $child) {
  1306 + $node->removeChild($child);
  1307 + }
  1308 +
  1309 + foreach($node->nodes as $entity) {
  1310 + $enidx = array_search($entity, $node->nodes, true);
  1311 + $edidx = array_search($entity, $node->dom->nodes, true);
  1312 +
  1313 + if ($enidx !== false && $edidx !== false) {
  1314 + unset($node->nodes[$enidx]);
  1315 + unset($node->dom->nodes[$edidx]);
  1316 + }
  1317 + }
  1318 +
  1319 + unset($this->nodes[$nidx]);
  1320 + unset($this->children[$cidx]);
  1321 + unset($this->dom->nodes[$didx]);
  1322 +
  1323 + $node->clear();
  1324 +
  1325 + }
  1326 + }
  1327 +
  1328 + function getElementById($id)
  1329 + {
  1330 + return $this->find("#$id", 0);
  1331 + }
  1332 +
  1333 + function getElementsById($id, $idx = null)
  1334 + {
  1335 + return $this->find("#$id", $idx);
  1336 + }
  1337 +
  1338 + function getElementByTagName($name)
  1339 + {
  1340 + return $this->find($name, 0);
  1341 + }
  1342 +
  1343 + function getElementsByTagName($name, $idx = null)
  1344 + {
  1345 + return $this->find($name, $idx);
  1346 + }
  1347 +
  1348 + function parentNode()
  1349 + {
  1350 + return $this->parent();
  1351 + }
  1352 +
  1353 + function childNodes($idx = -1)
  1354 + {
  1355 + return $this->children($idx);
  1356 + }
  1357 +
  1358 + function firstChild()
  1359 + {
  1360 + return $this->first_child();
  1361 + }
  1362 +
  1363 + function lastChild()
  1364 + {
  1365 + return $this->last_child();
  1366 + }
  1367 +
  1368 + function nextSibling()
  1369 + {
  1370 + return $this->next_sibling();
  1371 + }
  1372 +
  1373 + function previousSibling()
  1374 + {
  1375 + return $this->prev_sibling();
  1376 + }
  1377 +
  1378 + function hasChildNodes()
  1379 + {
  1380 + return $this->has_child();
  1381 + }
  1382 +
  1383 + function nodeName()
  1384 + {
  1385 + return $this->tag;
  1386 + }
  1387 +
  1388 + function appendChild($node)
  1389 + {
  1390 + $node->parent($this);
  1391 + return $node;
  1392 + }
  1393 +
  1394 +}
  1395 +
  1396 +class simple_html_dom
  1397 +{
    // Root node of the parsed document (created in prepare())
    public $root = null;
    // Node list of the document (reset in prepare(), walked in clear())
    public $nodes = array();
    // Callback registered via set_callback(); where it is invoked is not
    // visible in this part of the file
    public $callback = null;
    // Whether tag and attribute names are converted to lowercase while parsing
    public $lowercase = false;
    // Size in bytes of the trimmed document before any noise was removed
    public $original_size;
    // Current size in bytes of the working document
    public $size;

    // Parser state: current offset into $doc, the working document and the
    // character at the current offset (null once the end was reached)
    protected $pos;
    protected $doc;
    protected $char;

    // Running node counter, stored in the BEGIN/END info of nodes
    protected $cursor;
    // Node that currently acts as parent for newly parsed nodes
    protected $parent;
    // Reset in prepare(); presumably holds the fragments cut out by
    // remove_noise() until restore_noise() puts them back -- TODO confirm
    protected $noise = array();
    // Whitespace characters skipped between tokens
    protected $token_blank = " \t\r\n";
    // Characters that terminate an attribute name
    protected $token_equal = ' =/>';
    // Characters that terminate a tag name
    protected $token_slash = " />\r\n\t";
    // Characters that terminate an unquoted attribute value
    protected $token_attr = ' >';

    // Charset determined by parse_charset()
    public $_charset = '';
    // Charset requested by the caller (constructor argument)
    public $_target_charset = '';

    // Inner text assigned to <br> elements (see read_tag())
    protected $default_br_text = '';

    // Set from the $defaultSpanText argument in prepare()
    public $default_span_text = '';

    // Tags that never become the parent of following nodes (void elements)
    protected $self_closing_tags = array(
        'area' => 1,
        'base' => 1,
        'br' => 1,
        'col' => 1,
        'embed' => 1,
        'hr' => 1,
        'img' => 1,
        'input' => 1,
        'link' => 1,
        'meta' => 1,
        'param' => 1,
        'source' => 1,
        'track' => 1,
        'wbr' => 1
    );
    // End tags of these elements may close open ancestors whose own end tag
    // is missing (see the end tag handling in read_tag())
    protected $block_tags = array(
        'body' => 1,
        'div' => 1,
        'form' => 1,
        'root' => 1,
        'span' => 1,
        'table' => 1
    );
    // Key: tag being opened. Value: set of open parent tags that are
    // implicitly closed when the key tag starts.
    protected $optional_closing_tags = array(
        // Not optional, see
        // https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element
        'b' => array('b' => 1),
        'dd' => array('dd' => 1, 'dt' => 1),
        // Not optional, see
        // https://www.w3.org/TR/html/grouping-content.html#the-dl-element
        'dl' => array('dd' => 1, 'dt' => 1),
        'dt' => array('dd' => 1, 'dt' => 1),
        'li' => array('li' => 1),
        'optgroup' => array('optgroup' => 1, 'option' => 1),
        'option' => array('optgroup' => 1, 'option' => 1),
        'p' => array('p' => 1),
        'rp' => array('rp' => 1, 'rt' => 1),
        'rt' => array('rp' => 1, 'rt' => 1),
        'td' => array('td' => 1, 'th' => 1),
        'th' => array('td' => 1, 'th' => 1),
        'tr' => array('td' => 1, 'th' => 1, 'tr' => 1),
    );
  1467 +
  1468 + function __construct(
  1469 + $str = null,
  1470 + $lowercase = true,
  1471 + $forceTagsClosed = true,
  1472 + $target_charset = DEFAULT_TARGET_CHARSET,
  1473 + $stripRN = true,
  1474 + $defaultBRText = DEFAULT_BR_TEXT,
  1475 + $defaultSpanText = DEFAULT_SPAN_TEXT,
  1476 + $options = 0)
  1477 + {
  1478 + if ($str) {
  1479 + if (preg_match('/^http:\/\//i', $str) || is_file($str)) {
  1480 + $this->load_file($str);
  1481 + } else {
  1482 + $this->load(
  1483 + $str,
  1484 + $lowercase,
  1485 + $stripRN,
  1486 + $defaultBRText,
  1487 + $defaultSpanText,
  1488 + $options
  1489 + );
  1490 + }
  1491 + }
  1492 + // Forcing tags to be closed implies that we don't trust the html, but
  1493 + // it can lead to parsing errors if we SHOULD trust the html.
  1494 + if (!$forceTagsClosed) {
  1495 + $this->optional_closing_array = array();
  1496 + }
  1497 +
  1498 + $this->_target_charset = $target_charset;
  1499 + }
  1500 +
  1501 + function __destruct()
  1502 + {
  1503 + $this->clear();
  1504 + }
  1505 +
  1506 + function load(
  1507 + $str,
  1508 + $lowercase = true,
  1509 + $stripRN = true,
  1510 + $defaultBRText = DEFAULT_BR_TEXT,
  1511 + $defaultSpanText = DEFAULT_SPAN_TEXT,
  1512 + $options = 0)
  1513 + {
  1514 + global $debug_object;
  1515 +
  1516 + // prepare
  1517 + $this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);
  1518 +
  1519 + // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
  1520 + // Script tags removal now preceeds style tag removal.
  1521 + // strip out <script> tags
  1522 + $this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is");
  1523 + $this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is");
  1524 +
  1525 + // strip out the \r \n's if we are told to.
  1526 + if ($stripRN) {
  1527 + $this->doc = str_replace("\r", ' ', $this->doc);
  1528 + $this->doc = str_replace("\n", ' ', $this->doc);
  1529 +
  1530 + // set the length of content since we have changed it.
  1531 + $this->size = strlen($this->doc);
  1532 + }
  1533 +
  1534 + // strip out cdata
  1535 + $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true);
  1536 + // strip out comments
  1537 + $this->remove_noise("'<!--(.*?)-->'is");
  1538 + // strip out <style> tags
  1539 + $this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is");
  1540 + $this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is");
  1541 + // strip out preformatted tags
  1542 + $this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is");
  1543 + // strip out server side scripts
  1544 + $this->remove_noise("'(<\?)(.*?)(\?>)'s", true);
  1545 +
  1546 + if($options & HDOM_SMARTY_AS_TEXT) { // Strip Smarty scripts
  1547 + $this->remove_noise("'(\{\w)(.*?)(\})'s", true);
  1548 + }
  1549 +
  1550 + // parsing
  1551 + $this->parse();
  1552 + // end
  1553 + $this->root->_[HDOM_INFO_END] = $this->cursor;
  1554 + $this->parse_charset();
  1555 +
  1556 + // make load function chainable
  1557 + return $this;
  1558 + }
  1559 +
  1560 + function load_file()
  1561 + {
  1562 + $args = func_get_args();
  1563 +
  1564 + if(($doc = call_user_func_array('file_get_contents', $args)) !== false) {
  1565 + $this->load($doc, true);
  1566 + } else {
  1567 + return false;
  1568 + }
  1569 + }
  1570 +
  1571 + function set_callback($function_name)
  1572 + {
  1573 + $this->callback = $function_name;
  1574 + }
  1575 +
  1576 + function remove_callback()
  1577 + {
  1578 + $this->callback = null;
  1579 + }
  1580 +
  1581 + function save($filepath = '')
  1582 + {
  1583 + $ret = $this->root->innertext();
  1584 + if ($filepath !== '') { file_put_contents($filepath, $ret, LOCK_EX); }
  1585 + return $ret;
  1586 + }
  1587 +
  1588 + function find($selector, $idx = null, $lowercase = false)
  1589 + {
  1590 + return $this->root->find($selector, $idx, $lowercase);
  1591 + }
  1592 +
  1593 + function clear()
  1594 + {
  1595 + if (isset($this->nodes)) {
  1596 + foreach ($this->nodes as $n) {
  1597 + $n->clear();
  1598 + $n = null;
  1599 + }
  1600 + }
  1601 +
  1602 + // This add next line is documented in the sourceforge repository.
  1603 + // 2977248 as a fix for ongoing memory leaks that occur even with the
  1604 + // use of clear.
  1605 + if (isset($this->children)) {
  1606 + foreach ($this->children as $n) {
  1607 + $n->clear();
  1608 + $n = null;
  1609 + }
  1610 + }
  1611 +
  1612 + if (isset($this->parent)) {
  1613 + $this->parent->clear();
  1614 + unset($this->parent);
  1615 + }
  1616 +
  1617 + if (isset($this->root)) {
  1618 + $this->root->clear();
  1619 + unset($this->root);
  1620 + }
  1621 +
  1622 + unset($this->doc);
  1623 + unset($this->noise);
  1624 + }
  1625 +
  1626 + function dump($show_attr = true)
  1627 + {
  1628 + $this->root->dump($show_attr);
  1629 + }
  1630 +
  1631 + protected function prepare(
  1632 + $str, $lowercase = true,
  1633 + $defaultBRText = DEFAULT_BR_TEXT,
  1634 + $defaultSpanText = DEFAULT_SPAN_TEXT)
  1635 + {
  1636 + $this->clear();
  1637 +
  1638 + $this->doc = trim($str);
  1639 + $this->size = strlen($this->doc);
  1640 + $this->original_size = $this->size; // original size of the html
  1641 + $this->pos = 0;
  1642 + $this->cursor = 1;
  1643 + $this->noise = array();
  1644 + $this->nodes = array();
  1645 + $this->lowercase = $lowercase;
  1646 + $this->default_br_text = $defaultBRText;
  1647 + $this->default_span_text = $defaultSpanText;
  1648 + $this->root = new simple_html_dom_node($this);
  1649 + $this->root->tag = 'root';
  1650 + $this->root->_[HDOM_INFO_BEGIN] = -1;
  1651 + $this->root->nodetype = HDOM_TYPE_ROOT;
  1652 + $this->parent = $this->root;
  1653 + if ($this->size > 0) { $this->char = $this->doc[0]; }
  1654 + }
  1655 +
  1656 + protected function parse()
  1657 + {
  1658 + while (true) {
  1659 + // Read next tag if there is no text between current position and the
  1660 + // next opening tag.
  1661 + if (($s = $this->copy_until_char('<')) === '') {
  1662 + if($this->read_tag()) {
  1663 + continue;
  1664 + } else {
  1665 + return true;
  1666 + }
  1667 + }
  1668 +
  1669 + // Add a text node for text between tags
  1670 + $node = new simple_html_dom_node($this);
  1671 + ++$this->cursor;
  1672 + $node->_[HDOM_INFO_TEXT] = $s;
  1673 + $this->link_nodes($node, false);
  1674 + }
  1675 + }
  1676 +
  1677 + protected function parse_charset()
  1678 + {
  1679 + global $debug_object;
  1680 +
  1681 + $charset = null;
  1682 +
  1683 + if (function_exists('get_last_retrieve_url_contents_content_type')) {
  1684 + $contentTypeHeader = get_last_retrieve_url_contents_content_type();
  1685 + $success = preg_match('/charset=(.+)/', $contentTypeHeader, $matches);
  1686 + if ($success) {
  1687 + $charset = $matches[1];
  1688 + if (is_object($debug_object)) {
  1689 + $debug_object->debug_log(2,
  1690 + 'header content-type found charset of: '
  1691 + . $charset
  1692 + );
  1693 + }
  1694 + }
  1695 + }
  1696 +
  1697 + if (empty($charset)) {
  1698 + // https://www.w3.org/TR/html/document-metadata.html#statedef-http-equiv-content-type
  1699 + $el = $this->root->find('meta[http-equiv=Content-Type]', 0, true);
  1700 +
  1701 + if (!empty($el)) {
  1702 + $fullvalue = $el->content;
  1703 + if (is_object($debug_object)) {
  1704 + $debug_object->debug_log(2,
  1705 + 'meta content-type tag found'
  1706 + . $fullvalue
  1707 + );
  1708 + }
  1709 +
  1710 + if (!empty($fullvalue)) {
  1711 + $success = preg_match(
  1712 + '/charset=(.+)/i',
  1713 + $fullvalue,
  1714 + $matches
  1715 + );
  1716 +
  1717 + if ($success) {
  1718 + $charset = $matches[1];
  1719 + } else {
  1720 + // If there is a meta tag, and they don't specify the
  1721 + // character set, research says that it's typically
  1722 + // ISO-8859-1
  1723 + if (is_object($debug_object)) {
  1724 + $debug_object->debug_log(2,
  1725 + 'meta content-type tag couldn\'t be parsed. using iso-8859 default.'
  1726 + );
  1727 + }
  1728 +
  1729 + $charset = 'ISO-8859-1';
  1730 + }
  1731 + }
  1732 + }
  1733 + }
  1734 +
  1735 + if (empty($charset)) {
  1736 + // https://www.w3.org/TR/html/document-metadata.html#character-encoding-declaration
  1737 + if ($meta = $this->root->find('meta[charset]', 0)) {
  1738 + $charset = $meta->charset;
  1739 + if (is_object($debug_object)) {
  1740 + $debug_object->debug_log(2, 'meta charset: ' . $charset);
  1741 + }
  1742 + }
  1743 + }
  1744 +
  1745 + if (empty($charset)) {
  1746 + // Try to guess the charset based on the content
  1747 + // Requires Multibyte String (mbstring) support (optional)
  1748 + if (function_exists('mb_detect_encoding')) {
  1749 + /**
  1750 + * mb_detect_encoding() is not intended to distinguish between
  1751 + * charsets, especially single-byte charsets. Its primary
  1752 + * purpose is to detect which multibyte encoding is in use,
  1753 + * i.e. UTF-8, UTF-16, shift-JIS, etc.
  1754 + *
  1755 + * -- https://bugs.php.net/bug.php?id=38138
  1756 + *
  1757 + * Adding both CP1251/ISO-8859-5 and CP1252/ISO-8859-1 will
  1758 + * always result in CP1251/ISO-8859-5 and vice versa.
  1759 + *
  1760 + * Thus, only detect if it's either UTF-8 or CP1252/ISO-8859-1
  1761 + * to stay compatible.
  1762 + */
  1763 + $encoding = mb_detect_encoding(
  1764 + $this->doc,
  1765 + array( 'UTF-8', 'CP1252', 'ISO-8859-1' )
  1766 + );
  1767 +
  1768 + if ($encoding === 'CP1252' || $encoding === 'ISO-8859-1') {
  1769 + // Due to a limitation of mb_detect_encoding
  1770 + // 'CP1251'/'ISO-8859-5' will be detected as
  1771 + // 'CP1252'/'ISO-8859-1'. This will cause iconv to fail, in
  1772 + // which case we can simply assume it is the other charset.
  1773 + if (!@iconv('CP1252', 'UTF-8', $this->doc)) {
  1774 + $encoding = 'CP1251';
  1775 + }
  1776 + }
  1777 +
  1778 + if ($encoding !== false) {
  1779 + $charset = $encoding;
  1780 + if (is_object($debug_object)) {
  1781 + $debug_object->debug_log(2, 'mb_detect: ' . $charset);
  1782 + }
  1783 + }
  1784 + }
  1785 + }
  1786 +
  1787 + if (empty($charset)) {
  1788 + // Assume it's UTF-8 as it is the most likely charset to be used
  1789 + $charset = 'UTF-8';
  1790 + if (is_object($debug_object)) {
  1791 + $debug_object->debug_log(2, 'No match found, assume ' . $charset);
  1792 + }
  1793 + }
  1794 +
  1795 + // Since CP1252 is a superset, if we get one of it's subsets, we want
  1796 + // it instead.
  1797 + if ((strtolower($charset) == 'iso-8859-1')
  1798 + || (strtolower($charset) == 'latin1')
  1799 + || (strtolower($charset) == 'latin-1')) {
  1800 + $charset = 'CP1252';
  1801 + if (is_object($debug_object)) {
  1802 + $debug_object->debug_log(2,
  1803 + 'replacing ' . $charset . ' with CP1252 as its a superset'
  1804 + );
  1805 + }
  1806 + }
  1807 +
  1808 + if (is_object($debug_object)) {
  1809 + $debug_object->debug_log(1, 'EXIT - ' . $charset);
  1810 + }
  1811 +
  1812 + return $this->_charset = $charset;
  1813 + }
  1814 +
    /**
     * Read one tag at the current position and add it to the node tree.
     *
     * Handles end tags (including end tags that do not match the current
     * parent), doctype/CDATA/comment constructs, malformed tags (added as
     * text) and regular start tags with their attributes.
     *
     * @return bool False if the current character is not "<" (no further tag
     * was found); otherwise true, or whatever as_text_node() returns when an
     * unmatched end tag is added as text.
     */
    protected function read_tag()
    {
        // Set end position if no further tags found
        if ($this->char !== '<') {
            $this->root->_[HDOM_INFO_END] = $this->cursor;
            return false;
        }

        $begin_tag_pos = $this->pos;
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

        // end tag
        if ($this->char === '/') {
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

            // Skip whitespace in end tags (i.e. in "</ html>")
            $this->skip($this->token_blank);
            $tag = $this->copy_until_char('>');

            // Skip attributes in end tags
            if (($pos = strpos($tag, ' ')) !== false) {
                $tag = substr($tag, 0, $pos);
            }

            $parent_lower = strtolower($this->parent->tag);
            $tag_lower = strtolower($tag);

            // The end tag is supposed to close the parent tag. Handle situations
            // when it doesn't
            if ($parent_lower !== $tag_lower) {
                // Parent tag does not have to be closed necessarily (optional closing tag)
                // Current tag is a block tag, so it may close an ancestor
                if (isset($this->optional_closing_tags[$parent_lower])
                    && isset($this->block_tags[$tag_lower])) {

                    $this->parent->_[HDOM_INFO_END] = 0;
                    $org_parent = $this->parent;

                    // Traverse ancestors to find a matching opening tag
                    // Stop at root node
                    while (($this->parent->parent)
                        && strtolower($this->parent->tag) !== $tag_lower
                    ){
                        $this->parent = $this->parent->parent;
                    }

                    // If we don't have a match add current tag as text node
                    if (strtolower($this->parent->tag) !== $tag_lower) {
                        $this->parent = $org_parent; // restore original parent

                        if ($this->parent->parent) {
                            $this->parent = $this->parent->parent;
                        }

                        $this->parent->_[HDOM_INFO_END] = $this->cursor;
                        return $this->as_text_node($tag);
                    }
                } elseif (($this->parent->parent)
                    && isset($this->block_tags[$tag_lower])
                ) {
                    // Grandparent exists and current tag is a block tag, so our
                    // parent doesn't have an end tag
                    $this->parent->_[HDOM_INFO_END] = 0; // No end tag
                    $org_parent = $this->parent;

                    // Traverse ancestors to find a matching opening tag
                    // Stop at root node
                    while (($this->parent->parent)
                        && strtolower($this->parent->tag) !== $tag_lower
                    ) {
                        $this->parent = $this->parent->parent;
                    }

                    // If we don't have a match add current tag as text node
                    if (strtolower($this->parent->tag) !== $tag_lower) {
                        $this->parent = $org_parent; // restore original parent
                        $this->parent->_[HDOM_INFO_END] = $this->cursor;
                        return $this->as_text_node($tag);
                    }
                } elseif (($this->parent->parent)
                    && strtolower($this->parent->parent->tag) === $tag_lower
                ) { // Grandparent exists and current tag closes it
                    $this->parent->_[HDOM_INFO_END] = 0;
                    $this->parent = $this->parent->parent;
                } else { // Random tag, add as text node
                    return $this->as_text_node($tag);
                }
            }

            // Set end position of parent tag to current cursor position
            $this->parent->_[HDOM_INFO_END] = $this->cursor;

            // The closed element is done; continue in its parent (if any)
            if ($this->parent->parent) {
                $this->parent = $this->parent->parent;
            }

            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            return true;
        }

        // start tag
        $node = new simple_html_dom_node($this);
        $node->_[HDOM_INFO_BEGIN] = $this->cursor;
        ++$this->cursor;
        $tag = $this->copy_until($this->token_slash); // Get tag name
        $node->tag_start = $begin_tag_pos;

        // doctype, cdata & comments...
        // <!DOCTYPE html>
        // <![CDATA[ ... ]]>
        // <!-- Comment -->
        if (isset($tag[0]) && $tag[0] === '!') {
            $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');

            if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') { // Comment ("<!--")
                $node->nodetype = HDOM_TYPE_COMMENT;
                $node->tag = 'comment';
            } else { // Could be doctype or CDATA but we don't care
                $node->nodetype = HDOM_TYPE_UNKNOWN;
                $node->tag = 'unknown';
            }

            if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }

            $this->link_nodes($node, true);
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            return true;
        }

        // The start tag cannot contain another start tag, if so add as text
        // i.e. "<<html>"
        // NOTE: "!==" binds tighter than "=", so $pos receives the boolean
        // result of the comparison; only its truthiness is used here.
        if ($pos = strpos($tag, '<') !== false) {
            $tag = '<' . substr($tag, 0, -1);
            $node->_[HDOM_INFO_TEXT] = $tag;
            $this->link_nodes($node, false);
            $this->char = $this->doc[--$this->pos]; // prev
            return true;
        }

        // Handle invalid tag names (i.e. "<html#doc>")
        if (!preg_match('/^\w[\w:-]*$/', $tag)) {
            $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');

            // Next char is the beginning of a new tag, don't touch it.
            if ($this->char === '<') {
                $this->link_nodes($node, false);
                return true;
            }

            // Next char closes current tag, add and be done with it.
            if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }
            $this->link_nodes($node, false);
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            return true;
        }

        // begin tag, add new node
        $node->nodetype = HDOM_TYPE_ELEMENT;
        $tag_lower = strtolower($tag);
        $node->tag = ($this->lowercase) ? $tag_lower : $tag;

        // handle optional closing tags
        if (isset($this->optional_closing_tags[$tag_lower])) {
            // Traverse ancestors to close all optional closing tags
            while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
                $this->parent->_[HDOM_INFO_END] = 0;
                $this->parent = $this->parent->parent;
            }
            $node->parent = $this->parent;
        }

        // Position seen in the previous iteration of the attribute loop
        $guard = 0; // prevent infinite loop

        // [0] Space between tag and first attribute
        $space = array($this->copy_skip($this->token_blank), '', '');

        // attributes
        do {
            // Everything until the first equal sign should be the attribute name
            $name = $this->copy_until($this->token_equal);

            if ($name === '' && $this->char !== null && $space[0] === '') {
                break;
            }

            if ($guard === $this->pos) { // Escape infinite loop
                $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
                continue;
            }

            $guard = $this->pos;

            // handle endless '<'
            // Out of bounds before the tag ended
            if ($this->pos >= $this->size - 1 && $this->char !== '>') {
                $node->nodetype = HDOM_TYPE_TEXT;
                $node->_[HDOM_INFO_END] = 0;
                $node->_[HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name;
                $node->tag = 'text';
                $this->link_nodes($node, false);
                return true;
            }

            // handle mismatch '<'
            // Attributes cannot start after opening tag
            if ($this->doc[$this->pos - 1] == '<') {
                $node->nodetype = HDOM_TYPE_TEXT;
                $node->tag = 'text';
                $node->attr = array();
                $node->_[HDOM_INFO_END] = 0;
                $node->_[HDOM_INFO_TEXT] = substr(
                    $this->doc,
                    $begin_tag_pos,
                    $this->pos - $begin_tag_pos - 1
                );
                // Step back so that the position ends up on the preceding
                // character after the increment below
                $this->pos -= 2;
                $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
                $this->link_nodes($node, false);
                return true;
            }

            if ($name !== '/' && $name !== '') { // this is an attribute name
                // [1] Whitespace after attribute name
                $space[1] = $this->copy_skip($this->token_blank);

                $name = $this->restore_noise($name); // might be a noisy name

                if ($this->lowercase) { $name = strtolower($name); }

                if ($this->char === '=') { // attribute with value
                    $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
                    $this->parse_attr($node, $name, $space); // get attribute value
                } else {
                    // no value attr: nowrap, checked selected...
                    $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
                    $node->attr[$name] = true;
                    if ($this->char != '>') { $this->char = $this->doc[--$this->pos]; } // prev
                }

                $node->_[HDOM_INFO_SPACE][] = $space;

                // prepare for next attribute
                $space = array(
                    $this->copy_skip($this->token_blank),
                    '',
                    ''
                );
            } else { // no more attributes
                break;
            }
        } while ($this->char !== '>' && $this->char !== '/'); // go until the tag ended

        $this->link_nodes($node, true);
        $node->_[HDOM_INFO_ENDSPACE] = $space[0];

        // handle empty tags (i.e. "<div/>")
        if ($this->copy_until_char('>') === '/') {
            $node->_[HDOM_INFO_ENDSPACE] .= '/';
            $node->_[HDOM_INFO_END] = 0;
        } else {
            // reset parent
            // Self-closing (void) tags never become the parent of what follows
            if (!isset($this->self_closing_tags[strtolower($node->tag)])) {
                $this->parent = $node;
            }
        }

        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

        // If it's a BR tag, we need to set its text to the default text.
        // This way when we see it in plaintext, we can generate formatting that the user wants.
        // Since a br tag never has sub nodes, this works well.
        if ($node->tag === 'br') {
            $node->_[HDOM_INFO_INNER] = $this->default_br_text;
        }

        return true;
    }
  2092 +
  2093 + protected function parse_attr($node, $name, &$space)
  2094 + {
  2095 + $is_duplicate = isset($node->attr[$name]);
  2096 +
  2097 + if (!$is_duplicate) // Copy whitespace between "=" and value
  2098 + $space[2] = $this->copy_skip($this->token_blank);
  2099 +
  2100 + switch ($this->char) {
  2101 + case '"':
  2102 + $quote_type = HDOM_QUOTE_DOUBLE;
  2103 + $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
  2104 + $value = $this->copy_until_char('"');
  2105 + $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
  2106 + break;
  2107 + case '\'':
  2108 + $quote_type = HDOM_QUOTE_SINGLE;
  2109 + $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
  2110 + $value = $this->copy_until_char('\'');
  2111 + $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
  2112 + break;
  2113 + default:
  2114 + $quote_type = HDOM_QUOTE_NO;
  2115 + $value = $this->copy_until($this->token_attr);
  2116 + }
  2117 +
  2118 + $value = $this->restore_noise($value);
  2119 +
  2120 + // PaperG: Attributes should not have \r or \n in them, that counts as
  2121 + // html whitespace.
  2122 + $value = str_replace("\r", '', $value);
  2123 + $value = str_replace("\n", '', $value);
  2124 +
  2125 + // PaperG: If this is a "class" selector, lets get rid of the preceeding
  2126 + // and trailing space since some people leave it in the multi class case.
  2127 + if ($name === 'class') {
  2128 + $value = trim($value);
  2129 + }
  2130 +
  2131 + if (!$is_duplicate) {
  2132 + $node->_[HDOM_INFO_QUOTE][] = $quote_type;
  2133 + $node->attr[$name] = $value;
  2134 + }
  2135 + }
  2136 +
  2137 + protected function link_nodes(&$node, $is_child)
  2138 + {
  2139 + $node->parent = $this->parent;
  2140 + $this->parent->nodes[] = $node;
  2141 + if ($is_child) {
  2142 + $this->parent->children[] = $node;
  2143 + }
  2144 + }
  2145 +
  2146 + protected function as_text_node($tag)
  2147 + {
  2148 + $node = new simple_html_dom_node($this);
  2149 + ++$this->cursor;
  2150 + $node->_[HDOM_INFO_TEXT] = '</' . $tag . '>';
  2151 + $this->link_nodes($node, false);
  2152 + $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
  2153 + return true;
  2154 + }
  2155 +
  2156 + protected function skip($chars)
  2157 + {
  2158 + $this->pos += strspn($this->doc, $chars, $this->pos);
  2159 + $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
  2160 + }
  2161 +
  2162 + protected function copy_skip($chars)
  2163 + {
  2164 + $pos = $this->pos;
  2165 + $len = strspn($this->doc, $chars, $pos);
  2166 + $this->pos += $len;
  2167 + $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
  2168 + if ($len === 0) { return ''; }
  2169 + return substr($this->doc, $pos, $len);
  2170 + }
  2171 +
  2172 + protected function copy_until($chars)
  2173 + {
  2174 + $pos = $this->pos;
  2175 + $len = strcspn($this->doc, $chars, $pos);
  2176 + $this->pos += $len;
  2177 + $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
  2178 + return substr($this->doc, $pos, $len);
  2179 + }
  2180 +
  2181 + protected function copy_until_char($char)
  2182 + {
  2183 + if ($this->char === null) { return ''; }
  2184 +
  2185 + if (($pos = strpos($this->doc, $char, $this->pos)) === false) {
  2186 + $ret = substr($this->doc, $this->pos, $this->size - $this->pos);
  2187 + $this->char = null;
  2188 + $this->pos = $this->size;
  2189 + return $ret;
  2190 + }
  2191 +
  2192 + if ($pos === $this->pos) { return ''; }
  2193 +
  2194 + $pos_old = $this->pos;
  2195 + $this->char = $this->doc[$pos];
  2196 + $this->pos = $pos;
  2197 + return substr($this->doc, $pos_old, $pos - $pos_old);
  2198 + }
  2199 +
  2200 + protected function remove_noise($pattern, $remove_tag = false)
  2201 + {
  2202 + global $debug_object;
  2203 + if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
  2204 +
  2205 + $count = preg_match_all(
  2206 + $pattern,
  2207 + $this->doc,
  2208 + $matches,
  2209 + PREG_SET_ORDER | PREG_OFFSET_CAPTURE
  2210 + );
  2211 +
  2212 + for ($i = $count - 1; $i > -1; --$i) {
  2213 + $key = '___noise___' . sprintf('% 5d', count($this->noise) + 1000);
  2214 +
  2215 + if (is_object($debug_object)) {
  2216 + $debug_object->debug_log(2, 'key is: ' . $key);
  2217 + }
  2218 +
  2219 + $idx = ($remove_tag) ? 0 : 1; // 0 = entire match, 1 = submatch
  2220 + $this->noise[$key] = $matches[$i][$idx][0];
  2221 + $this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0]));
  2222 + }
  2223 +
  2224 + // reset the length of content
  2225 + $this->size = strlen($this->doc);
  2226 +
  2227 + if ($this->size > 0) {
  2228 + $this->char = $this->doc[0];
  2229 + }
  2230 + }
  2231 +
  2232 + function restore_noise($text)
  2233 + {
  2234 + global $debug_object;
  2235 + if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
  2236 +
  2237 + while (($pos = strpos($text, '___noise___')) !== false) {
  2238 + // Sometimes there is a broken piece of markup, and we don't GET the
  2239 + // pos+11 etc... token which indicates a problem outside of us...
  2240 +
  2241 + // todo: "___noise___1000" (or any number with four or more digits)
  2242 + // in the DOM causes an infinite loop which could be utilized by
  2243 + // malicious software
  2244 + if (strlen($text) > $pos + 15) {
  2245 + $key = '___noise___'
  2246 + . $text[$pos + 11]
  2247 + . $text[$pos + 12]
  2248 + . $text[$pos + 13]
  2249 + . $text[$pos + 14]
  2250 + . $text[$pos + 15];
  2251 +
  2252 + if (is_object($debug_object)) {
  2253 + $debug_object->debug_log(2, 'located key of: ' . $key);
  2254 + }
  2255 +
  2256 + if (isset($this->noise[$key])) {
  2257 + $text = substr($text, 0, $pos)
  2258 + . $this->noise[$key]
  2259 + . substr($text, $pos + 16);
  2260 + } else {
  2261 + // do this to prevent an infinite loop.
  2262 + $text = substr($text, 0, $pos)
  2263 + . 'UNDEFINED NOISE FOR KEY: '
  2264 + . $key
  2265 + . substr($text, $pos + 16);
  2266 + }
  2267 + } else {
  2268 + // There is no valid key being given back to us... We must get
  2269 + // rid of the ___noise___ or we will have a problem.
  2270 + $text = substr($text, 0, $pos)
  2271 + . 'NO NUMERIC NOISE KEY'
  2272 + . substr($text, $pos + 11);
  2273 + }
  2274 + }
  2275 + return $text;
  2276 + }
  2277 +
  2278 + function search_noise($text)
  2279 + {
  2280 + global $debug_object;
  2281 + if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
  2282 +
  2283 + foreach($this->noise as $noiseElement) {
  2284 + if (strpos($noiseElement, $text) !== false) {
  2285 + return $noiseElement;
  2286 + }
  2287 + }
  2288 + }
  2289 +
  2290 + function __toString()
  2291 + {
  2292 + return $this->root->innertext();
  2293 + }
  2294 +
  2295 + function __get($name)
  2296 + {
  2297 + switch ($name) {
  2298 + case 'outertext':
  2299 + return $this->root->innertext();
  2300 + case 'innertext':
  2301 + return $this->root->innertext();
  2302 + case 'plaintext':
  2303 + return $this->root->text();
  2304 + case 'charset':
  2305 + return $this->_charset;
  2306 + case 'target_charset':
  2307 + return $this->_target_charset;
  2308 + }
  2309 + }
  2310 +
  2311 + function childNodes($idx = -1)
  2312 + {
  2313 + return $this->root->childNodes($idx);
  2314 + }
  2315 +
  2316 + function firstChild()
  2317 + {
  2318 + return $this->root->first_child();
  2319 + }
  2320 +
  2321 + function lastChild()
  2322 + {
  2323 + return $this->root->last_child();
  2324 + }
  2325 +
  2326 + function createElement($name, $value = null)
  2327 + {
  2328 + return @str_get_html("<$name>$value</$name>")->firstChild();
  2329 + }
  2330 +
  2331 + function createTextNode($value)
  2332 + {
  2333 + return @end(str_get_html($value)->nodes);
  2334 + }
  2335 +
  2336 + function getElementById($id)
  2337 + {
  2338 + return $this->find("#$id", 0);
  2339 + }
  2340 +
  2341 + function getElementsById($id, $idx = null)
  2342 + {
  2343 + return $this->find("#$id", $idx);
  2344 + }
  2345 +
  2346 + function getElementByTagName($name)
  2347 + {
  2348 + return $this->find($name, 0);
  2349 + }
  2350 +
  2351 + function getElementsByTagName($name, $idx = -1)
  2352 + {
  2353 + return $this->find($name, $idx);
  2354 + }
  2355 +
  2356 + function loadFile()
  2357 + {
  2358 + $args = func_get_args();
  2359 + $this->load_file($args);
  2360 + }
  2361 +}
@@ -157,56 +157,33 @@ class TranslateLogic extends BaseLogic @@ -157,56 +157,33 @@ class TranslateLogic extends BaseLogic
157 * @time :2023/11/22 10:02 157 * @time :2023/11/22 10:02
158 */ 158 */
public function getUrlRead($url){
    // Collects the strings of the page at $url that need translation:
    // the text nodes of the HTML, then the contents of the meta
    // description and meta keywords tags when present.
    $dom = file_get_html($url);
    if (!$dom) {
        // NOTE(review): fail() is presumed to abort the request, as in the
        // previous implementation of this method; the return only guards
        // against it returning normally - confirm.
        $this->fail('当前url不存在');
        return [];
    }

    $need_tran = [];
    foreach ($dom->find('text') as $text) {
        $parent = $text->parent();
        // Skip text directly under the root and the bodies of script/style tags.
        if (!is_object($parent) || in_array($parent->tag, ['script', 'style', 'root'], true)) {
            continue;
        }

        $string = trim($text->text());
        if (empty($string)) {
            continue;
        }

        // Skip text whose grandparent's first <b> carries a "country-flag"
        // class (presumably the language switcher entries - confirm).
        $country_class = '';
        $grandparent = $parent->parent();
        if (is_object($grandparent) && method_exists($grandparent, 'find')) {
            $flag = $grandparent->find('b', 0);
            if ($flag) {
                $country_class = (string) $flag->class;
            }
        }
        if (false !== strpos($country_class, 'country-flag')) {
            continue;
        }

        $need_tran[] = htmlspecialchars_decode(html_entity_decode($string));
    }

    // A page without one of these meta tags (or with an empty content) no
    // longer triggers a null dereference or adds an empty entry.
    foreach (['description', 'keywords'] as $meta_name) {
        $meta = $dom->find("meta[name={$meta_name}]", 0);
        if ($meta && isset($meta->attr['content']) && trim($meta->attr['content']) !== '') {
            $need_tran[] = $meta->attr['content'];
        }
    }

    return $need_tran;
}
212 189