正在显示
2 个修改的文件
包含
2388 行增加
和
50 行删除
app/Helper/simple_html_dom.php
0 → 100644
| 1 | +<?php | ||
| 2 | +/** | ||
| 3 | + * Website: http://sourceforge.net/projects/simplehtmldom/ | ||
| 4 | + * Additional projects: http://sourceforge.net/projects/debugobject/ | ||
| 5 | + * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/) | ||
| 6 | + * | ||
| 7 | + * Licensed under The MIT License | ||
| 8 | + * See the LICENSE file in the project root for more information. | ||
| 9 | + * | ||
| 10 | + * Authors: | ||
| 11 | + * S.C. Chen | ||
| 12 | + * John Schlick | ||
| 13 | + * Rus Carroll | ||
| 14 | + * logmanoriginal | ||
| 15 | + * | ||
| 16 | + * Contributors: | ||
| 17 | + * Yousuke Kumakura | ||
| 18 | + * Vadim Voituk | ||
| 19 | + * Antcs | ||
| 20 | + * | ||
| 21 | + * Version Rev. 1.9.1 (291) | ||
| 22 | + */ | ||
| 23 | + | ||
// Node types, stored in simple_html_dom_node::$nodetype.
const HDOM_TYPE_ELEMENT = 1;
const HDOM_TYPE_COMMENT = 2;
const HDOM_TYPE_TEXT = 3;
const HDOM_TYPE_ENDTAG = 4;
const HDOM_TYPE_ROOT = 5;
const HDOM_TYPE_UNKNOWN = 6;

// Attribute quoting styles, stored per attribute under HDOM_INFO_QUOTE.
const HDOM_QUOTE_DOUBLE = 0;
const HDOM_QUOTE_SINGLE = 1;
const HDOM_QUOTE_NO = 3;

// Keys of the per-node bookkeeping array simple_html_dom_node::$_.
const HDOM_INFO_BEGIN = 0;
const HDOM_INFO_END = 1;
const HDOM_INFO_QUOTE = 2;
const HDOM_INFO_SPACE = 3;
const HDOM_INFO_TEXT = 4;
const HDOM_INFO_INNER = 5;
const HDOM_INFO_OUTER = 6;
const HDOM_INFO_ENDSPACE = 7;

// Defaults that the including application may pre-define to override.
if (!defined('DEFAULT_TARGET_CHARSET')) {
    define('DEFAULT_TARGET_CHARSET', 'UTF-8');
}
if (!defined('DEFAULT_BR_TEXT')) {
    define('DEFAULT_BR_TEXT', "\r\n");
}
if (!defined('DEFAULT_SPAN_TEXT')) {
    define('DEFAULT_SPAN_TEXT', ' ');
}
if (!defined('MAX_FILE_SIZE')) {
    define('MAX_FILE_SIZE', 600000);
}

// NOTE(review): not referenced in the visible part of this file; presumably
// a parser option flag - confirm against simple_html_dom.
const HDOM_SMARTY_AS_TEXT = 1;
| 47 | + | ||
/**
 * Read a document with file_get_contents() and parse it into a DOM.
 *
 * @param string        $url              Location handed to file_get_contents().
 * @param bool          $use_include_path Forwarded to file_get_contents().
 * @param resource|null $context          Stream context to use for the request.
 *                                        When null, a context with TLS peer
 *                                        verification disabled is created (see
 *                                        the note in the body).
 * @param int           $offset           Forwarded to file_get_contents().
 * @param int           $maxLen           Maximum number of bytes to read; any
 *                                        value <= 0 means MAX_FILE_SIZE.
 * @param bool          $lowercase        Forwarded to simple_html_dom and load().
 * @param bool          $forceTagsClosed  Forwarded to simple_html_dom.
 * @param string        $target_charset   Forwarded to simple_html_dom.
 * @param bool          $stripRN          Forwarded to simple_html_dom and load().
 * @param string        $defaultBRText    Forwarded to simple_html_dom.
 * @param string        $defaultSpanText  Forwarded to simple_html_dom.
 *
 * @return mixed Whatever simple_html_dom::load() returns, or false when
 *               nothing could be read.
 */
function file_get_html(
    $url,
    $use_include_path = false,
    $context = null,
    $offset = 0,
    $maxLen = -1,
    $lowercase = true,
    $forceTagsClosed = true,
    $target_charset = DEFAULT_TARGET_CHARSET,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT)
{
    if ($maxLen <= 0) { $maxLen = MAX_FILE_SIZE; }

    $dom = new simple_html_dom(
        null,
        $lowercase,
        $forceTagsClosed,
        $target_charset,
        $stripRN,
        $defaultBRText,
        $defaultSpanText
    );

    // Honour a caller-supplied stream context. It used to be overwritten
    // unconditionally, which silently dropped custom headers, proxies,
    // timeouts and TLS settings passed in by the caller.
    if ($context === null) {
        // NOTE(review): this fallback disables TLS certificate verification
        // and therefore allows man-in-the-middle attacks. It is kept only so
        // that existing callers that pass no context behave as before; pass
        // an explicit context to get verified connections.
        $context = stream_context_create([
            'ssl' => [
                'verify_peer' => false,
                'verify_peer_name' => false,
            ]
        ]);
    }

    // file_get_contents() reads at most $maxLen bytes.
    $contents = file_get_contents(
        $url,
        $use_include_path,
        $context,
        $offset,
        $maxLen
    );

    if (empty($contents) || strlen($contents) > $maxLen) {
        $dom->clear();
        return false;
    }

    return $dom->load($contents, $lowercase, $stripRN);
}
| 101 | + | ||
/**
 * Parse an HTML string into a DOM.
 *
 * @param string $str             Markup to parse.
 * @param bool   $lowercase       Forwarded to simple_html_dom and load().
 * @param bool   $forceTagsClosed Forwarded to simple_html_dom.
 * @param string $target_charset  Forwarded to simple_html_dom.
 * @param bool   $stripRN         Forwarded to simple_html_dom and load().
 * @param string $defaultBRText   Forwarded to simple_html_dom.
 * @param string $defaultSpanText Forwarded to simple_html_dom.
 *
 * @return mixed Whatever simple_html_dom::load() returns, or false when $str
 *               is empty or longer than MAX_FILE_SIZE bytes.
 */
function str_get_html(
    $str,
    $lowercase = true,
    $forceTagsClosed = true,
    $target_charset = DEFAULT_TARGET_CHARSET,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT)
{
    $document = new simple_html_dom(
        null,
        $lowercase,
        $forceTagsClosed,
        $target_charset,
        $stripRN,
        $defaultBRText,
        $defaultSpanText
    );

    // Only non-empty input within the size limit is parsed.
    if (!empty($str) && strlen($str) <= MAX_FILE_SIZE) {
        return $document->load($str, $lowercase, $stripRN);
    }

    $document->clear();
    return false;
}
| 128 | + | ||
/**
 * Print the tree below $node by delegating to its dump() method.
 *
 * @param object $node      Node exposing dump($show_attr, $depth).
 * @param bool   $show_attr Whether attributes are printed next to each tag.
 * @param int    $deep      Indentation depth of the first printed line.
 *
 * @return void
 */
function dump_html_tree($node, $show_attr = true, $deep = 0)
{
    // The node itself used to be passed as $show_attr, so both options were
    // ignored; forward the real arguments instead.
    $node->dump($show_attr, $deep);
}
| 133 | + | ||
| 134 | +class simple_html_dom_node | ||
| 135 | +{ | ||
/** @var int One of the HDOM_TYPE_* constants. */
public $nodetype = HDOM_TYPE_TEXT;

/** @var string Tag name; 'text' for text nodes and 'root' for the document root. */
public $tag = 'text';

/** @var array Attribute values keyed by attribute name. */
public $attr = [];

/** @var array|null Child element nodes; null after clear(). */
public $children = [];

/** @var array|null All child nodes, text nodes included; null after clear(). */
public $nodes = [];

/** @var simple_html_dom_node|null Parent node. */
public $parent = null;

/** @var array Bookkeeping data keyed by the HDOM_INFO_* constants. */
public $_ = [];

/** @var int NOTE(review): presumably the offset of the tag in the source document - confirm against the parser. */
public $tag_start = 0;

/** @var object|null Owning document, set by the constructor. */
private $dom = null;
| 145 | + | ||
/**
 * Attach the node to its owning document.
 *
 * @param object $dom Owning document; the node appends itself to $dom->nodes.
 */
function __construct($dom)
{
    $dom->nodes[] = $this;
    $this->dom = $dom;
}
| 151 | + | ||
/**
 * Drop the node's references to other objects when it is destroyed.
 */
function __destruct()
{
    $this->clear();
}
| 156 | + | ||
/**
 * String conversion yields the node's outer text.
 *
 * @return string
 */
function __toString()
{
    return $this->outertext();
}
| 161 | + | ||
/**
 * Release the references this node holds to its document, parent and child
 * nodes. All four properties are set to null.
 *
 * @return void
 */
function clear()
{
    $this->children = $this->parent = $this->nodes = $this->dom = null;
}
| 169 | + | ||
/**
 * Echo this node and, recursively, its child nodes, one per line.
 *
 * @param bool $show_attr Whether to append the attribute list to each tag.
 * @param int  $depth     Number of tab characters printed before the tag.
 *
 * @return void
 */
function dump($show_attr = true, $depth = 0)
{
    $line = str_repeat("\t", $depth) . $this->tag;

    if ($show_attr && count($this->attr) > 0) {
        $line .= '(';
        foreach ($this->attr as $name => $value) {
            $line .= "[$name]=>\"$value\", ";
        }
        $line .= ')';
    }

    echo $line . "\n";

    if (!$this->nodes) {
        return;
    }

    // Child nodes are printed one level deeper.
    foreach ($this->nodes as $child) {
        $child->dump($show_attr, $depth + 1);
    }
}
| 190 | + | ||
/**
 * Build a one-line debug description of this node.
 *
 * The line contains the tag, the attributes, the bookkeeping array $_, the
 * inner-text info and the child and node counts.
 *
 * @param bool $echo When true the line is echoed and nothing is returned;
 *                   when false the line is returned instead.
 *
 * @return string|null
 */
function dump_node($echo = true)
{
    $string = $this->tag;

    if (count($this->attr) > 0) {
        $string .= '(';
        foreach ($this->attr as $k => $v) {
            $string .= "[$k]=>\"$v\", ";
        }
        $string .= ')';
    }

    if (count($this->_) > 0) {
        $string .= ' $_ (';
        foreach ($this->_ as $k => $v) {
            if (is_array($v)) {
                $string .= "[$k]=>(";
                foreach ($v as $k2 => $v2) {
                    $string .= "[$k2]=>\"$v2\", ";
                }
                $string .= ')';
            } else {
                $string .= "[$k]=>\"$v\", ";
            }
        }
        $string .= ')';
    }

    if (isset($this->text)) {
        $string .= " text: ({$this->text})";
    }

    $string .= ' HDOM_INNER_INFO: ';

    // This used to read an undefined $node variable, so the inner info was
    // always reported as NULL.
    if (isset($this->_[HDOM_INFO_INNER])) {
        $string .= "'" . $this->_[HDOM_INFO_INNER] . "'";
    } else {
        $string .= ' NULL ';
    }

    // clear() sets both lists to null, and count(null) throws on PHP 8.
    $string .= ' children: ' . (is_array($this->children) ? count($this->children) : 0);
    $string .= ' nodes: ' . (is_array($this->nodes) ? count($this->nodes) : 0);
    $string .= ' tag_start: ' . $this->tag_start;
    $string .= "\n";

    if ($echo) {
        echo $string;
        return;
    }

    return $string;
}
| 243 | + | ||
/**
 * Get the parent node, or set a new one.
 *
 * Known limitation (carried over from the original authors' note): when a
 * new parent is set, the node is appended to the new parent's nodes and
 * children lists but is not removed from those of its previous parent.
 *
 * @param simple_html_dom_node|null $parent New parent, or null to only read.
 *
 * @return simple_html_dom_node|null The current parent after the call.
 */
function parent($parent = null)
{
    if ($parent === null) {
        return $this->parent;
    }

    $this->parent = $parent;
    $parent->nodes[] = $this;
    $parent->children[] = $this;

    return $parent;
}
| 257 | + | ||
/**
 * Tell whether the node has at least one child element.
 *
 * @return bool
 */
function has_child()
{
    return isset($this->children) && $this->children;
}
| 262 | + | ||
/**
 * Return all child elements, or a single one by position.
 *
 * @param int $idx Position of the wanted child; exactly -1 returns the
 *                 whole list.
 *
 * @return array|simple_html_dom_node|null The list, the child at $idx, or
 *                                         null when there is no such child.
 */
function children($idx = -1)
{
    if ($idx === -1) {
        return $this->children;
    }

    return isset($this->children[$idx]) ? $this->children[$idx] : null;
}
| 275 | + | ||
/**
 * Return the first child element.
 *
 * @return simple_html_dom_node|null Null when the node has no children,
 *                                   including after clear().
 */
function first_child()
{
    // clear() sets $children to null and count(null) throws on PHP 8, so
    // test the slot itself instead of counting.
    return isset($this->children[0]) ? $this->children[0] : null;
}
| 283 | + | ||
/**
 * Return the last child element.
 *
 * @return simple_html_dom_node|null Null when the node has no children,
 *                                   including after clear().
 */
function last_child()
{
    // empty() also covers the null left behind by clear(), where count()
    // would throw on PHP 8.
    if (empty($this->children)) {
        return null;
    }

    return end($this->children);
}
| 291 | + | ||
/**
 * Return the element that follows this node in its parent's child list.
 *
 * @return simple_html_dom_node|null Null when there is no parent, the node
 *                                   is not among the parent's children, or
 *                                   it is the last child.
 */
function next_sibling()
{
    if ($this->parent === null) {
        return null;
    }

    $siblings = $this->parent->children;
    $position = array_search($this, $siblings, true);

    if ($position === false) {
        return null;
    }

    $next = $position + 1;

    return isset($siblings[$next]) ? $siblings[$next] : null;
}
| 306 | + | ||
/**
 * Return the element that precedes this node in its parent's child list.
 *
 * @return simple_html_dom_node|null Null when there is no parent, the node
 *                                   is not among the parent's children, or
 *                                   it is the first child.
 */
function prev_sibling()
{
    if ($this->parent === null) {
        return null;
    }

    $siblings = $this->parent->children;
    $position = array_search($this, $siblings, true);
    $has_previous = ($position !== false && $position > 0);

    return $has_previous ? $siblings[$position - 1] : null;
}
| 321 | + | ||
/**
 * Walk up the parent chain and return the nearest ancestor with the given
 * tag name.
 *
 * @param string $tag Tag name to look for; compared with ===.
 *
 * @return simple_html_dom_node|null Null when no ancestor has that tag.
 */
function find_ancestor_tag($tag)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    for ($ancestor = $this->parent; $ancestor !== null; $ancestor = $ancestor->parent) {
        if (is_object($debug_object)) {
            $debug_object->debug_log(2, 'Current tag is: ' . $ancestor->tag);
        }

        if ($ancestor->tag === $tag) {
            return $ancestor;
        }
    }

    return null;
}
| 347 | + | ||
/**
 * Return the markup inside this node.
 *
 * A stored HDOM_INFO_INNER value wins. Otherwise a stored HDOM_INFO_TEXT
 * value is returned after the document's restore_noise() has been applied.
 * Otherwise the outer text of all child nodes is concatenated.
 *
 * @return string
 */
function innertext()
{
    $info = $this->_;

    if (isset($info[HDOM_INFO_INNER])) {
        return $info[HDOM_INFO_INNER];
    }

    if (isset($info[HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($info[HDOM_INFO_TEXT]);
    }

    $markup = '';

    foreach ($this->nodes as $child) {
        $markup .= $child->outertext();
    }

    return $markup;
}
| 366 | + | ||
/**
 * Return the markup of this node including its own tags.
 *
 * Order of precedence: the root node returns its inner text; a stored
 * HDOM_INFO_OUTER value is returned as is; a stored HDOM_INFO_TEXT value is
 * returned after restore_noise(); otherwise the markup is rebuilt from the
 * opening tag, the inner content and, when an end tag is recorded, the
 * closing tag. A callback registered on the document is invoked with this
 * node before any non-root result is produced.
 *
 * @return string
 */
function outertext()
{
    global $debug_object;

    if (is_object($debug_object)) {
        $suffix = ($this->tag === 'text' && !empty($this->text))
            ? ' with text: ' . $this->text
            : '';

        $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $suffix);
    }

    if ($this->tag === 'root') {
        return $this->innertext();
    }

    // todo: What is the use of this callback? Remove?
    if ($this->dom && $this->dom->callback !== null) {
        call_user_func_array($this->dom->callback, array($this));
    }

    if (isset($this->_[HDOM_INFO_OUTER])) {
        return $this->_[HDOM_INFO_OUTER];
    }

    if (isset($this->_[HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    $markup = '';

    // Opening tag, rendered by the node registered at this node's start index.
    if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) {
        $markup = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
    }

    if (isset($this->_[HDOM_INFO_INNER])) {
        // todo: <br> should either never have HDOM_INFO_INNER or always
        if ($this->tag !== 'br') {
            $markup .= $this->_[HDOM_INFO_INNER];
        }
    } elseif ($this->nodes) {
        foreach ($this->nodes as $child) {
            $markup .= $this->convert_text($child->outertext());
        }
    }

    // An end index of 0 means that no closing tag is written.
    if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0) {
        $markup .= '</' . $this->tag . '>';
    }

    return $markup;
}
| 423 | + | ||
/**
 * Return the plain text of this node.
 *
 * A stored HDOM_INFO_INNER value is returned as is. Text nodes return their
 * stored text after restore_noise(). Comment and unknown nodes, as well as
 * script and style elements, yield an empty string. For any other node the
 * text of the child nodes is concatenated: a child <p> starts after a blank
 * line, and the document's default span text is appended after every child
 * <span>.
 *
 * @return string
 */
function text()
{
    if (isset($this->_[HDOM_INFO_INNER])) {
        return $this->_[HDOM_INFO_INNER];
    }

    $type = $this->nodetype;

    if ($type == HDOM_TYPE_TEXT) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    if ($type == HDOM_TYPE_COMMENT || $type == HDOM_TYPE_UNKNOWN) {
        return '';
    }

    foreach (array('script', 'style') as $silent_tag) {
        if (strcasecmp($this->tag, $silent_tag) === 0) {
            return '';
        }
    }

    $plain = '';

    // $this->nodes has been observed to be null for some element nodes
    // (original authors' note), so guard before iterating.
    if (is_null($this->nodes)) {
        return $plain;
    }

    foreach ($this->nodes as $child) {
        // Start paragraph after a blank line
        if ($child->tag === 'p') {
            $plain = trim($plain) . "\n\n";
        }

        $plain .= $this->convert_text($child->text());

        // Keep consecutive spans from running into each other.
        if ($child->tag === 'span') {
            $plain .= $this->dom->default_span_text;
        }
    }

    return $plain;
}
| 465 | + | ||
/**
 * Return the inner text with CDATA markers removed.
 *
 * The opening marker is matched case-insensitively, the closing marker
 * literally.
 *
 * @return string
 */
function xmltext()
{
    $without_open = str_ireplace('<![CDATA[', '', $this->innertext());

    return str_replace(']]>', '', $without_open);
}
| 473 | + | ||
/**
 * Render the opening tag of this node, attributes included.
 *
 * Nodes carrying HDOM_INFO_TEXT (text, comment, unknown) return that text
 * after restore_noise() instead. Attributes whose value is null or false are
 * skipped; attributes whose value is true are written without a value. The
 * whitespace and quoting recorded under HDOM_INFO_SPACE and HDOM_INFO_QUOTE
 * are looked up by the attribute's position in $attr.
 *
 * @return string
 */
function makeup()
{
    // text, comment, unknown
    if (isset($this->_[HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    $markup = '<' . $this->tag;
    $position = 0;

    foreach ($this->attr as $name => $value) {
        // Removed attributes still occupy a slot, so advance before skipping.
        $slot = $position++;

        if ($value === null || $value === false) {
            continue;
        }

        $spacing = $this->_[HDOM_INFO_SPACE][$slot];
        $markup .= $spacing[0];

        // no value attr: nowrap, checked selected...
        if ($value === true) {
            $markup .= $name;
            continue;
        }

        $quote_type = $this->_[HDOM_INFO_QUOTE][$slot];

        if ($quote_type == HDOM_QUOTE_DOUBLE) {
            $quote = '"';
        } elseif ($quote_type == HDOM_QUOTE_SINGLE) {
            $quote = '\'';
        } else {
            $quote = '';
        }

        $markup .= $name . $spacing[1] . '=' . $spacing[2] . $quote . $value . $quote;
    }

    return $this->dom->restore_noise($markup) . $this->_[HDOM_INFO_ENDSPACE] . '>';
}
| 516 | + | ||
/**
 * Find nodes below this node that match a CSS-like selector.
 *
 * The selector string is parsed into one selector chain per alternative.
 * Each chain is resolved step by step with seek(), starting from this node,
 * and the matches of all chains are merged and ordered by their node index
 * in the document.
 *
 * @param string   $selector  Selector string.
 * @param int|null $idx       Null returns every match; otherwise the match
 *                            at that position, where negative values count
 *                            from the end.
 * @param bool     $lowercase Forwarded to seek().
 *
 * @return array|simple_html_dom_node|null A list of nodes when $idx is null,
 *                                         else a single node or null.
 */
function find($selector, $idx = null, $lowercase = false)
{
    $selectors = $this->parse_selector($selector);
    $selector_total = count($selectors);
    if ($selector_total === 0) { return array(); }

    $matched = array();

    // find each selector
    for ($s = 0; $s < $selector_total; ++$s) {
        $chain = $selectors[$s];
        $depth = count($chain);
        if ($depth === 0) { return array(); }
        if (!isset($this->_[HDOM_INFO_BEGIN])) { return array(); }

        $frontier = array($this->_[HDOM_INFO_BEGIN] => 1);
        $combinator = ' ';

        // handle descendant selectors, no recursive!
        for ($step = 0; $step < $depth; ++$step) {
            $hits = array();

            foreach ($frontier as $key => $unused) {
                $origin = ($key === -1) ? $this->dom->root : $this->dom->nodes[$key];
                $origin->seek($chain[$step], $hits, $combinator, $lowercase);
            }

            $frontier = $hits;
            $combinator = $chain[$step][4]; // Next Combinator
        }

        // Union keeps the keys already collected and adds the new ones.
        $matched += $frontier;
    }

    // sort keys
    ksort($matched);

    $found = array();
    foreach (array_keys($matched) as $key) {
        $found[] = $this->dom->nodes[$key];
    }

    // return nth-element or array
    if (is_null($idx)) { return $found; }
    if ($idx < 0) { $idx += count($found); }

    return isset($found[$idx]) ? $found[$idx] : null;
}
| 568 | + | ||
/**
 * Collect the nodes that match one parsed selector, relative to this node.
 *
 * The candidate nodes depend on the combinator that led to this selector:
 * ' ' looks at every node between this node's start and end index, '>' at
 * the direct children, '+' at the immediately following sibling and '~' at
 * all following siblings. Each candidate is then checked against the tag,
 * id, classes and attribute conditions of the selector.
 *
 * @param array  $selector   Parsed selector: tag, id, classes, attributes
 *                           and the combinator that follows it.
 * @param array  $ret        Receives the matches, keyed by node index in
 *                           the document, each with the value 1.
 * @param string $parent_cmd Combinator to apply: ' ', '>', '+' or '~'.
 * @param bool   $lowercase  Lower-case node classes and attribute values
 *                           before comparing.
 *
 * @return void Results are delivered through $ret.
 */
protected function seek($selector, &$ret, $parent_cmd, $lowercase = false)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    list($tag, $id, $class, $attributes) = $selector;
    $nodes = array();

    if ($parent_cmd === ' ') { // Descendant Combinator
        // Find parent closing tag if the current element doesn't have a closing
        // tag (i.e. void element)
        $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
        if ($end == 0) {
            $parent = $this->parent;
            // Test for null first so the walk stops cleanly at the top.
            while ($parent !== null && !isset($parent->_[HDOM_INFO_END])) {
                $end -= 1;
                $parent = $parent->parent;
            }
            if ($parent !== null) {
                $end += $parent->_[HDOM_INFO_END];
            }
        }

        // Get list of target nodes
        $nodes_start = $this->_[HDOM_INFO_BEGIN] + 1;
        $nodes_count = $end - $nodes_start;
        $nodes = array_slice($this->dom->nodes, $nodes_start, $nodes_count, true);
    } elseif ($parent_cmd === '>') { // Child Combinator
        $nodes = $this->children;
    } elseif (($parent_cmd === '+' || $parent_cmd === '~') && $this->parent) {
        // Identity search only: a loose comparison of node objects walks the
        // whole (cyclic) node graph.
        $index = array_search($this, $this->parent->children, true);

        if ($index !== false) {
            if ($parent_cmd === '+') { // Next-Sibling Combinator
                if ($index + 1 < count($this->parent->children)) {
                    $nodes[] = $this->parent->children[$index + 1];
                }
            } else { // Subsequent Sibling Combinator
                // Start after this node; a node is not its own sibling.
                $nodes = array_slice($this->parent->children, $index + 1);
            }
        }
    }

    // Go through each candidate node.
    // Note: If this element is a void tag, any previous void element is
    // skipped.
    foreach ($nodes as $node) {
        $pass = true;

        // Skip root nodes
        if (!$node->parent) {
            $pass = false;
        }

        // Handle 'text' selector
        if ($pass && $tag === 'text' && $node->tag === 'text') {
            $ret[array_search($node, $this->dom->nodes, true)] = 1;
            continue;
        }

        // Skip if node isn't a child node (i.e. text nodes)
        if ($pass && !in_array($node, $node->parent->children, true)) {
            $pass = false;
        }

        // Skip if tag doesn't match
        if ($pass && $tag !== '' && $tag !== $node->tag && $tag !== '*') {
            $pass = false;
        }

        // Skip if ID doesn't exist
        if ($pass && $id !== '' && !isset($node->attr['id'])) {
            $pass = false;
        }

        // Check if ID matches
        if ($pass && $id !== '' && isset($node->attr['id'])) {
            // Note: Only consider the first ID (as browsers do)
            $node_id = explode(' ', trim($node->attr['id']))[0];

            if ($id !== $node_id) { $pass = false; }
        }

        // Check if all class(es) exist
        if ($pass && $class !== '' && is_array($class) && !empty($class)) {
            if (isset($node->attr['class'])) {
                $node_classes = explode(' ', $node->attr['class']);

                if ($lowercase) {
                    $node_classes = array_map('strtolower', $node_classes);
                }

                foreach ($class as $required_class) {
                    if (!in_array($required_class, $node_classes, true)) {
                        $pass = false;
                        break;
                    }
                }
            } else {
                $pass = false;
            }
        }

        // Check attributes
        if ($pass
            && $attributes !== ''
            && is_array($attributes)
            && !empty($attributes)) {
            foreach ($attributes as $a) {
                list (
                    $att_name,
                    $att_expr,
                    $att_val,
                    $att_inv,
                    $att_case_sensitivity
                ) = $a;

                // Handle indexing attributes (i.e. "[2]")
                /**
                 * Note: This is not supported by the CSS Standard but adds
                 * the ability to select items compatible to XPath (i.e.
                 * the 3rd element within its parent).
                 *
                 * Note: This doesn't conflict with the CSS Standard which
                 * doesn't work on numeric attributes anyway.
                 */
                if (is_numeric($att_name)
                    && $att_expr === ''
                    && $att_val === '') {
                    $position = 0;

                    // Find index of current element in parent
                    foreach ($node->parent->children as $sibling) {
                        if ($sibling->tag === $node->tag) { ++$position; }
                        if ($sibling === $node) { break; }
                    }

                    // If this is the correct node, continue with next
                    // attribute
                    if ($position === (int)$att_name) { continue; }
                }

                // Check attribute availability
                if ($att_inv) { // Attribute should NOT be set
                    if (isset($node->attr[$att_name])) {
                        $pass = false;
                        break;
                    }
                } else { // Attribute should be set
                    // todo: "plaintext" is not a valid CSS selector!
                    if ($att_name !== 'plaintext'
                        && !isset($node->attr[$att_name])) {
                        $pass = false;
                        break;
                    }
                }

                // Continue with next attribute if expression isn't defined
                if ($att_expr === '') { continue; }

                // A "plaintext" condition is tested against the node's text
                // rather than against an attribute.
                // todo "plaintext" is not a valid CSS selector!
                if ($att_name === 'plaintext') {
                    $nodeKeyValue = $node->text();
                } else {
                    $nodeKeyValue = $node->attr[$att_name];
                }

                if (is_object($debug_object)) {
                    $debug_object->debug_log(2,
                        'testing node: '
                        . $node->tag
                        . ' for attribute: '
                        . $att_name
                        . $att_expr
                        . $att_val
                        . ' where nodes value is: '
                        . $nodeKeyValue
                    );
                }

                // If lowercase is set, do a case insensitive test of
                // the value of the selector.
                if ($lowercase) {
                    $check = $this->match(
                        $att_expr,
                        strtolower($att_val),
                        strtolower($nodeKeyValue),
                        $att_case_sensitivity
                    );
                } else {
                    $check = $this->match(
                        $att_expr,
                        $att_val,
                        $nodeKeyValue,
                        $att_case_sensitivity
                    );
                }

                if (is_object($debug_object)) {
                    $debug_object->debug_log(2,
                        'after match: '
                        . ($check ? 'true' : 'false')
                    );
                }

                if (!$check) {
                    $pass = false;
                    break;
                }
            }
        }

        // Found a match. Add to list.
        if ($pass) { $ret[$node->_[HDOM_INFO_BEGIN]] = 1; }
    }

    // It's passed by reference so this is actually what this function returns.
    if (is_object($debug_object)) {
        $debug_object->debug_log(1, 'EXIT - ret: ', $ret);
    }
}
| 790 | + | ||
/**
 * Test an attribute value against a selector value.
 *
 * @param string $exp              Operator: '=', '!=', '^=', '$=', '*=',
 *                                 '|=' or '~='.
 * @param string $pattern          Value given in the selector.
 * @param string $value            Value found on the node.
 * @param string $case_sensitivity 'i' lower-cases both values before they
 *                                 are compared.
 *
 * @return bool|int Truthy when the value satisfies the expression. The
 *                  regex-based operators return preg_match()'s result.
 *                  False for an unknown operator.
 */
protected function match($exp, $pattern, $value, $case_sensitivity)
{
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

    if ($case_sensitivity === 'i') {
        $pattern = strtolower($pattern);
        $value = strtolower($value);
    }

    switch ($exp) {
        case '=':
            return ($value === $pattern);
        case '!=':
            return ($value !== $pattern);
        case '^=':
            return preg_match('/^' . preg_quote($pattern, '/') . '/', $value);
        case '$=':
            return preg_match('/' . preg_quote($pattern, '/') . '$/', $value);
        case '*=':
            return preg_match('/' . preg_quote($pattern, '/') . '/', $value);
        case '|=':
            /**
             * [att|=val]
             *
             * Represents an element with the att attribute, its value
             * either being exactly "val" or beginning with "val"
             * immediately followed by "-" (U+002D).
             *
             * A plain prefix test would also accept e.g. "english" for
             * "en", so the hyphen is part of the comparison.
             */
            return $value === $pattern
                || strpos($value, $pattern . '-') === 0;
        case '~=':
            /**
             * [att~=val]
             *
             * Represents an element with the att attribute whose value is a
             * whitespace-separated list of words, one of which is exactly
             * "val". If "val" contains whitespace, it will never represent
             * anything (since the words are separated by spaces). Also if
             * "val" is the empty string, it will never represent anything.
             */
            return in_array($pattern, explode(' ', trim($value)), true);
    }
    return false;
}
| 835 | + | ||
/**
 * Break a CSS selector string into a nested array description.
 *
 * The string is tokenised with a pattern adapted from mootools. Each token
 * ends with a separator (space, "/", ",", ">", "+" or "~"); a trailing space
 * is appended to the input so the last token has one as well.
 *
 * Every token becomes a five-element array:
 *   [0] tag name (lowercased when the owning DOM has lowercase enabled)
 *   [1] id
 *   [2] '' or the list of class names
 *   [3] '' or a list of attribute filters, each being
 *       array(name, expression, value, inverted flag, case-sensitivity flag)
 *   [4] separator (' ' for descendant, '' when the token ended a list entry)
 *
 * Attribute names may contain ":" and may be prefixed with "!" to invert the
 * filter. An optional leading "@" inside the brackets is accepted but not
 * captured.
 *
 * @param string $selector_string selector text, possibly comma separated
 * @return array one entry per comma-separated selector, each a list of tokens
 */
protected function parse_selector($selector_string)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    // Groups: 1 tag, 2 id, 3 classes, 4 attribute list, 5 separator.
    // phpcs:ignore Generic.Files.LineLength
    $selector_pattern = "/([\w:\*-]*)(?:\#([\w-]+))?(?:|\.([\w\.-]+))?((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?([\/, >+~]+)/is";

    // Same attribute grammar as group 4 above, with capturing groups:
    // 1 name, 2 expression, 3 value, 4 case-sensitivity flag.
    $attribute_pattern = "/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s+?([iIsS])?)?\]/is";

    preg_match_all(
        $selector_pattern,
        trim($selector_string) . ' ', // Add final ' ' as pseudo separator
        $tokens,
        PREG_SET_ORDER
    );

    if (is_object($debug_object)) {
        $debug_object->debug_log(2, 'Matches Array: ', $tokens);
    }

    $selector_list = array();
    $chain = array();

    foreach ($tokens as $token) {
        // Tokens that consist only of whitespace or slashes carry no selector
        $full_match = trim($token[0]);
        if ($full_match === '' || $full_match === '/' || $full_match === '//') {
            continue;
        }

        $tag = $this->dom->lowercase ? strtolower($token[1]) : $token[1];
        $id = $token[2];
        $classes = ($token[3] !== '') ? explode('.', $token[3]) : $token[3];

        $attribute_filters = $token[4];
        if ($attribute_filters !== '') {
            preg_match_all(
                $attribute_pattern,
                trim($token[4]),
                $attribute_matches,
                PREG_SET_ORDER
            );

            $attribute_filters = array();

            foreach ($attribute_matches as $attribute_match) {
                if (trim($attribute_match[0]) === '') { continue; }

                $raw_name = $attribute_match[1];
                $negated = (isset($raw_name[0]) && $raw_name[0] === '!');

                // Trailing groups are absent from the match when they did not
                // participate, hence the isset() checks.
                $attribute_filters[] = array(
                    $negated ? substr($raw_name, 1) : $raw_name,
                    isset($attribute_match[2]) ? $attribute_match[2] : '',
                    isset($attribute_match[3]) ? $attribute_match[3] : '',
                    $negated,
                    isset($attribute_match[4]) ? strtolower($attribute_match[4]) : '',
                );
            }
        }

        // A separator made of whitespace only means "descendant"
        $separator = trim($token[5]);
        if ($token[5] !== '' && $separator === '') {
            $separator = ' ';
        }

        // A comma closes the current entry of the selector list
        $closes_entry = ($separator === ',');
        if ($closes_entry) {
            $separator = '';
        }

        $chain[] = array($tag, $id, $classes, $attribute_filters, $separator);

        if ($closes_entry) {
            $selector_list[] = $chain;
            $chain = array();
        }
    }

    if (count($chain) > 0) {
        $selector_list[] = $chain;
    }

    return $selector_list;
}
| 974 | + | ||
/**
 * Magic getter for attributes and the virtual text properties.
 *
 * A set (non-null) attribute is returned after charset conversion. Otherwise
 * the names outertext, innertext, plaintext and xmltext map to the matching
 * method. Any other name yields a boolean telling whether the attribute key
 * exists at all.
 *
 * @param string $name attribute or virtual property name
 * @return mixed
 */
function __get($name)
{
    if (isset($this->attr[$name])) {
        $raw_value = $this->attr[$name];
        return $this->convert_text($raw_value);
    }

    // Loose comparison on purpose: it mirrors switch semantics.
    if ($name == 'outertext') { return $this->outertext(); }
    if ($name == 'innertext') { return $this->innertext(); }
    if ($name == 'plaintext') { return $this->text(); }
    if ($name == 'xmltext') { return $this->xmltext(); }

    return array_key_exists($name, $this->attr);
}
| 988 | + | ||
/**
 * Magic setter for attributes and the virtual text properties.
 *
 * "outertext" overwrites the stored outer text. "innertext" overwrites the
 * stored text when the node has one and the stored inner text otherwise.
 * Every other name is written to the attribute list; for an attribute that is
 * not currently set, a default spacing entry and a double-quote marker are
 * appended first.
 *
 * @param string $name  attribute or virtual property name
 * @param mixed  $value new value
 * @return mixed the assigned value for outertext/innertext, nothing otherwise
 */
function __set($name, $value)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    // Loose comparison on purpose: it mirrors switch semantics.
    if ($name == 'outertext') {
        return $this->_[HDOM_INFO_OUTER] = $value;
    }

    if ($name == 'innertext') {
        $slot = isset($this->_[HDOM_INFO_TEXT]) ? HDOM_INFO_TEXT : HDOM_INFO_INNER;
        return $this->_[$slot] = $value;
    }

    $is_new_attribute = !isset($this->attr[$name]);
    if ($is_new_attribute) {
        $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
        $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
    }

    $this->attr[$name] = $value;
}
| 1010 | + | ||
/**
 * Magic isset(): the virtual text properties always exist; an attribute
 * exists as soon as its key is present, even when its value is null.
 *
 * @param string $name attribute or virtual property name
 * @return bool
 */
function __isset($name)
{
    // Loose comparison on purpose: it mirrors switch semantics.
    if ($name == 'outertext' || $name == 'innertext' || $name == 'plaintext') {
        return true;
    }

    // Key presence is enough, so attributes without a value are reported too
    return array_key_exists($name, $this->attr);
}
| 1021 | + | ||
/**
 * Magic unset(): drops the attribute when it is currently set (non-null).
 *
 * @param string $name attribute name
 */
function __unset($name)
{
    if (!isset($this->attr[$name])) {
        return;
    }

    unset($this->attr[$name]);
}
| 1026 | + | ||
/**
 * Convert text from the document charset to the target charset.
 *
 * Both charsets are read from the owning DOM. Conversion is skipped when
 * either charset is unknown, when both are equal, or when the target is UTF-8
 * and the text already validates as UTF-8 (the declared source charset may
 * have been wrong). If iconv() cannot convert the text, the unconverted text
 * is kept rather than passing iconv()'s false on to the caller.
 *
 * For a UTF-8 target, one leading and one trailing byte order mark are
 * removed.
 *
 * @param string $text text to convert
 * @return string converted text
 */
function convert_text($text)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    $converted_text = $text;

    $sourceCharset = '';
    $targetCharset = '';

    if ($this->dom) {
        // Cast first: strtoupper() on a non-string charset is deprecated
        $sourceCharset = strtoupper((string)$this->dom->_charset);
        $targetCharset = strtoupper((string)$this->dom->_target_charset);
    }

    if (is_object($debug_object)) {
        $debug_object->debug_log(3,
            'source charset: '
            . $sourceCharset
            . ' target charaset: '
            . $targetCharset
        );
    }

    if (!empty($sourceCharset)
    && !empty($targetCharset)
    && (strcasecmp($sourceCharset, $targetCharset) != 0)) {
        // Check if the reported encoding could have been incorrect and the
        // text is actually already UTF-8
        if ((strcasecmp($targetCharset, 'UTF-8') == 0)
        && ($this->is_utf8($text))) {
            $converted_text = $text;
        } else {
            $iconv_result = iconv($sourceCharset, $targetCharset, $text);

            if ($iconv_result !== false) {
                $converted_text = $iconv_result;
            } elseif (is_object($debug_object)) {
                // Keep the original text instead of returning false
                $debug_object->debug_log(2,
                    'iconv failed, keeping unconverted text'
                );
            }
        }
    }

    // Strip a byte order mark from either end of UTF-8 output
    if ($targetCharset === 'UTF-8') {
        if (substr($converted_text, 0, 3) === "\xef\xbb\xbf") {
            $converted_text = substr($converted_text, 3);
        }

        if (substr($converted_text, -3) === "\xef\xbb\xbf") {
            $converted_text = substr($converted_text, 0, -3);
        }
    }

    return $converted_text;
}
| 1076 | + | ||
/**
 * Check whether a byte string is structurally valid UTF-8.
 *
 * Every byte >= 0x80 must start a multi-byte sequence: a lead byte announcing
 * 2 to 6 bytes, followed by the matching number of continuation bytes
 * (0x80-0xBF). Lead bytes 0xFE/0xFF, stray continuation bytes and truncated
 * sequences make the string invalid. Overlong forms and code point ranges
 * are not checked.
 *
 * @param string $str bytes to inspect
 * @return bool true when every sequence is well formed (also for '')
 */
static function is_utf8($str)
{
    $len = strlen($str);

    for ($i = 0; $i < $len; $i++) {
        $c = ord($str[$i]);

        // 0x80 itself is a continuation byte, so it must not pass as ASCII
        if ($c >= 128) {
            if ($c >= 254) { return false; }
            elseif ($c >= 252) { $bits = 6; }
            elseif ($c >= 248) { $bits = 5; }
            elseif ($c >= 240) { $bits = 4; }
            elseif ($c >= 224) { $bits = 3; }
            elseif ($c >= 192) { $bits = 2; }
            else { return false; } // continuation byte without a lead byte

            // The announced sequence must fit into the remaining bytes
            if (($i + $bits) > $len) { return false; }

            while ($bits > 1) {
                $i++;
                $b = ord($str[$i]);
                if ($b < 128 || $b > 191) { return false; }
                $bits--;
            }
        }
    }

    return true;
}
| 1103 | + | ||
/**
 * Determine the display size of an <img> element.
 *
 * The width/height attributes take precedence. A dimension that is still
 * unknown (-1) is then looked up in the inline style, where only integer
 * pixel values ("120px") are accepted. Style property names are matched
 * case-insensitively and surrounding whitespace in values is ignored.
 *
 * Stylesheets, classes, ids and parent elements are not consulted.
 *
 * @return array|false array('height' => ..., 'width' => ...) where -1 marks
 * an unknown dimension, or false when this node is not an img tag
 */
function get_display_size()
{
    if ($this->tag !== 'img') {
        return false;
    }

    $size = array('width' => -1, 'height' => -1);

    // See if there is a height or width attribute in the tag itself.
    foreach (array('width', 'height') as $dimension) {
        if (isset($this->attr[$dimension])) {
            $size[$dimension] = $this->attr[$dimension];
        }
    }

    // Now look for an inline style.
    if (isset($this->attr['style'])) {
        // Thanks to user gnarf from stackoverflow for this regular expression.
        $declarations = array();

        preg_match_all(
            '/([\w-]+)\s*:\s*([^;]+)\s*;?/',
            $this->attr['style'],
            $matches,
            PREG_SET_ORDER
        );

        foreach ($matches as $match) {
            // The value group is greedy and may carry trailing whitespace
            $declarations[strtolower($match[1])] = trim($match[2]);
        }

        foreach (array('width', 'height') as $dimension) {
            // Attributes win; only fill in dimensions that are still unknown
            if ($size[$dimension] != -1 || !isset($declarations[$dimension])) {
                continue;
            }

            $value = $declarations[$dimension];

            // Only pixel values are supported
            if (strtolower(substr($value, -2)) !== 'px') {
                continue;
            }

            $pixels = trim(substr($value, 0, -2));

            // Compare against false explicitly so that "0px" is accepted
            if (filter_var($pixels, FILTER_VALIDATE_INT) !== false) {
                $size[$dimension] = $pixels;
            }
        }
    }

    return array(
        'height' => $size['height'],
        'width' => $size['width']
    );
}
| 1188 | + | ||
/**
 * Return this node's outer text and optionally write it to a file.
 *
 * @param string $filepath target path; '' (default) skips the write
 * @return string the outer text
 */
function save($filepath = '')
{
    $markup = $this->outertext();

    if ($filepath === '') {
        return $markup;
    }

    // The exclusive lock is held for the duration of the write
    file_put_contents($filepath, $markup, LOCK_EX);

    return $markup;
}
| 1199 | + | ||
/**
 * Add one or more class names to the class attribute.
 *
 * Names that are already present are skipped, and so are empty names (for
 * example from repeated spaces in the input). A class attribute that has no
 * string value, such as one cleared by removeAttribute(), is replaced rather
 * than appended to.
 *
 * @param string|array $class space separated names or a list of names;
 * any other type is only reported to the debug log
 */
function addClass($class)
{
    // Without this declaration the debug check below reads an undefined local
    global $debug_object;

    if (is_string($class)) {
        $class = explode(' ', $class);
    }

    if (!is_array($class)) {
        if (is_object($debug_object)) {
            $debug_object->debug_log(2, 'Invalid type: ', gettype($class));
        }
        return;
    }

    foreach ($class as $c) {
        if ($c === '') {
            continue;
        }

        // Inspect the raw attribute: the magic getter returns boolean true for
        // a key without a usable value, which would be appended as "1"
        $has_class_value = isset($this->attr['class'])
            && is_string($this->attr['class']);

        if (!$has_class_value) {
            $this->class = $c;
        } elseif (!$this->hasClass($c)) {
            $this->class .= ' ' . $c;
        }
    }
}
| 1224 | + | ||
/**
 * Tell whether the class attribute contains the given class name.
 *
 * @param string $class a single class name; any other type is reported to
 * the debug log and yields false
 * @return bool
 */
function hasClass($class)
{
    // Without this declaration the debug check below reads an undefined local
    global $debug_object;

    if (!is_string($class)) {
        if (is_object($debug_object)) {
            $debug_object->debug_log(2, 'Invalid type: ', gettype($class));
        }
        return false;
    }

    // A class key without a string value (e.g. set to null by
    // removeAttribute()) holds no names; the magic getter would return boolean
    // true for it, which must not be split into the name "1"
    if (!isset($this->attr['class']) || !is_string($this->attr['class'])) {
        return false;
    }

    return in_array($class, explode(' ', $this->class), true);
}
| 1239 | + | ||
/**
 * Remove class names from the class attribute.
 *
 * Without an argument the whole class attribute is removed. Otherwise the
 * given names are taken out, and the attribute itself is removed when no
 * name is left.
 *
 * @param string|array|null $class space separated names, a list of names,
 * or null to remove every class
 */
function removeClass($class = null)
{
    if (!isset($this->class)) {
        return;
    }

    if (is_null($class)) {
        $this->removeAttribute('class');
        return;
    }

    // A class key without a string value (e.g. set to null by
    // removeAttribute()) holds no names. The magic getter would return boolean
    // true for it, and the code below would write that back as class="1".
    if (!isset($this->attr['class']) || !is_string($this->attr['class'])) {
        return;
    }

    if (is_string($class)) {
        $class = explode(' ', $class);
    }

    if (is_array($class)) {
        $class = array_diff(explode(' ', $this->class), $class);

        if (empty($class)) {
            $this->removeAttribute('class');
        } else {
            $this->class = implode(' ', $class);
        }
    }
}
| 1264 | + | ||
/**
 * Return the raw attribute array of this node (no charset conversion).
 *
 * @return array
 */
function getAllAttributes()
{
    $attributes = $this->attr;

    return $attributes;
}
| 1269 | + | ||
/**
 * Method form of the magic getter. Useful for names that cannot be written
 * as a property, such as attributes containing a colon.
 *
 * @param string $name attribute or virtual property name
 * @return mixed whatever __get() returns for the name
 */
function getAttribute($name)
{
    $value = $this->__get($name);

    return $value;
}
| 1274 | + | ||
/**
 * Method form of the magic setter; the setter's return value is discarded.
 *
 * @param string $name  attribute or virtual property name
 * @param mixed  $value new value
 */
function setAttribute($name, $value)
{
    // Delegates to the magic setter and returns nothing
    $this->__set($name, $value);
}
| 1279 | + | ||
/**
 * Method form of the magic isset().
 *
 * @param string $name attribute or virtual property name
 * @return bool
 */
function hasAttribute($name)
{
    $exists = $this->__isset($name);

    return $exists;
}
| 1284 | + | ||
/**
 * Clear an attribute by assigning null through the magic setter.
 *
 * The attribute key stays in the attribute array with a null value, so
 * hasAttribute() keeps reporting it.
 *
 * @param string $name attribute name
 */
function removeAttribute($name)
{
    $cleared = null;
    $this->__set($name, $cleared);
}
| 1289 | + | ||
/**
 * Detach this node by asking its parent to remove it.
 * Does nothing when the node has no parent.
 */
function remove()
{
    if (!$this->parent) {
        return;
    }

    $this->parent->removeChild($this);
}
| 1296 | + | ||
/**
 * Remove a child node and everything below it.
 *
 * Nothing happens unless the node is registered in all three places: this
 * node's node list, this node's child list and the DOM's node list. The
 * child's own children are removed recursively, its remaining nodes are
 * unregistered, and finally the child is unregistered and cleared.
 *
 * @param simple_html_dom_node $node child to remove
 */
function removeChild($node)
{
    $own_node_key = array_search($node, $this->nodes, true);
    $own_child_key = array_search($node, $this->children, true);
    $dom_node_key = array_search($node, $this->dom->nodes, true);

    if ($own_node_key === false
    || $own_child_key === false
    || $dom_node_key === false) {
        return;
    }

    // Depth first: element children go before the node itself
    foreach ($node->children as $grandchild) {
        $node->removeChild($grandchild);
    }

    // Whatever is still listed in the child's node list is unregistered too
    foreach ($node->nodes as $leftover) {
        $local_key = array_search($leftover, $node->nodes, true);
        $global_key = array_search($leftover, $node->dom->nodes, true);

        if ($local_key !== false && $global_key !== false) {
            unset($node->nodes[$local_key]);
            unset($node->dom->nodes[$global_key]);
        }
    }

    unset($this->nodes[$own_node_key]);
    unset($this->children[$own_child_key]);
    unset($this->dom->nodes[$dom_node_key]);

    $node->clear();
}
| 1327 | + | ||
/**
 * DOM-style lookup of the first element matching an id.
 *
 * @param string $id id without the leading "#"
 * @return mixed result of find() for index 0
 */
function getElementById($id)
{
    $selector = "#$id";

    return $this->find($selector, 0);
}
| 1332 | + | ||
/**
 * DOM-style lookup of elements matching an id.
 *
 * @param string   $id  id without the leading "#"
 * @param int|null $idx index passed through to find(); null by default
 * @return mixed result of find()
 */
function getElementsById($id, $idx = null)
{
    $selector = "#$id";

    return $this->find($selector, $idx);
}
| 1337 | + | ||
/**
 * DOM-style lookup of the first element matching a tag name.
 *
 * @param string $name passed to find() as the selector
 * @return mixed result of find() for index 0
 */
function getElementByTagName($name)
{
    $first_match = $this->find($name, 0);

    return $first_match;
}
| 1342 | + | ||
/**
 * DOM-style lookup of elements matching a tag name.
 *
 * @param string   $name passed to find() as the selector
 * @param int|null $idx  index passed through to find(); null by default
 * @return mixed result of find()
 */
function getElementsByTagName($name, $idx = null)
{
    $matches = $this->find($name, $idx);

    return $matches;
}
| 1347 | + | ||
/**
 * DOM-style alias that forwards to parent() without arguments.
 *
 * @return mixed result of parent()
 */
function parentNode()
{
    $result = $this->parent();

    return $result;
}
| 1352 | + | ||
/**
 * DOM-style alias that forwards to children().
 *
 * @param int $idx index passed through to children(); -1 by default
 * @return mixed result of children()
 */
function childNodes($idx = -1)
{
    $result = $this->children($idx);

    return $result;
}
| 1357 | + | ||
/**
 * DOM-style alias that forwards to first_child().
 *
 * @return mixed result of first_child()
 */
function firstChild()
{
    $result = $this->first_child();

    return $result;
}
| 1362 | + | ||
/**
 * DOM-style alias that forwards to last_child().
 *
 * @return mixed result of last_child()
 */
function lastChild()
{
    $result = $this->last_child();

    return $result;
}
| 1367 | + | ||
/**
 * DOM-style alias that forwards to next_sibling().
 *
 * @return mixed result of next_sibling()
 */
function nextSibling()
{
    $result = $this->next_sibling();

    return $result;
}
| 1372 | + | ||
/**
 * DOM-style alias that forwards to prev_sibling().
 *
 * @return mixed result of prev_sibling()
 */
function previousSibling()
{
    $result = $this->prev_sibling();

    return $result;
}
| 1377 | + | ||
/**
 * DOM-style alias that forwards to has_child().
 *
 * @return mixed result of has_child()
 */
function hasChildNodes()
{
    $result = $this->has_child();

    return $result;
}
| 1382 | + | ||
/**
 * DOM-style accessor for this node's tag property.
 *
 * @return mixed the value of $this->tag
 */
function nodeName()
{
    $tag_name = $this->tag;

    return $tag_name;
}
| 1387 | + | ||
/**
 * DOM-style append: hands this node to the given node's parent() method
 * (presumably re-parenting it under this node; parent() is defined outside
 * this section) and returns the given node.
 *
 * @param simple_html_dom_node $node node to attach
 * @return simple_html_dom_node the same node
 */
function appendChild($node)
{
    $new_parent = $this;
    $node->parent($new_parent);

    return $node;
}
| 1393 | + | ||
| 1394 | +} | ||
| 1395 | + | ||
| 1396 | +class simple_html_dom | ||
| 1397 | +{ | ||
// Root node of the parsed tree; prepare() creates it with tag 'root' and
// clear() releases it.
public $root = null;
// Nodes registered with this document; reset by prepare(), walked by clear().
public $nodes = array();
// Callback stored by set_callback() and reset to null by remove_callback().
public $callback = null;
// Lowercase flag stored by prepare(); the node's parse_selector() reads it to
// decide whether tag names in selectors are lowercased.
public $lowercase = false;
// Byte length of the trimmed input as recorded by prepare().
public $original_size;
// Byte length of $doc; load() recomputes it after replacing line breaks.
public $size;

// Parser position; prepare() resets it to 0.
protected $pos;
// Working copy of the document text (trimmed input).
protected $doc;
// prepare() stores the first character of $doc here; presumably the character
// at $pos while parsing (the tokenizer is outside this section).
protected $char;

// Node counter: prepare() starts it at 1 and parse() increments it for every
// text node it creates.
protected $cursor;
// prepare() points this at the root node; presumably the node currently
// receiving children during parsing.
protected $parent;
// Reset to an empty array by prepare(); presumably filled by remove_noise().
protected $noise = array();
// Character sets, presumably delimiters for the tokenizer (their use is
// outside this section).
protected $token_blank = " \t\r\n";
protected $token_equal = ' =/>';
protected $token_slash = " />\r\n\t";
protected $token_attr = ' >';

// Charset of the document; the node's convert_text() reads it as the source
// charset.
public $_charset = '';
// Charset for returned text; set by the constructor and read by the node's
// convert_text() as the conversion target.
public $_target_charset = '';

// Value of the $defaultBRText argument, stored by prepare().
protected $default_br_text = '';

// Value of the $defaultSpanText argument, stored by prepare().
public $default_span_text = '';

// Lookup table keyed by tag name: elements that never have a closing tag.
protected $self_closing_tags = array(
    'area' => 1,
    'base' => 1,
    'br' => 1,
    'col' => 1,
    'embed' => 1,
    'hr' => 1,
    'img' => 1,
    'input' => 1,
    'link' => 1,
    'meta' => 1,
    'param' => 1,
    'source' => 1,
    'track' => 1,
    'wbr' => 1
);
// Lookup table keyed by tag name (its use is outside this section).
protected $block_tags = array(
    'body' => 1,
    'div' => 1,
    'form' => 1,
    'root' => 1,
    'span' => 1,
    'table' => 1
);
// Maps a tag name to a set of tag names. NOTE(review): presumably the open
// tags that get closed implicitly when the key tag starts - confirm against
// the tag reader.
protected $optional_closing_tags = array(
    // Not optional, see
    // https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element
    'b' => array('b' => 1),
    'dd' => array('dd' => 1, 'dt' => 1),
    // Not optional, see
    // https://www.w3.org/TR/html/grouping-content.html#the-dl-element
    'dl' => array('dd' => 1, 'dt' => 1),
    'dt' => array('dd' => 1, 'dt' => 1),
    'li' => array('li' => 1),
    'optgroup' => array('optgroup' => 1, 'option' => 1),
    'option' => array('optgroup' => 1, 'option' => 1),
    'p' => array('p' => 1),
    'rp' => array('rp' => 1, 'rt' => 1),
    'rt' => array('rp' => 1, 'rt' => 1),
    'td' => array('td' => 1, 'th' => 1),
    'th' => array('td' => 1, 'th' => 1),
    'tr' => array('td' => 1, 'th' => 1, 'tr' => 1),
);
| 1467 | + | ||
/**
 * Create a DOM and optionally load a document right away.
 *
 * When $str starts with "http://" or names an existing file it is loaded
 * through load_file(); any other non-empty string is parsed as markup.
 *
 * @param string|null $str             markup, file path or http URL
 * @param bool        $lowercase       passed to load() for markup input
 * @param bool        $forceTagsClosed false empties the optional-closing-tag
 * table before anything is parsed
 * @param string      $target_charset  charset for returned text
 * @param bool        $stripRN         passed to load() for markup input
 * @param string      $defaultBRText   passed to load() for markup input
 * @param string      $defaultSpanText passed to load() for markup input
 * @param int         $options         passed to load() for markup input
 */
function __construct(
    $str = null,
    $lowercase = true,
    $forceTagsClosed = true,
    $target_charset = DEFAULT_TARGET_CHARSET,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT,
    $options = 0)
{
    // Forcing tags to be closed implies that we don't trust the html, but
    // it can lead to parsing errors if we SHOULD trust the html.
    // This must use the declared property and run before loading; otherwise
    // the setting cannot influence the parse triggered below.
    // NOTE(review): the tag reader is outside this section - confirm it
    // consults optional_closing_tags.
    if (!$forceTagsClosed) {
        $this->optional_closing_tags = array();
    }

    if ($str) {
        if (preg_match('/^http:\/\//i', $str) || is_file($str)) {
            $this->load_file($str);
        } else {
            $this->load(
                $str,
                $lowercase,
                $stripRN,
                $defaultBRText,
                $defaultSpanText,
                $options
            );
        }
    }

    $this->_target_charset = $target_charset;
}
| 1500 | + | ||
/**
 * Release node references through clear() when the DOM object is destroyed.
 */
function __destruct()
{
    // Same cleanup as an explicit clear() call
    $this->clear();
}
| 1505 | + | ||
/**
 * Parse a markup string into this DOM.
 *
 * After prepare() has reset the object, several kinds of content are handed
 * to remove_noise() in a fixed order (scripts, CDATA, comments, styles,
 * <code> blocks, server-side scripts and optionally Smarty tags). Then the
 * document is parsed and the charset is detected.
 *
 * @param string $str             markup to parse
 * @param bool   $lowercase       passed to prepare()
 * @param bool   $stripRN         replace every \r and \n with a space
 * @param string $defaultBRText   passed to prepare()
 * @param string $defaultSpanText passed to prepare()
 * @param int    $options         bit mask; HDOM_SMARTY_AS_TEXT also removes
 * Smarty tags
 * @return simple_html_dom this object, so calls can be chained
 */
function load(
    $str,
    $lowercase = true,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT,
    $options = 0)
{
    $this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);

    // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
    // script tags are removed before style tags.
    $script_patterns = array(
        "'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is",
        "'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is",
    );

    foreach ($script_patterns as $script_pattern) {
        $this->remove_noise($script_pattern);
    }

    if ($stripRN) {
        $this->doc = str_replace(array("\r", "\n"), ' ', $this->doc);

        // The content changed, so the recorded length has to follow
        $this->size = strlen($this->doc);
    }

    // CDATA sections
    $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true);
    // Comments
    $this->remove_noise("'<!--(.*?)-->'is");
    // <style> tags
    $this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is");
    $this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is");
    // Preformatted <code> tags
    $this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is");
    // Server side scripts
    $this->remove_noise("'(<\?)(.*?)(\?>)'s", true);

    // Smarty scripts, only on request
    if ($options & HDOM_SMARTY_AS_TEXT) {
        $this->remove_noise("'(\{\w)(.*?)(\})'s", true);
    }

    $this->parse();

    // The root node ends at the final cursor value
    $this->root->_[HDOM_INFO_END] = $this->cursor;
    $this->parse_charset();

    return $this;
}
| 1559 | + | ||
/**
 * Read a document with file_get_contents() and parse it.
 *
 * All arguments are forwarded unchanged to file_get_contents(), so a path or
 * URL plus any of its optional parameters may be given.
 *
 * @return simple_html_dom|false this object (the result of load(), so calls
 * can be chained) on success, false when the content could not be read
 */
function load_file()
{
    $args = func_get_args();

    $doc = call_user_func_array('file_get_contents', $args);

    if ($doc === false) {
        return false;
    }

    // Return the load() result so success is distinguishable from "no value"
    return $this->load($doc, true);
}
| 1570 | + | ||
/**
 * Store a callback on this DOM.
 *
 * @param callable|string $function_name value assigned to $this->callback
 */
function set_callback($function_name)
{
    // Stored as given; no validation happens here
    $this->callback = $function_name;
}
| 1575 | + | ||
/**
 * Drop the callback stored by set_callback().
 */
function remove_callback()
{
    // Back to the property's declared default
    $this->callback = null;
}
| 1580 | + | ||
/**
 * Return the root node's inner text and optionally write it to a file.
 *
 * @param string $filepath target path; '' (default) skips the write
 * @return string the inner text of the root node
 */
function save($filepath = '')
{
    $markup = $this->root->innertext();

    if ($filepath === '') {
        return $markup;
    }

    // The exclusive lock is held for the duration of the write
    file_put_contents($filepath, $markup, LOCK_EX);

    return $markup;
}
| 1587 | + | ||
/**
 * Run a selector query starting at the root node.
 *
 * @param string   $selector  selector text
 * @param int|null $idx       passed through to the root node's find()
 * @param bool     $lowercase passed through to the root node's find()
 * @return mixed result of the root node's find()
 */
function find($selector, $idx = null, $lowercase = false)
{
    $root_node = $this->root;

    return $root_node->find($selector, $idx, $lowercase);
}
| 1592 | + | ||
/**
 * Release everything this DOM holds so memory can be reclaimed.
 *
 * Every entry of the node list (and of a children list, if one is set) is
 * cleared; the parent and root nodes are cleared and unset; the document
 * text and the noise table are unset.
 */
function clear()
{
    // The children pass is documented in the sourceforge repository
    // (2977248) as a fix for memory leaks that occur even with clear().
    foreach (array('nodes', 'children') as $list_property) {
        if (!isset($this->$list_property)) {
            continue;
        }

        foreach ($this->$list_property as $entry) {
            $entry->clear();
            $entry = null;
        }
    }

    foreach (array('parent', 'root') as $node_property) {
        if (isset($this->$node_property)) {
            $this->$node_property->clear();
            unset($this->$node_property);
        }
    }

    unset($this->doc, $this->noise);
}
| 1625 | + | ||
/**
 * Debug helper that forwards to the root node's dump().
 *
 * @param bool $show_attr passed through to the root node's dump()
 */
function dump($show_attr = true)
{
    $root_node = $this->root;
    $root_node->dump($show_attr);
}
| 1630 | + | ||
/**
 * Reset the parser state for a new document.
 *
 * Clears the previous document, stores the trimmed input with its length,
 * resets position, cursor, noise table and node list, stores the options and
 * creates a fresh root node that also becomes the current parent.
 *
 * @param string $str             markup to parse
 * @param bool   $lowercase       stored in $this->lowercase
 * @param string $defaultBRText   stored in $this->default_br_text
 * @param string $defaultSpanText stored in $this->default_span_text
 */
protected function prepare(
    $str, $lowercase = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT)
{
    $this->clear();

    $document = trim($str);
    $length = strlen($document);

    $this->doc = $document;
    $this->size = $length;
    $this->original_size = $length; // length before any noise removal
    $this->pos = 0;
    $this->cursor = 1;
    $this->noise = array();
    $this->nodes = array();
    $this->lowercase = $lowercase;
    $this->default_br_text = $defaultBRText;
    $this->default_span_text = $defaultSpanText;

    $this->root = new simple_html_dom_node($this);
    $root_node = $this->root;
    $root_node->tag = 'root';
    $root_node->_[HDOM_INFO_BEGIN] = -1;
    $root_node->nodetype = HDOM_TYPE_ROOT;
    $this->parent = $root_node;

    // An empty document has no first character to store
    if ($length > 0) {
        $this->char = $document[0];
    }
}
| 1655 | + | ||
/**
 * Main parsing loop: alternates between text runs and tags.
 *
 * Text found before the next "<" becomes a text node linked into the tree.
 * When there is no such text, the next tag is read; the loop ends as soon as
 * read_tag() reports that nothing more could be read.
 *
 * @return bool always true
 */
protected function parse()
{
    for (;;) {
        $text = $this->copy_until_char('<');

        if ($text !== '') {
            // Text between tags becomes its own node
            $text_node = new simple_html_dom_node($this);
            ++$this->cursor;
            $text_node->_[HDOM_INFO_TEXT] = $text;
            $this->link_nodes($text_node, false);
            continue;
        }

        // No text before the next "<": read the tag, stop when there is none
        if (!$this->read_tag()) {
            return true;
        }
    }
}
| 1676 | + | ||
| 1677 | + protected function parse_charset() | ||
| 1678 | + { | ||
| 1679 | + global $debug_object; | ||
| 1680 | + | ||
| 1681 | + $charset = null; | ||
| 1682 | + | ||
| 1683 | + if (function_exists('get_last_retrieve_url_contents_content_type')) { | ||
| 1684 | + $contentTypeHeader = get_last_retrieve_url_contents_content_type(); | ||
| 1685 | + $success = preg_match('/charset=(.+)/', $contentTypeHeader, $matches); | ||
| 1686 | + if ($success) { | ||
| 1687 | + $charset = $matches[1]; | ||
| 1688 | + if (is_object($debug_object)) { | ||
| 1689 | + $debug_object->debug_log(2, | ||
| 1690 | + 'header content-type found charset of: ' | ||
| 1691 | + . $charset | ||
| 1692 | + ); | ||
| 1693 | + } | ||
| 1694 | + } | ||
| 1695 | + } | ||
| 1696 | + | ||
| 1697 | + if (empty($charset)) { | ||
| 1698 | + // https://www.w3.org/TR/html/document-metadata.html#statedef-http-equiv-content-type | ||
| 1699 | + $el = $this->root->find('meta[http-equiv=Content-Type]', 0, true); | ||
| 1700 | + | ||
| 1701 | + if (!empty($el)) { | ||
| 1702 | + $fullvalue = $el->content; | ||
| 1703 | + if (is_object($debug_object)) { | ||
| 1704 | + $debug_object->debug_log(2, | ||
| 1705 | + 'meta content-type tag found' | ||
| 1706 | + . $fullvalue | ||
| 1707 | + ); | ||
| 1708 | + } | ||
| 1709 | + | ||
| 1710 | + if (!empty($fullvalue)) { | ||
| 1711 | + $success = preg_match( | ||
| 1712 | + '/charset=(.+)/i', | ||
| 1713 | + $fullvalue, | ||
| 1714 | + $matches | ||
| 1715 | + ); | ||
| 1716 | + | ||
| 1717 | + if ($success) { | ||
| 1718 | + $charset = $matches[1]; | ||
| 1719 | + } else { | ||
| 1720 | + // If there is a meta tag, and they don't specify the | ||
| 1721 | + // character set, research says that it's typically | ||
| 1722 | + // ISO-8859-1 | ||
| 1723 | + if (is_object($debug_object)) { | ||
| 1724 | + $debug_object->debug_log(2, | ||
| 1725 | + 'meta content-type tag couldn\'t be parsed. using iso-8859 default.' | ||
| 1726 | + ); | ||
| 1727 | + } | ||
| 1728 | + | ||
| 1729 | + $charset = 'ISO-8859-1'; | ||
| 1730 | + } | ||
| 1731 | + } | ||
| 1732 | + } | ||
| 1733 | + } | ||
| 1734 | + | ||
| 1735 | + if (empty($charset)) { | ||
| 1736 | + // https://www.w3.org/TR/html/document-metadata.html#character-encoding-declaration | ||
| 1737 | + if ($meta = $this->root->find('meta[charset]', 0)) { | ||
| 1738 | + $charset = $meta->charset; | ||
| 1739 | + if (is_object($debug_object)) { | ||
| 1740 | + $debug_object->debug_log(2, 'meta charset: ' . $charset); | ||
| 1741 | + } | ||
| 1742 | + } | ||
| 1743 | + } | ||
| 1744 | + | ||
| 1745 | + if (empty($charset)) { | ||
| 1746 | + // Try to guess the charset based on the content | ||
| 1747 | + // Requires Multibyte String (mbstring) support (optional) | ||
| 1748 | + if (function_exists('mb_detect_encoding')) { | ||
| 1749 | + /** | ||
| 1750 | + * mb_detect_encoding() is not intended to distinguish between | ||
| 1751 | + * charsets, especially single-byte charsets. Its primary | ||
| 1752 | + * purpose is to detect which multibyte encoding is in use, | ||
| 1753 | + * i.e. UTF-8, UTF-16, shift-JIS, etc. | ||
| 1754 | + * | ||
| 1755 | + * -- https://bugs.php.net/bug.php?id=38138 | ||
| 1756 | + * | ||
| 1757 | + * Adding both CP1251/ISO-8859-5 and CP1252/ISO-8859-1 will | ||
| 1758 | + * always result in CP1251/ISO-8859-5 and vice versa. | ||
| 1759 | + * | ||
| 1760 | + * Thus, only detect if it's either UTF-8 or CP1252/ISO-8859-1 | ||
| 1761 | + * to stay compatible. | ||
| 1762 | + */ | ||
| 1763 | + $encoding = mb_detect_encoding( | ||
| 1764 | + $this->doc, | ||
| 1765 | + array( 'UTF-8', 'CP1252', 'ISO-8859-1' ) | ||
| 1766 | + ); | ||
| 1767 | + | ||
| 1768 | + if ($encoding === 'CP1252' || $encoding === 'ISO-8859-1') { | ||
| 1769 | + // Due to a limitation of mb_detect_encoding | ||
| 1770 | + // 'CP1251'/'ISO-8859-5' will be detected as | ||
| 1771 | + // 'CP1252'/'ISO-8859-1'. This will cause iconv to fail, in | ||
| 1772 | + // which case we can simply assume it is the other charset. | ||
| 1773 | + if (!@iconv('CP1252', 'UTF-8', $this->doc)) { | ||
| 1774 | + $encoding = 'CP1251'; | ||
| 1775 | + } | ||
| 1776 | + } | ||
| 1777 | + | ||
| 1778 | + if ($encoding !== false) { | ||
| 1779 | + $charset = $encoding; | ||
| 1780 | + if (is_object($debug_object)) { | ||
| 1781 | + $debug_object->debug_log(2, 'mb_detect: ' . $charset); | ||
| 1782 | + } | ||
| 1783 | + } | ||
| 1784 | + } | ||
| 1785 | + } | ||
| 1786 | + | ||
| 1787 | + if (empty($charset)) { | ||
| 1788 | + // Assume it's UTF-8 as it is the most likely charset to be used | ||
| 1789 | + $charset = 'UTF-8'; | ||
| 1790 | + if (is_object($debug_object)) { | ||
| 1791 | + $debug_object->debug_log(2, 'No match found, assume ' . $charset); | ||
| 1792 | + } | ||
| 1793 | + } | ||
| 1794 | + | ||
| 1795 | + // Since CP1252 is a superset, if we get one of it's subsets, we want | ||
| 1796 | + // it instead. | ||
| 1797 | + if ((strtolower($charset) == 'iso-8859-1') | ||
| 1798 | + || (strtolower($charset) == 'latin1') | ||
| 1799 | + || (strtolower($charset) == 'latin-1')) { | ||
| 1800 | + $charset = 'CP1252'; | ||
| 1801 | + if (is_object($debug_object)) { | ||
| 1802 | + $debug_object->debug_log(2, | ||
| 1803 | + 'replacing ' . $charset . ' with CP1252 as its a superset' | ||
| 1804 | + ); | ||
| 1805 | + } | ||
| 1806 | + } | ||
| 1807 | + | ||
| 1808 | + if (is_object($debug_object)) { | ||
| 1809 | + $debug_object->debug_log(1, 'EXIT - ' . $charset); | ||
| 1810 | + } | ||
| 1811 | + | ||
| 1812 | + return $this->_charset = $charset; | ||
| 1813 | + } | ||
| 1814 | + | ||
/**
 * Read one tag at the current parser position and link it into the DOM.
 *
 * Handled in this order: end tags ("</x>"), declarations and comments
 * ("<!...>"), a "<" nested inside a tag name, invalid tag names, and finally
 * regular start tags together with their attributes. Markup that cannot be
 * interpreted as a tag is linked to the current parent as a non-child node
 * that carries the raw text.
 *
 * @return bool false when the current character is not "<" (the end position
 * of the root node is recorded in that case), true once input was consumed.
 */
protected function read_tag()
{
    // Set end position if no further tags found
    if ($this->char !== '<') {
        $this->root->_[HDOM_INFO_END] = $this->cursor;
        return false;
    }

    $begin_tag_pos = $this->pos;
    $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

    // end tag
    if ($this->char === '/') {
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

        // Skip whitespace in end tags (i.e. in "</ html>")
        $this->skip($this->token_blank);
        $tag = $this->copy_until_char('>');

        // Skip attributes in end tags
        if (($pos = strpos($tag, ' ')) !== false) {
            $tag = substr($tag, 0, $pos);
        }

        $parent_lower = strtolower($this->parent->tag);
        $tag_lower = strtolower($tag);

        // The end tag is supposed to close the parent tag. Handle situations
        // when it doesn't
        if ($parent_lower !== $tag_lower) {
            // Parent tag does not have to be closed necessarily (optional closing tag)
            // Current tag is a block tag, so it may close an ancestor
            if (isset($this->optional_closing_tags[$parent_lower])
                && isset($this->block_tags[$tag_lower])) {

                $this->parent->_[HDOM_INFO_END] = 0;
                $org_parent = $this->parent;

                // Traverse ancestors to find a matching opening tag
                // Stop at root node
                while (($this->parent->parent)
                    && strtolower($this->parent->tag) !== $tag_lower
                ) {
                    $this->parent = $this->parent->parent;
                }

                // If we don't have a match add current tag as text node
                if (strtolower($this->parent->tag) !== $tag_lower) {
                    $this->parent = $org_parent; // restore original parent

                    if ($this->parent->parent) {
                        $this->parent = $this->parent->parent;
                    }

                    $this->parent->_[HDOM_INFO_END] = $this->cursor;
                    return $this->as_text_node($tag);
                }
            } elseif (($this->parent->parent)
                && isset($this->block_tags[$tag_lower])
            ) {
                // Grandparent exists and current tag is a block tag, so our
                // parent doesn't have an end tag
                $this->parent->_[HDOM_INFO_END] = 0; // No end tag
                $org_parent = $this->parent;

                // Traverse ancestors to find a matching opening tag
                // Stop at root node
                while (($this->parent->parent)
                    && strtolower($this->parent->tag) !== $tag_lower
                ) {
                    $this->parent = $this->parent->parent;
                }

                // If we don't have a match add current tag as text node
                if (strtolower($this->parent->tag) !== $tag_lower) {
                    $this->parent = $org_parent; // restore original parent
                    $this->parent->_[HDOM_INFO_END] = $this->cursor;
                    return $this->as_text_node($tag);
                }
            } elseif (($this->parent->parent)
                && strtolower($this->parent->parent->tag) === $tag_lower
            ) { // Grandparent exists and current tag closes it
                $this->parent->_[HDOM_INFO_END] = 0;
                $this->parent = $this->parent->parent;
            } else { // Random tag, add as text node
                return $this->as_text_node($tag);
            }
        }

        // Set end position of parent tag to current cursor position
        $this->parent->_[HDOM_INFO_END] = $this->cursor;

        // The closed element is done; continue inside its parent (if any)
        if ($this->parent->parent) {
            $this->parent = $this->parent->parent;
        }

        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // start tag
    $node = new simple_html_dom_node($this);
    $node->_[HDOM_INFO_BEGIN] = $this->cursor;
    ++$this->cursor;
    $tag = $this->copy_until($this->token_slash); // Get tag name
    $node->tag_start = $begin_tag_pos;

    // doctype, cdata & comments...
    // <!DOCTYPE html>
    // <![CDATA[ ... ]]>
    // <!-- Comment -->
    if (isset($tag[0]) && $tag[0] === '!') {
        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');

        if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') { // Comment ("<!--")
            $node->nodetype = HDOM_TYPE_COMMENT;
            $node->tag = 'comment';
        } else { // Could be doctype or CDATA but we don't care
            $node->nodetype = HDOM_TYPE_UNKNOWN;
            $node->tag = 'unknown';
        }

        if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }

        $this->link_nodes($node, true);
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // The start tag cannot contain another start tag, if so add as text
    // i.e. "<<html>"
    // (The original condition read "$pos = strpos(...) !== false", which
    // stored a boolean in $pos because "!==" binds tighter than "=". The
    // value was never used, so only the test itself is kept.)
    if (strpos($tag, '<') !== false) {
        $tag = '<' . substr($tag, 0, -1);
        $node->_[HDOM_INFO_TEXT] = $tag;
        $this->link_nodes($node, false);
        $this->char = $this->doc[--$this->pos]; // prev
        return true;
    }

    // Handle invalid tag names (i.e. "<html#doc>")
    if (!preg_match('/^\w[\w:-]*$/', $tag)) {
        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');

        // Next char is the beginning of a new tag, don't touch it.
        if ($this->char === '<') {
            $this->link_nodes($node, false);
            return true;
        }

        // Next char closes current tag, add and be done with it.
        if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }
        $this->link_nodes($node, false);
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // begin tag, add new node
    $node->nodetype = HDOM_TYPE_ELEMENT;
    $tag_lower = strtolower($tag);
    $node->tag = ($this->lowercase) ? $tag_lower : $tag;

    // handle optional closing tags
    if (isset($this->optional_closing_tags[$tag_lower])) {
        // Traverse ancestors to close all optional closing tags
        while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
            $this->parent->_[HDOM_INFO_END] = 0;
            $this->parent = $this->parent->parent;
        }
        $node->parent = $this->parent;
    }

    $guard = 0; // prevent infinite loop

    // [0] Space between tag and first attribute
    $space = array($this->copy_skip($this->token_blank), '', '');

    // attributes
    do {
        // Everything until the first equal sign should be the attribute name
        $name = $this->copy_until($this->token_equal);

        if ($name === '' && $this->char !== null && $space[0] === '') {
            break;
        }

        if ($guard === $this->pos) { // Escape infinite loop
            // No progress since the previous iteration: force one character forward
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            continue;
        }

        $guard = $this->pos;

        // handle endless '<'
        // Out of bounds before the tag ended
        if ($this->pos >= $this->size - 1 && $this->char !== '>') {
            $node->nodetype = HDOM_TYPE_TEXT;
            $node->_[HDOM_INFO_END] = 0;
            $node->_[HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name;
            $node->tag = 'text';
            $this->link_nodes($node, false);
            return true;
        }

        // handle mismatch '<'
        // Attributes cannot start after opening tag
        if ($this->doc[$this->pos - 1] == '<') {
            $node->nodetype = HDOM_TYPE_TEXT;
            $node->tag = 'text';
            $node->attr = array();
            $node->_[HDOM_INFO_END] = 0;
            $node->_[HDOM_INFO_TEXT] = substr(
                $this->doc,
                $begin_tag_pos,
                $this->pos - $begin_tag_pos - 1
            );
            // Step back so that the stray '<' becomes the current character again
            $this->pos -= 2;
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            $this->link_nodes($node, false);
            return true;
        }

        if ($name !== '/' && $name !== '') { // this is an attribute name
            // [1] Whitespace after attribute name
            $space[1] = $this->copy_skip($this->token_blank);

            $name = $this->restore_noise($name); // might be a noisy name

            if ($this->lowercase) { $name = strtolower($name); }

            if ($this->char === '=') { // attribute with value
                $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
                $this->parse_attr($node, $name, $space); // get attribute value
            } else {
                // no value attr: nowrap, checked selected...
                $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
                $node->attr[$name] = true;
                if ($this->char != '>') { $this->char = $this->doc[--$this->pos]; } // prev
            }

            $node->_[HDOM_INFO_SPACE][] = $space;

            // prepare for next attribute
            $space = array(
                $this->copy_skip($this->token_blank),
                '',
                ''
            );
        } else { // no more attributes
            break;
        }
    } while ($this->char !== '>' && $this->char !== '/'); // go until the tag ended

    $this->link_nodes($node, true);
    $node->_[HDOM_INFO_ENDSPACE] = $space[0];

    // handle empty tags (i.e. "<div/>")
    if ($this->copy_until_char('>') === '/') {
        $node->_[HDOM_INFO_ENDSPACE] .= '/';
        $node->_[HDOM_INFO_END] = 0;
    } else {
        // reset parent
        if (!isset($this->self_closing_tags[strtolower($node->tag)])) {
            $this->parent = $node;
        }
    }

    $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

    // If it's a BR tag, we need to set its text to the default text.
    // This way when we see it in plaintext, we can generate formatting that the user wants.
    // Since a br tag never has sub nodes, this works well.
    // The stored tag name is compared, so with lowercasing disabled only a
    // literal "br" matches here.
    if ($node->tag === 'br') {
        $node->_[HDOM_INFO_INNER] = $this->default_br_text;
    }

    return true;
}
| 2092 | + | ||
/**
 * Parse the value of an attribute, starting right after its "=" sign.
 *
 * The value may be double-quoted, single-quoted or unquoted. Noise
 * placeholders in the value are restored, "\r" and "\n" are removed, and the
 * value of a "class" attribute is trimmed. Only the first occurrence of an
 * attribute name is stored on the node; the value of a repeated attribute is
 * consumed and discarded.
 *
 * @param object $node Node that receives the attribute and its quote type
 * @param string $name Attribute name
 * @param array $space Spacing record of the attribute; index 2 receives the
 * blanks between "=" and the value (first occurrence only)
 * @return void
 */
protected function parse_attr($node, $name, &$space)
{
    $is_duplicate = isset($node->attr[$name]);

    // Always consume the blanks between "=" and the value. Previously they
    // were only consumed for the first occurrence of an attribute, so a
    // repeated attribute written as 'name = "value"' left the parser on the
    // blank and the quoted value was not consumed here.
    // NOTE(review): the follow-on effect (the value being picked up as an
    // attribute name by the caller) depends on the token sets defined
    // elsewhere in the class - confirm.
    $blank = $this->copy_skip($this->token_blank);

    if (!$is_duplicate) { // Copy whitespace between "=" and value
        $space[2] = $blank;
    }

    switch ($this->char) {
        case '"':
            $quote_type = HDOM_QUOTE_DOUBLE;
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            $value = $this->copy_until_char('"');
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            break;
        case '\'':
            $quote_type = HDOM_QUOTE_SINGLE;
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            $value = $this->copy_until_char('\'');
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            break;
        default:
            $quote_type = HDOM_QUOTE_NO;
            $value = $this->copy_until($this->token_attr);
    }

    $value = $this->restore_noise($value);

    // PaperG: Attributes should not have \r or \n in them, that counts as
    // html whitespace.
    $value = str_replace("\r", '', $value);
    $value = str_replace("\n", '', $value);

    // PaperG: If this is a "class" selector, let's get rid of the preceding
    // and trailing space since some people leave it in the multi class case.
    if ($name === 'class') {
        $value = trim($value);
    }

    // First occurrence wins: a repeated attribute does not overwrite the value
    if (!$is_duplicate) {
        $node->_[HDOM_INFO_QUOTE][] = $quote_type;
        $node->attr[$name] = $value;
    }
}
| 2136 | + | ||
/**
 * Attach a node to the parent the parser is currently working in.
 *
 * The node is always appended to the parent's node list; it is appended to
 * the parent's children list as well when $is_child is truthy.
 *
 * @param object $node Node to attach
 * @param bool $is_child Whether the node also counts as a child element
 * @return void
 */
protected function link_nodes(&$node, $is_child)
{
    $owner = $this->parent;

    $node->parent = $owner;
    $owner->nodes[] = $node;

    if (!$is_child) {
        return;
    }

    $owner->children[] = $node;
}
| 2145 | + | ||
/**
 * Link an end tag as a raw text node and advance by one character.
 *
 * @param string $tag Tag name; stored on the new node as "</$tag>"
 * @return bool Always true
 */
protected function as_text_node($tag)
{
    $text_node = new simple_html_dom_node($this);
    ++$this->cursor;

    $text_node->_[HDOM_INFO_TEXT] = '</' . $tag . '>';
    $this->link_nodes($text_node, false);

    // next
    ++$this->pos;
    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }

    return true;
}
| 2155 | + | ||
/**
 * Advance past the leading run of characters contained in $chars.
 *
 * Afterwards the current character is the one at the new position, or null
 * when the position is at or beyond the end of the document.
 *
 * @param string $chars Set of characters to skip
 * @return void
 */
protected function skip($chars)
{
    $run = strspn($this->doc, $chars, $this->pos);
    $this->pos += $run;

    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }
}
| 2161 | + | ||
/**
 * Advance past the leading run of characters contained in $chars and return
 * the skipped text.
 *
 * @param string $chars Set of characters to skip
 * @return string The skipped characters, '' when nothing was skipped
 */
protected function copy_skip($chars)
{
    $start = $this->pos;
    $run = strspn($this->doc, $chars, $start);

    $this->pos = $start + $run;

    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }

    return ($run === 0) ? '' : substr($this->doc, $start, $run);
}
| 2171 | + | ||
/**
 * Advance to the first character contained in $chars and return the text
 * that was passed over.
 *
 * @param string $chars Set of characters that stop the scan
 * @return string Text between the old position and the stop character
 */
protected function copy_until($chars)
{
    $start = $this->pos;
    $run = strcspn($this->doc, $chars, $start);

    $this->pos = $start + $run;

    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }

    return substr($this->doc, $start, $run);
}
| 2180 | + | ||
/**
 * Advance to the next occurrence of $char and return the text in between.
 *
 * When $char does not occur any more, the remainder of the document is
 * returned and the parser is moved to the end (current character null).
 *
 * @param string $char Character (or string) to search for
 * @return string Text before $char; '' when already at $char or at the end
 */
protected function copy_until_char($char)
{
    if ($this->char === null) {
        return '';
    }

    $start = $this->pos;
    $found = strpos($this->doc, $char, $start);

    if ($found === false) {
        // Not found: hand back everything that is left
        $rest = substr($this->doc, $start, $this->size - $start);
        $this->char = null;
        $this->pos = $this->size;
        return $rest;
    }

    if ($found === $start) {
        return '';
    }

    $this->char = $this->doc[$found];
    $this->pos = $found;

    return substr($this->doc, $start, $found - $start);
}
| 2199 | + | ||
/**
 * Replace every match of $pattern in the document with a placeholder key and
 * remember the replaced text in the noise table.
 *
 * Keys have the form "___noise___" followed by a five character wide number
 * (count of stored noise entries + 1000, space padded).
 *
 * @param string $pattern Regular expression to search the document with
 * @param bool $remove_tag true replaces the entire match, false replaces
 * only the first sub-match
 * @return void
 */
protected function remove_noise($pattern, $remove_tag = false)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    $count = preg_match_all(
        $pattern,
        $this->doc,
        $matches,
        PREG_SET_ORDER | PREG_OFFSET_CAPTURE
    );

    // 0 = entire match, 1 = submatch
    $idx = $remove_tag ? 0 : 1;

    // Walk the matches back to front so earlier offsets stay valid
    for ($i = $count - 1; $i > -1; --$i) {
        $key = '___noise___' . sprintf('% 5d', count($this->noise) + 1000);

        if (is_object($debug_object)) {
            $debug_object->debug_log(2, 'key is: ' . $key);
        }

        list($fragment, $offset) = $matches[$i][$idx];

        $this->noise[$key] = $fragment;
        $this->doc = substr_replace($this->doc, $key, $offset, strlen($fragment));
    }

    // reset the length of content
    $this->size = strlen($this->doc);

    if ($this->size > 0) {
        $this->char = $this->doc[0];
    }
}
| 2231 | + | ||
/**
 * Replace noise placeholders in $text with the text they stand for.
 *
 * A placeholder is "___noise___" followed by five characters. Known keys are
 * replaced by the stored noise, unknown keys by the message
 * "UNDEFINED NOISE FOR KEY: <key>", and a marker without a complete key by
 * "NO NUMERIC NOISE KEY".
 *
 * @param string $text Text that may contain placeholders
 * @return string Text with placeholders resolved
 */
function restore_noise($text)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    // Position from which the text is still to be scanned. Scanning used to
    // restart at 0 after every replacement; because the "undefined key"
    // message contains the marker itself, an unknown key in the input made
    // the loop run forever while the text kept growing.
    $offset = 0;

    while (($pos = strpos($text, '___noise___', $offset)) !== false) {
        // Sometimes there is a broken piece of markup, and we don't GET the
        // pos+11 etc... token which indicates a problem outside of us...
        if (strlen($text) > $pos + 15) {
            $key = '___noise___'
            . $text[$pos + 11]
            . $text[$pos + 12]
            . $text[$pos + 13]
            . $text[$pos + 14]
            . $text[$pos + 15];

            if (is_object($debug_object)) {
                $debug_object->debug_log(2, 'located key of: ' . $key);
            }

            if (isset($this->noise[$key])) {
                $text = substr($text, 0, $pos)
                . $this->noise[$key]
                . substr($text, $pos + 16);

                // Scan the restored text too, since it may hold further
                // placeholders. Ten characters of lookbehind also catch a
                // marker that spans the seam with the preceding text.
                $offset = max($offset, $pos - 10);
            } else {
                $message = 'UNDEFINED NOISE FOR KEY: ' . $key;

                $text = substr($text, 0, $pos)
                . $message
                . substr($text, $pos + 16);

                // Resume behind the message; it repeats the unknown key and
                // must not be matched again.
                $offset = $pos + strlen($message);
            }
        } else {
            // There is no valid key being given back to us... We must get
            // rid of the ___noise___ or we will have a problem.
            $text = substr($text, 0, $pos)
            . 'NO NUMERIC NOISE KEY'
            . substr($text, $pos + 11);

            $offset = max($offset, $pos - 10);
        }
    }

    return $text;
}
| 2277 | + | ||
/**
 * Find the first stored noise entry that contains $text.
 *
 * @param string $text Text to look for inside the noise entries
 * @return string|null The matching noise entry; nothing is returned when no
 * entry contains $text
 */
function search_noise($text)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    foreach ($this->noise as $candidate) {
        if (strpos($candidate, $text) === false) {
            continue;
        }

        return $candidate;
    }
}
| 2289 | + | ||
/**
 * String conversion: the inner text of the root node.
 *
 * @return string
 */
function __toString()
{
    $root = $this->root;

    return $root->innertext();
}
| 2294 | + | ||
/**
 * Magic read access to the virtual properties of the document.
 *
 * "outertext" and "innertext" both yield the root's inner text, "plaintext"
 * the root's text, "charset" and "target_charset" the stored charsets. Any
 * other name yields nothing.
 *
 * @param string $name Property name
 * @return mixed
 */
function __get($name)
{
    switch ($name) {
        case 'charset':
            return $this->_charset;
        case 'target_charset':
            return $this->_target_charset;
        case 'plaintext':
            return $this->root->text();
        case 'outertext':
        case 'innertext':
            return $this->root->innertext();
    }
}
| 2310 | + | ||
/**
 * Forward to childNodes() of the root node.
 *
 * @param int $idx Passed through unchanged (default -1)
 * @return mixed Whatever the root node returns
 */
function childNodes($idx = -1)
{
    $root = $this->root;

    return $root->childNodes($idx);
}
| 2315 | + | ||
/**
 * Forward to first_child() of the root node.
 *
 * @return mixed Whatever the root node returns
 */
function firstChild()
{
    $root = $this->root;

    return $root->first_child();
}
| 2320 | + | ||
/**
 * Forward to last_child() of the root node.
 *
 * @return mixed Whatever the root node returns
 */
function lastChild()
{
    $root = $this->root;

    return $root->last_child();
}
| 2325 | + | ||
/**
 * Create an element by parsing "<$name>$value</$name>" as a new document and
 * returning that document's first child.
 *
 * @param string $name Tag name
 * @param string|null $value Inner content placed between the tags
 * @return mixed The first child of the parsed fragment, or null when the
 * fragment could not be parsed into a document object
 */
function createElement($name, $value = null)
{
    // Error suppression is kept from the original implementation
    $dom = @str_get_html("<$name>$value</$name>");

    // Calling a method on a non-object is a fatal error that "@" cannot
    // suppress, so check the parse result first.
    // NOTE(review): str_get_html() is defined outside this block; presumably
    // it returns a non-object when parsing is refused - confirm.
    if (!is_object($dom)) {
        return null;
    }

    return @$dom->firstChild();
}
| 2330 | + | ||
/**
 * Create a text node by parsing $value as a new document and returning the
 * last entry of that document's node list.
 *
 * @param string $value Text content
 * @return mixed The last node of the parsed fragment; false when the node
 * list is empty or the fragment could not be parsed into a document object
 */
function createTextNode($value)
{
    // Error suppression is kept from the original implementation
    $dom = @str_get_html($value);

    // NOTE(review): str_get_html() is defined outside this block; presumably
    // it returns a non-object for input it refuses to parse - confirm.
    if (!is_object($dom)) {
        return false;
    }

    // end() takes its argument by reference, so work on a local copy of the
    // list instead of a property of a temporary.
    $nodes = $dom->nodes;

    return @end($nodes);
}
| 2335 | + | ||
/**
 * Find the first element matching the selector "#$id".
 *
 * @param string $id Element id
 * @return mixed Result of find() for index 0
 */
function getElementById($id)
{
    $selector = '#' . $id;

    return $this->find($selector, 0);
}
| 2340 | + | ||
/**
 * Find the elements matching the selector "#$id".
 *
 * @param string $id Element id
 * @param int|null $idx Passed through to find() unchanged (default null)
 * @return mixed Result of find()
 */
function getElementsById($id, $idx = null)
{
    $selector = '#' . $id;

    return $this->find($selector, $idx);
}
| 2345 | + | ||
/**
 * Find the first element matching the selector $name.
 *
 * @param string $name Tag name, used as the selector as given
 * @return mixed Result of find() for index 0
 */
function getElementByTagName($name)
{
    $first = $this->find($name, 0);

    return $first;
}
| 2350 | + | ||
/**
 * Find the elements matching the selector $name.
 *
 * @param string $name Tag name, used as the selector as given
 * @param int $idx Passed through to find() unchanged (default -1)
 * @return mixed Result of find()
 */
function getElementsByTagName($name, $idx = -1)
{
    $found = $this->find($name, $idx);

    return $found;
}
| 2355 | + | ||
/**
 * camelCase alias of load_file(); accepts the same arguments.
 *
 * @return void
 */
function loadFile()
{
    $args = func_get_args();

    // Forward the arguments one by one. The previous call passed the whole
    // argument array as a single parameter, so load_file() received an array
    // where its first argument was expected.
    // NOTE(review): load_file() is defined outside this block; presumably it
    // takes its arguments positionally - confirm.
    call_user_func_array(array($this, 'load_file'), $args);
}
| 2361 | +} |
| @@ -157,56 +157,33 @@ class TranslateLogic extends BaseLogic | @@ -157,56 +157,33 @@ class TranslateLogic extends BaseLogic | ||
| 157 | * @time :2023/11/22 10:02 | 157 | * @time :2023/11/22 10:02 |
| 158 | */ | 158 | */ |
public function getUrlRead($url){
    // Fetch and parse the page. Calling find() on a non-object would be a
    // fatal error, so the result is checked and reported through the same
    // failure path the previous implementation used for an unreadable URL.
    $dom = file_get_html($url);
    if (!is_object($dom)) {
        $this->fail('当前url不存在');
        // NOTE(review): fail() presumably aborts the request; the return below
        // only matters if it does not - confirm.
        return [];
    }

    $texts = $dom->find("text");
    $description = $dom->find("meta[name=description]", 0);
    $keywords = $dom->find("meta[name=keywords]", 0);

    // Assemble the content to translate: text inside the HTML, meta
    // description and meta keywords
    $need_tran = [];
    foreach ($texts as $text) {
        $parent = $text->parent();
        if (!is_object($parent)) {
            continue;
        }

        // Text directly under script, style or the document root is skipped
        if (in_array($parent->tag, ['script', 'style', 'root'], true)) {
            continue;
        }

        $string = trim($text->text());
        if (empty($string)) {
            continue;
        }

        // Skip text whose grandparent's first <b> element carries a
        // "country-flag" class
        $country_class = '';
        $grandparent = $parent->parent();
        if (is_object($grandparent) && method_exists($grandparent, "find")) {
            $flag = $grandparent->find("b", 0);
            if (is_object($flag) && is_string($flag->class)) {
                $country_class = $flag->class;
            }
        }
        if (FALSE !== strpos($country_class, 'country-flag')) {
            continue;
        }

        $need_tran[] = htmlspecialchars_decode(html_entity_decode($string));
    }

    // Append description and keywords only when the tag exists and has a
    // non-blank content attribute. The tags are optional; reading them
    // unconditionally appended null and raised warnings.
    foreach ([$description, $keywords] as $meta) {
        if (!is_object($meta) || !isset($meta->attr['content'])) {
            continue;
        }

        $content = $meta->attr['content'];
        if (is_string($content) && trim($content) !== '') {
            $need_tran[] = $content;
        }
    }

    // The stale "return $data;" that followed this statement was unreachable
    // and referred to a variable that no longer exists.
    return $need_tran;
}
| 212 | 189 |
-
请 注册 或 登录 后发表评论