<?php
// Node type codes; simple_html_dom_node::$nodetype defaults to HDOM_TYPE_TEXT.
define('HDOM_TYPE_ELEMENT', 1);
define('HDOM_TYPE_COMMENT', 2);
define('HDOM_TYPE_TEXT', 3);
define('HDOM_TYPE_ENDTAG', 4);
define('HDOM_TYPE_ROOT', 5);
define('HDOM_TYPE_UNKNOWN', 6);
// Quote-style codes. The values are not contiguous: HDOM_QUOTE_NO is 3, not 2.
// NOTE(review): presumably these record how an attribute value was quoted in the markup - confirm against the parser.
define('HDOM_QUOTE_DOUBLE', 0);
define('HDOM_QUOTE_SINGLE', 1);
define('HDOM_QUOTE_NO', 3);
// Keys into a node's "info" array (simple_html_dom_node::$_).
define('HDOM_INFO_BEGIN', 0);
define('HDOM_INFO_END', 1);
define('HDOM_INFO_QUOTE', 2);
define('HDOM_INFO_SPACE', 3);
define('HDOM_INFO_TEXT', 4);
define('HDOM_INFO_INNER', 5);
define('HDOM_INFO_OUTER', 6);
define('HDOM_INFO_ENDSPACE',7);
// Defaults for the $target_charset, $defaultBRText and $defaultSpanText parameters
// of file_get_html() and str_get_html().
define('DEFAULT_TARGET_CHARSET', 'UTF-8');
define('DEFAULT_BR_TEXT', "\r\n");
define('DEFAULT_SPAN_TEXT', " ");
// Largest input, measured with strlen(), that file_get_html() and str_get_html() will parse;
// anything longer makes them return false. The defined() guard lets a caller set its own
// MAX_FILE_SIZE before this file is loaded.
if (!defined('MAX_FILE_SIZE')) {
define('MAX_FILE_SIZE', 600000);
}
// helper functions
// -----------------------------------------------------------------------------
// get html dom from file
// $maxLen defaults to -1, which is the value of PHP_STREAM_COPY_ALL.
/**
 * Fetch a document and parse it into a simple_html_dom.
 *
 * The content is retrieved with curl_file_get_contents(). When the stream
 * context's HTTP "header" option mentions gzip, the URL is prefixed with
 * "compress.zlib://" before it is fetched.
 *
 * NOTE(review): the status/redirect handling below relies on
 * $http_response_header being set in this function's scope. PHP only
 * populates that variable in the scope that calls the HTTP stream wrapper
 * directly; whether curl_file_get_contents() makes it visible here cannot be
 * confirmed from this file. When it is not set, those checks are skipped.
 *
 * @param string        $url              Location of the document.
 * @param bool          $use_include_path Forwarded to curl_file_get_contents().
 * @param resource|null $context          Stream context, or null for none.
 * @param int           $offset           Forwarded to curl_file_get_contents().
 * @param int           $maxLen           Accepted for signature compatibility; not used here.
 * @param bool          $lowercase        Passed to the DOM constructor and to load().
 * @param bool          $forceTagsClosed  Passed to the DOM constructor.
 * @param string        $target_charset   Passed to the DOM constructor.
 * @param bool          $stripRN          Passed to the DOM constructor and to load().
 * @param string        $defaultBRText    Passed to the DOM constructor.
 * @param string        $defaultSpanText  Passed to the DOM constructor.
 * @return simple_html_dom|false The loaded DOM, or false when the response is
 *                               not a 200, the content is empty, or it is
 *                               longer than MAX_FILE_SIZE.
 */
function file_get_html($url, $use_include_path = false, $context=null, $offset = -1, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    // We DO force the tags to be terminated.
    $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);

    // Test once (the context never changes between redirects) whether
    // "Accept-Encoding: gzip" has been set in $context.
    $wantsGzip = false;
    if ($context !== null)
    {
        $params = stream_context_get_params($context);
        if (isset($params['options']['http']['header']))
        {
            $header = $params['options']['http']['header'];
            // The "header" option may be given as a list of header lines.
            if (is_array($header))
            {
                $header = implode("\r\n", $header);
            }
            // preg_match() returns 0 for "no match" and false only on error,
            // so comparing against false would treat every header as gzip.
            $wantsGzip = (preg_match('/gzip/', (string) $header) === 1);
        }
    }

    // Bound the number of Location hops so a redirect cycle cannot spin forever.
    $redirectsLeft = 20;

    do {
        $repeat = false;
        $target = $wantsGzip ? 'compress.zlib://'.$url : $url;
        $contents = curl_file_get_contents($target, $use_include_path, $context, $offset);

        // test if the URL doesn't return a 200 status
        if (isset($http_response_header) && strpos($http_response_header[0], '200') === false) {
            // has a redirect (Location) header been sent?
            $pattern = "/^Location:\s*(.*)$/i";
            $location_headers = preg_grep($pattern, $http_response_header);
            if (!empty($location_headers) && preg_match($pattern, array_values($location_headers)[0], $matches)) {
                if ($redirectsLeft > 0) {
                    // set the URL to that returned via the redirect header and repeat this loop
                    $redirectsLeft--;
                    $url = trim($matches[1]);
                    $repeat = true;
                }
            }
        }
    } while ($repeat);

    // stop processing if the header isn't a good response
    if (isset($http_response_header) && strpos($http_response_header[0], '200') === false)
    {
        // Release the unused DOM, as str_get_html() does on its failure path.
        $dom->clear();
        return false;
    }

    // stop processing if there is nothing to parse or the contents are too big
    if (empty($contents) || strlen($contents) > MAX_FILE_SIZE)
    {
        $dom->clear();
        return false;
    }

    // The second parameter can force the selectors to all be lowercase.
    $dom->load($contents, $lowercase, $stripRN);
    return $dom;
}
// get html dom from string
/**
 * Parse an HTML string into a simple_html_dom.
 *
 * @param string $str             Markup to parse.
 * @param bool   $lowercase       Passed to the DOM constructor and to load().
 * @param bool   $forceTagsClosed Passed to the DOM constructor.
 * @param string $target_charset  Passed to the DOM constructor.
 * @param bool   $stripRN         Passed to the DOM constructor and to load().
 * @param string $defaultBRText   Passed to the DOM constructor.
 * @param string $defaultSpanText Passed to the DOM constructor.
 * @return simple_html_dom|false The loaded DOM, or false when $str is empty
 *                               (per empty()) or longer than MAX_FILE_SIZE.
 */
function str_get_html($str, $lowercase=true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    $document = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);

    // Happy path first: something to parse, and it fits the size limit.
    if (!empty($str) && strlen($str) <= MAX_FILE_SIZE) {
        $document->load($str, $lowercase, $stripRN);
        return $document;
    }

    // Rejected input: release the DOM that was created above.
    $document->clear();
    return false;
}
// dump html dom tree
/**
 * Print the tree rooted at $node by delegating to its dump() method.
 *
 * @param object $node      Node whose dump($show_attr, $deep) method is called.
 * @param bool   $show_attr Forwarded to dump(): whether attributes are printed.
 * @param int    $deep      Forwarded to dump(): starting depth.
 * @return void
 */
function dump_html_tree($node, $show_attr=true, $deep=0)
{
    // Forward the caller's options; previously the node itself was passed as
    // $show_attr and both options were silently dropped.
    $node->dump($show_attr, $deep);
}
/**
* simple html dom node
* PaperG - added ability for "find" routine to lowercase the value of the selector.
* PaperG - added $tag_start to track the start position of the tag in the total byte index
*
* @package PlaceLocalInclude
*/
class simple_html_dom_node
{
public $nodetype = HDOM_TYPE_TEXT;
public $tag = 'text';
public $attr = array();
public $children = array();
public $nodes = array();
public $parent = null;
// The "info" array - see HDOM_INFO_... for what each element contains.
public $_ = array();
public $tag_start = 0;
private $dom = null;
/**
 * Create a node and register it with the document that owns it.
 *
 * @param object $dom Owning document; the new node is appended to its $nodes list.
 */
function __construct($dom)
{
    // Register with the document, then remember it as this node's owner.
    $dom->nodes[] = $this;
    $this->dom = $dom;
}
// On destruction, release the references this node holds by calling clear().
function __destruct()
{
$this->clear();
}
// Converting a node to a string yields whatever outertext() returns.
function __toString()
{
return $this->outertext();
}
// clean up memory due to php5 circular references memory leak...
/**
 * Drop the references this node holds to its document, node list, parent
 * and children by setting each of those properties to null.
 *
 * @return void
 */
function clear()
{
    // Same properties, released in the same order as before.
    foreach (array('dom', 'nodes', 'parent', 'children') as $field) {
        $this->$field = null;
    }
}
// dump node's tree
/**
 * Echo this node and, recursively, everything in its $nodes list.
 *
 * Each node is printed on its own line: one space per depth level, the tag
 * name and, optionally, the attributes. Attribute values are read through
 * property access on the node ($this->name).
 *
 * @param bool $show_attr Whether to print the attribute list.
 * @param int  $deep      Depth of this node; controls the leading spaces.
 * @return void
 */
function dump($show_attr=true, $deep=0)
{
    $line = str_repeat(' ', $deep) . $this->tag;

    if ($show_attr && count($this->attr) > 0) {
        $line .= '(';
        foreach ($this->attr as $name => $unused) {
            $line .= "[$name]=>\"" . $this->$name . '", ';
        }
        $line .= ')';
    }

    echo $line . "\n";

    // Nothing below this node: done.
    if (!$this->nodes) {
        return;
    }

    foreach ($this->nodes as $child) {
        $child->dump($show_attr, $deep + 1);
    }
}
// Debugging function to dump a single dom node with a bunch of information about it.
/**
 * Build a one-line debug summary of this node and echo or return it.
 *
 * The summary lists the tag, the attributes, the contents of the info array
 * ($_), the text (when isset($this->text) reports one), the HDOM_INFO_INNER
 * entry of the info array, the sizes of $children and $nodes, and
 * $tag_start, followed by a newline.
 *
 * @param bool $echo When true the summary is echoed and nothing is returned;
 *                   when false the summary is returned.
 * @return string|null
 */
function dump_node($echo=true)
{
    $string = $this->tag;
    if (count($this->attr)>0)
    {
        $string .= '(';
        foreach ($this->attr as $k=>$v)
        {
            // Values are read through property access on the node.
            $string .= "[$k]=>\"".$this->$k.'", ';
        }
        $string .= ')';
    }
    if (count($this->_)>0)
    {
        $string .= ' $_ (';
        foreach ($this->_ as $k=>$v)
        {
            if (is_array($v))
            {
                $string .= "[$k]=>(";
                foreach ($v as $k2=>$v2)
                {
                    $string .= "[$k2]=>\"".$v2.'", ';
                }
                $string .= ")";
            } else {
                $string .= "[$k]=>\"".$v.'", ';
            }
        }
        $string .= ")";
    }
    if (isset($this->text))
    {
        $string .= " text: (" . $this->text . ")";
    }
    $string .= " HDOM_INNER_INFO: '";
    // Read the entry from this node; the previous code tested an undefined
    // $node variable, so the NULL branch was always taken.
    if (isset($this->_[HDOM_INFO_INNER]))
    {
        $string .= $this->_[HDOM_INFO_INNER] . "'";
    }
    else
    {
        $string .= ' NULL ';
    }
    // clear() sets these lists to null, and count(null) throws on PHP 8.
    $string .= " children: " . (is_array($this->children) ? count($this->children) : 0);
    $string .= " nodes: " . (is_array($this->nodes) ? count($this->nodes) : 0);
    $string .= " tag_start: " . $this->tag_start;
    $string .= "\n";
    if ($echo)
    {
        echo $string;
        return;
    }
    else
    {
        return $string;
    }
}
// returns the parent of node
// If a node is passed in, it will reset the parent of the current node to that one.
/**
 * Return this node's parent, optionally assigning a new one first.
 *
 * Known limitation: when a new parent is given, the node is appended to that
 * parent's $nodes and $children lists but is NOT removed from the lists of
 * its previous parent.
 *
 * @param object|null $parent New parent, or null to only read the current one.
 * @return object|null The node's parent after any reassignment.
 */
function parent($parent=null)
{
    // Pure getter when no new parent is supplied.
    if ($parent === null) {
        return $this->parent;
    }

    $this->parent = $parent;
    $parent->nodes[] = $this;
    $parent->children[] = $this;

    return $this->parent;
}
// verify that node has children
/**
 * Report whether this node has any entries in its $children list.
 *
 * @return bool False when $children is empty (per empty()), true otherwise.
 */
function has_child()
{
    if (empty($this->children)) {
        return false;
    }
    return true;
}
// returns children of node
function children($idx=-1)
{
if ($idx===-1)
{
return $this->children;
}
if (isset($this->children[$idx])) r