<?php
/**
* Website: http://sourceforge.net/projects/simplehtmldom/
* Additional projects: http://sourceforge.net/projects/debugobject/
* Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
* Contributions by:
* Yousuke Kumakura (Attribute filters)
* Vadim Voituk (Negative indexes supports of "find" method)
* Antcs (Constructor with automatically load contents either text or file/url)
*
* all affected sections have comments starting with "PaperG"
*
* Paperg - Added case insensitive testing of the value of the selector.
*
* Paperg - Added tag_start for the starting index of tags - NOTE: This works
* but not accurately. This tag_start gets counted AFTER \r\n have been crushed
* out, and after the remove_noise calls so it will not reflect the REAL
* position of the tag in the source, it will almost always be smaller by some
* amount. We use this to determine how far into the file the tag in question
* is. This "percentage" will never be accurate as the $dom->size is the "real"
* number of bytes the dom was created from. But for most purposes, it's a
* really good estimation.
*
* Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags
* closed is great for malformed html, but it CAN lead to parsing errors.
*
* Allow the user to tell us how much they trust the html.
*
* Paperg add the text and plaintext to the selectors for the find syntax.
* plaintext implies text in the innertext of a node. text implies that the
* tag is a text node. This allows for us to find tags based on the text they
* contain.
*
* Create find_ancestor_tag to see if a tag is - at any level - inside of
* another specific tag.
*
* Paperg: added parse_charset so that we know about the character set of
* the source document. NOTE: If the user's system has a routine called
* get_last_retrieve_url_contents_content_type available, we will assume it's
* returning the content-type header from the last transfer or curl_exec, and
* we will parse that and use it in preference to any other method of charset
* detection.
*
* Found infinite loop in the case of broken html in restore_noise. Rewrote to
* protect from that.
*
* PaperG (John Schlick) Added get_display_size for "IMG" tags.
*
* Licensed under The MIT License
* Redistributions of files must retain the above copyright notice.
*
* @author S.C. Chen <me578022@gmail.com>
* @author John Schlick
* @author Rus Carroll
* @version Rev. 1.8.1 (247)
* @package PlaceLocalInclude
* @subpackage simple_html_dom
*/
/**
* All of the Defines for the classes below.
* @author S.C. Chen <me578022@gmail.com>
*/
// Node type identifiers (simple_html_dom_node::$nodetype holds one of these).
define('HDOM_TYPE_ELEMENT', 1);
define('HDOM_TYPE_COMMENT', 2);
define('HDOM_TYPE_TEXT', 3);
define('HDOM_TYPE_ENDTAG', 4);
define('HDOM_TYPE_ROOT', 5);
define('HDOM_TYPE_UNKNOWN', 6);

// Quote styles. NOTE(review): presumably describe how an attribute value was
// quoted in the source markup; the value 2 is intentionally(?) unused.
define('HDOM_QUOTE_DOUBLE', 0);
define('HDOM_QUOTE_SINGLE', 1);
define('HDOM_QUOTE_NO', 3);

// Keys of the per-node "info" array (simple_html_dom_node::$_).
define('HDOM_INFO_BEGIN', 0);
define('HDOM_INFO_END', 1);
define('HDOM_INFO_QUOTE', 2);
define('HDOM_INFO_SPACE', 3);
define('HDOM_INFO_TEXT', 4);
define('HDOM_INFO_INNER', 5);
define('HDOM_INFO_OUTER', 6);
define('HDOM_INFO_ENDSPACE', 7);

// The following settings may be pre-defined by the including application;
// each default below only applies when the constant does not exist yet.

/** The default target charset */
if (!defined('DEFAULT_TARGET_CHARSET')) {
    define('DEFAULT_TARGET_CHARSET', 'UTF-8');
}

/** The default <br> text used instead of <br> tags when returning text */
if (!defined('DEFAULT_BR_TEXT')) {
    define('DEFAULT_BR_TEXT', "\r\n");
}

/** The default <span> text used instead of <span> tags when returning text */
if (!defined('DEFAULT_SPAN_TEXT')) {
    define('DEFAULT_SPAN_TEXT', ' ');
}

/** The maximum file size (in bytes) the parser should load */
if (!defined('MAX_FILE_SIZE')) {
    define('MAX_FILE_SIZE', 600000);
}

/** Contents between curly braces "{" and "}" are interpreted as text */
define('HDOM_SMARTY_AS_TEXT', 1);
// helper functions
// -----------------------------------------------------------------------------
// get html dom from file
// $maxlen is defined in the code as PHP_STREAM_COPY_ALL which is defined as -1.
/**
 * Read a file or URL with file_get_contents() and parse it into a DOM object.
 *
 * The first five parameters are forwarded to file_get_contents(); the
 * remaining ones are forwarded to the simple_html_dom constructor (and
 * $lowercase / $stripRN additionally to simple_html_dom::load()).
 *
 * @param string        $url              File name or URL to read
 * @param bool          $use_include_path Passed through to file_get_contents()
 * @param resource|null $context          Stream context, passed through to file_get_contents()
 * @param int           $offset           Read offset, passed through to file_get_contents()
 * @param int           $maxLen           Maximum number of bytes to read; any value <= 0
 *                                        (including the default -1) means MAX_FILE_SIZE
 * @param bool          $lowercase        Force selectors to lowercase
 * @param bool          $forceTagsClosed  Force tags to be terminated (helps with malformed HTML)
 * @param string        $target_charset   Target charset handed to the DOM
 * @param bool          $stripRN          Handed to the DOM; presumably strips "\r\n" before parsing
 * @param string        $defaultBRText    Text used instead of <br> tags when returning text
 * @param string        $defaultSpanText  Text used instead of <span> tags when returning text
 *
 * @return simple_html_dom|false The loaded DOM, or false when nothing usable was read
 */
function file_get_html(
    $url,
    $use_include_path = false,
    $context = null,
    $offset = 0,
    $maxLen = -1,
    $lowercase = true,
    $forceTagsClosed = true,
    $target_charset = DEFAULT_TARGET_CHARSET,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT)
{
    // Ensure maximum length is greater than zero
    if ($maxLen <= 0) {
        $maxLen = MAX_FILE_SIZE;
    }

    // We DO force the tags to be terminated.
    $dom = new simple_html_dom(
        null,
        $lowercase,
        $forceTagsClosed,
        $target_charset,
        $stripRN,
        $defaultBRText,
        $defaultSpanText);

    $contents = file_get_contents(
        $url,
        $use_include_path,
        $context,
        $offset,
        $maxLen);

    // file_get_contents() yields false on failure; an empty document is
    // rejected as well.
    if (empty($contents) || strlen($contents) > $maxLen) {
        // Release the DOM's internal references before discarding it, the
        // same way str_get_html() does on its failure path.
        $dom->clear();
        return false;
    }

    // The second parameter can force the selectors to all be lowercase.
    $dom->load($contents, $lowercase, $stripRN);

    return $dom;
}
// get html dom from string
/**
 * Parse an HTML string into a DOM object.
 *
 * All parameters after $str are forwarded to the simple_html_dom
 * constructor; $lowercase and $stripRN are also forwarded to load().
 *
 * @param string $str             Markup to parse
 * @param bool   $lowercase       Force selectors to lowercase
 * @param bool   $forceTagsClosed Force tags to be terminated
 * @param string $target_charset  Target charset handed to the DOM
 * @param bool   $stripRN         Handed to the DOM; presumably strips "\r\n" before parsing
 * @param string $defaultBRText   Text used instead of <br> tags when returning text
 * @param string $defaultSpanText Text used instead of <span> tags when returning text
 *
 * @return simple_html_dom|false The loaded DOM, or false when $str is empty
 *                               or longer than MAX_FILE_SIZE bytes
 */
function str_get_html(
    $str,
    $lowercase = true,
    $forceTagsClosed = true,
    $target_charset = DEFAULT_TARGET_CHARSET,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT)
{
    $parser = new simple_html_dom(
        null,
        $lowercase,
        $forceTagsClosed,
        $target_charset,
        $stripRN,
        $defaultBRText,
        $defaultSpanText);

    // Empty input (per empty(), so "0" counts too) and oversized input are
    // both refused.
    $rejected = empty($str) || strlen($str) > MAX_FILE_SIZE;

    if (!$rejected) {
        $parser->load($str, $lowercase, $stripRN);
        return $parser;
    }

    // Drop the parser's internal references before giving up.
    $parser->clear();
    return false;
}
// dump html dom tree
/**
 * Print the tree below $node by delegating to the node's dump() method.
 *
 * @param object $node      Node to dump; must provide dump($show_attr, $deep)
 * @param bool   $show_attr Whether attributes are printed next to each tag
 * @param int    $deep      Nesting depth to start at (controls the indentation)
 *
 * @return void
 */
function dump_html_tree($node, $show_attr = true, $deep = 0)
{
    // Forward the caller's options. Previously the node itself was passed as
    // the first argument, so $show_attr and $deep were silently ignored.
    $node->dump($show_attr, $deep);
}
/**
* simple html dom node
* PaperG - added ability for "find" routine to lowercase the value of the
* selector.
*
* PaperG - added $tag_start to track the start position of the tag in the total
* byte index
*
* @package PlaceLocalInclude
*/
class simple_html_dom_node
{
/**
* Node type
*
* One of the HDOM_TYPE_* constants. Default is {@see HDOM_TYPE_TEXT}
*
* @var int
*/
public $nodetype = HDOM_TYPE_TEXT;
/**
* Tag name
*
* Default is 'text'
*
* @var string
*/
public $tag = 'text';
/**
* List of attributes
*
* Iterated as key => value by dump() and dump_node(); the keys are used as
* attribute names.
*
* @var array
*/
public $attr = array();
/**
* List of child node objects
*
* Set to null by clear().
*
* @var array|null
*/
public $children = array();
/**
* List of node objects that dump() descends into recursively
*
* NOTE(review): presumably holds every child node (including text nodes),
* whereas $children holds element children only - confirm against the parser.
* Set to null by clear().
*
* @var array|null
*/
public $nodes = array();
/**
* The parent node object
*
* @var object|null
*/
public $parent = null;
/**
* The "info" array - see the HDOM_INFO_* constants for what each element
* contains.
*
* @var array
*/
public $_ = array();
/**
* Start position of the tag in the document
*
* According to the file header this is counted after "\r\n" has been stripped
* and noise removed, so it is only an approximation of the real byte offset.
*
* @var int
*/
public $tag_start = 0;
/**
* The DOM object
*
* Assigned in the constructor and reset to null by clear().
*
* @var object|null
*/
private $dom = null;
/**
* Construct new node object
*
* Remembers the owning DOM and adds itself to the list of DOM Nodes
* {@see simple_html_dom::$nodes}
*
* @param object $dom The DOM object this node belongs to; its public $nodes
* list receives the new node
*/
function __construct($dom)
{
$this->dom = $dom;
// Register this node with the DOM so the DOM holds a reference to it.
$dom->nodes[] = $this;
}
/**
* Drop all object references held by this node when it is destroyed.
*/
function __destruct()
{
$this->clear();
}
/**
* String conversion: a node cast to string yields whatever outertext()
* returns (outertext() is defined elsewhere in this class).
*
* @return string
*/
function __toString()
{
return $this->outertext();
}
/**
* Clean up memory: break the references to the DOM, the parent and the child
* nodes, working around the PHP 5 memory leak with circular references.
*
* After this call $nodes and $children are null (not empty arrays), and the
* node no longer knows its DOM or parent. $attr and $_ are left untouched.
*
* @return void
*/
function clear()
{
$this->dom = null;
$this->nodes = null;
$this->parent = null;
$this->children = null;
}
// dump node's tree
/**
 * Echo this node's tag and, recursively, the tags of everything in $nodes,
 * one per line, indented by one space per nesting level.
 *
 * With $show_attr enabled, a node that has attributes is followed by
 * `([name]=>"value", ...)`, where each value is read through `$this->name`.
 *
 * @param bool $show_attr Whether to print the attributes of each node
 * @param int  $deep      Current nesting depth (number of leading spaces)
 *
 * @return void
 */
function dump($show_attr = true, $deep = 0)
{
    $indent = str_repeat(' ', $deep);
    echo $indent . $this->tag;

    if ($show_attr && count($this->attr) > 0) {
        echo '(';
        foreach ($this->attr as $name => $unused) {
            // The value is fetched via property access on the node, not
            // taken from the array element.
            echo '[' . $name . ']=>"' . $this->$name . '", ';
        }
        echo ')';
    }

    echo "\n";

    // Leaf node (or a node already emptied by clear()): nothing to descend into.
    if (!$this->nodes) {
        return;
    }

    $childDepth = $deep + 1;
    foreach ($this->nodes as $child) {
        $child->dump($show_attr, $childDepth);
    }
}
// Debugging function to dump a single dom node with a bunch of information about it.
function dump_node($echo = true)
{
$string = $this->tag;
if (count($this->attr) > 0) {
$string .= '(';
foreach ($this->attr as $k => $v) {
$string
没有合适的资源?快使用搜索试试~ 我知道了~
资源推荐
资源详情
资源评论
收起资源包目录
simplehtmldom_1_8_1.zip (23个子文件)
phpcompatibility.xml 454B
example
example_callback.php 577B
example_basic_selector.php 940B
simple_html_dom_utility.php 893B
example_extract_html.php 106B
example_advanced_selector.php 1KB
scraping
example_scraping_imdb.php 1KB
example_scraping_digg.php 1016B
example_scraping_slashdot.php 795B
example_scraping_general.php 1KB
example_modify_contents.php 361B
manual
img
tab.png 734B
js
jquery-1.2.3.pack.js 29KB
ui.tabs.pack.js 6KB
manual_api.htm 11KB
manual_faq.htm 5KB
index.htm 6KB
manual.htm 24KB
css
default.css 2KB
ui.tabs.css 4KB
simple_html_dom.php 73KB
phpcs.xml 2KB
CHANGELOG.md 9KB
共 23 条
- 1
资源评论
humdark
- 粉丝: 4
- 资源: 4
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功