/*
** HTML Tree
** html.c
**
** Copyright (C) 1999 Paul J. Lucas
**
** This program is free software; you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation; either version 2 of the License, or
** (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program; if not, write to the Free Software
** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
// standard
#include <cctype>
#include <cstring>
#include <string>
// local
#include "Comment_Node.h"
#include "Content_Node.h"
#include "Element_Node.h"
#include "fake_ansi.h"
#include "html.h"
#include "Text_Node.h"
#include "util.h"
#ifndef PJL_NO_NAMESPACES
using namespace std;
namespace HTML_Tree {
#endif
//
// The maximum size of an HTML tag name, e.g., "BLOCKQUOTE" (10 characters).
// You might need to increase this if you are indexing HTML documents that
// contain non-standard tags and at least one of them is longer than this.
//
int const Tag_Name_Max_Size = 10;
//
// Forward declarations of the local parsing helpers called by html_parse().
// Each receives the current parse position by reference, so the caller sees
// how far the helper advanced; 'end' is one past the last character of the
// buffer.  (skip_html_tag() is not defined in the visible part of this file;
// it is used by html_parse() to step over "<!..." declarations.)
//
void parse_html_comment( char const *&pos, char const *end );
void parse_html_tag( char const *&pos, char const *end, Content_Node*& );
void skip_html_tag( char const *&pos, char const *end );
//*****************************************************************************
//
// SYNOPSIS
//
Root_Node* html_parse(
    char const *c, char const *end,
    bool include_comments
)
//
// DESCRIPTION
//
//      Parse the HTML file to build a tree.  The buffer is scanned once and
//      split into three kinds of tokens:
//
//      1. Tags ("<" not followed by "!"), which are handed to
//         parse_html_tag() together with the current node so that it can
//         open and close elements.
//
//      2. Comments ("<!--" ... "-->"), which become Comment_Nodes only if
//         requested.
//
//      3. Everything else -- ordinary text as well as "<!..." declarations
//         such as "<!DOCTYPE ...>" -- which becomes a Text_Node.
//
//      Input that is truncated immediately after "<" or "<!" is preserved
//      as text rather than being discarded, which matches how a truncated
//      "<!DOCTYPE" is already handled.
//
// PARAMETERS
//
//      c                   The pointer to the beginning of the buffer.
//
//      end                 The pointer to one past the end of the buffer.
//
//      include_comments    If true, a Comment_Node is added to the tree for
//                          every comment; if false, comments are parsed
//                          past and left out of the tree.
//
// RETURN VALUE
//
//      Returns a pointer to the root node of the tree, or null if the root
//      node ended up empty (e.g., for an empty buffer).
//
//      NOTE(review): the root is allocated with new and handed back as a raw
//      pointer; presumably the caller is responsible for deleting it.
//
//*****************************************************************************
{
    Root_Node *const root_node = new Root_Node;
    Content_Node *cur_node = root_node;

    while ( c != end ) {
        char const *const token_begin = c;

        if ( *c++ == '<' ) {
            if ( c != end && *c != '!' ) {
                //
                // It must be an HTML tag.
                //
                parse_html_tag( c, end, cur_node );
                continue;
            }
            //
            // Here, either the buffer ended right after the '<' or 'c'
            // is positioned at the '!'.  Only look further if there is
            // at least one character after the '!'.
            //
            if ( c != end && ++c != end ) {
                if ( *c == '-' && c + 1 != end && c[1] == '-' ) {
                    //
                    // It's a comment: skip the "--" and scan for the
                    // closing "-->".
                    //
                    parse_html_comment( c += 2, end );
                    //
                    // NOTE(review): the new node is not stored here;
                    // presumably its constructor attaches it to
                    // cur_node -- confirm against Comment_Node.
                    //
                    if ( include_comments )
                        new Comment_Node( token_begin, c, cur_node );
                    continue;
                }
                //
                // This is something like <!DOCTYPE: step over it and
                // keep its characters as text.
                //
                skip_html_tag( c, end );
            }
        } else {
            //
            // Collect a run of text: stop at the potential beginning of
            // the next HTML tag or at the end of the buffer.
            //
            while ( c != end && *c != '<' )
                ++c;
        }

        //
        // Whatever was consumed since token_begin and was neither a tag
        // nor a comment becomes a Text_Node.
        //
        new Text_Node( token_begin, c, cur_node );
    }

    if ( root_node->empty() ) {
        delete root_node;
        return 0;
    }
    return root_node;
}
//*****************************************************************************
//
// SYNOPSIS
//
void parse_attributes(
    char const *c, char const *end,
    Element_Node::attribute_map &attributes
)
//
// DESCRIPTION
//
//      Parse out all the attributes and their values of an HTML element.
//      Attribute names are stored in lower case.  An attribute without a
//      value (a Boolean attribute) gets its own name as its value.  If the
//      same attribute occurs more than once, the last occurrence wins.
//
//      All characters passed to the <cctype> classification functions are
//      first converted to unsigned char: passing a negative char (any byte
//      >= 0x80 where char is signed) to them is undefined behavior.
//
// PARAMETERS
//
//      c           The iterator marking the beginning of where to start
//                  parsing; it is expected to be positioned at the element
//                  name, which is skipped.
//
//      end         The iterator marking the end of where to stop parsing
//                  (usually positioned at the closing '>' character of the
//                  HTML tag).
//
//      attributes  Where to deposit the attributes.
//
// SEE ALSO
//
//      Dave Raggett, Arnaud Le Hors, and Ian Jacobs.  "On SGML and HTML: SGML
//      constructs used in HTML: Attributes," HTML 4.0 Specification, section
//      3.2.2, World Wide Web Consortium, April 1998.
//          http://www.w3.org/TR/PR-html40/intro/sgmltut.html#h-3.2.2
//
//*****************************************************************************
{
    //
    // Skip the element name: this leaves 'c' just past the first whitespace
    // character following it (or at 'end' if there is none).
    //
    while ( c != end && !is_space( *c++ ) )
        ;

    while ( c != end ) {
        if ( !isalpha( static_cast<unsigned char>( *c ) ) ) {
            ++c;                        // not the start of a name
            continue;
        }

        //
        // We just found the start of a potential attribute name: now try
        // to find its end by stopping at one of 'end', whitespace, or '='.
        // To be robust, we allow whitespace around the '='.
        //
        char const *const name_begin = c;
        while ( c != end && *c != '=' &&
                !isspace( static_cast<unsigned char>( *c ) ) )
            ++c;
        char const *const name_end = c;
        while ( c != end && isspace( static_cast<unsigned char>( *c ) ) )
            ++c;

        string const name( to_lower( name_begin, name_end ) );

        if ( c == end || *c != '=' ) {
            //
            // It's a Boolean attribute: set its value to be its own name
            // (per the HTML 4.0 spec).  If we ran into 'end', the loop
            // condition terminates the scan.
            //
            attributes[ name ] = name;
            continue;
        }

        //
        // Here, 'c' is positioned at the '=': step over it and any
        // whitespace that follows.  An '=' with nothing after it yields no
        // attribute.
        //
        while ( ++c != end && isspace( static_cast<unsigned char>( *c ) ) )
            ;
        if ( c == end )
            break;

        //
        // Determine the span of the attribute's value: if it started with
        // a quote, it's terminated only by the matching closing quote; if
        // not, it's terminated by a whitespace character (or running into
        // 'end').
        //
        // This is more lenient than the HTML 4.0 specification in that it
        // allows non-quoted values to contain characters other than the
        // set [A-Za-z0-9.-], i.e., any non-whitespace character.
        //
        char const quote = ( *c == '"' || *c == '\'' ) ? *c : 0;
        if ( quote && ++c == end )
            break;                      // opening quote was the last char

        char const *const value_begin = c;
        for ( ; c != end; ++c ) {
            if ( quote ) {              // stop at matching quote only
                if ( *c == quote )
                    break;
            } else if ( isspace( static_cast<unsigned char>( *c ) ) )
                break;                  // stop at whitespace
        }
        attributes[ name ] = string( value_begin, c );

        if ( c == end )
            break;
        ++c;                            // step over the terminator
    }
}
//*****************************************************************************
//
// SYNOPSIS
//
void parse_html_comment( char const *&c, char const *end )
//
// DESCRIPTION
//
//      Skip past an HTML comment by scanning for the closing "-->" character
//      sequence.  Unlike skipping an ordinary HTML tag, quotes are not
//      significant and no attempt must be made either to "balance" them or to
//      ignore what is in between them.  The HTML specification permits
//      whitespace between the "--" and the ">" (for some strange reason).
//
//      This function is more lenient than the HTML 4.0 specification in that
//      it allows for a string of hyphens within a comment since this is common
//      in practice; the specification considers this to be an error.
//
// PARAMETERS
//
//      c       The iterator to use.  It is presumed to start anywhere after
//              the initial "<!--"; it is left positioned after the "-->", or
//              at 'end' if the comment is never closed.
//
//      end     The pointer to the end of the file.
//
// SEE ALSO
//
//      Dave Raggett, Arnaud Le Hors, and Ian Jacobs.  "On SGML and HTML: SGML
//      constructs used in HTML: Comments," HTML 4.0 Specification, section
//      3.2.4, World Wide Web Consortium, April 1998.
//          http://www.w3.org/TR/PR-html40/intro/sgmltut.html#h-3.2.4
//
//*****************************************************************************
{
    while ( c != end ) {
        //
        // Look for two adjacent hyphens; anything else is comment content.
        //
        if ( *c++ != '-' || c == end || *c != '-' )
            continue;

        //
        // 'c' is at the second hyphen of a "--" pair.  Remember it so that,
        // if this pair turns out not to close the comment, scanning resumes
        // from here: in a run such as "--->" the second hyphen may itself
        // be the first hyphen of the real terminator.
        //
        char const *const second_hyphen = c;

        do {                            // skip whitespace after the "--"
            ++c;
        } while ( c != end && is_space( *c ) );

        if ( c == end )
            return;                     // unterminated comment
        if ( *c++ == '>' )
            return;                     // found "-->": now just past it

        c = second_hyphen;
    }
}
//*****************************************************************************
//
// SYNOPSIS
//
void parse_html_tag(
register char const *&c, register char const *end,
Content_Node *&cur_node
)
//
// DESCRIPTION
//
// This function does everything skip_html_tag() does but additionally
// builds a DOM-like (Document Object Model) tree of nodes. It does this
// by knowing when to end elements.
//
// PARAMETERS
//
// c The iterator to use. It must be positioned at the
// character after the '<'; it is repositioned at the
// first character after the '>'.
//
// end The iterator marking the end of the file.
//
没有合适的资源?快使用搜索试试~ 我知道了~
资源推荐
资源详情
资源评论
收起资源包目录
HTML_Tree-2.4.3.rar (77个子文件)
HTML_Tree-2.4.3
version.h 958B
html.h 1KB
config
GNUmakefile 1KB
config-sh 3KB
man.mk 3KB
src
explicit.c 215B
namespaces.c 362B
sstream.c 121B
config.mk 4KB
GNUmakefile 2KB
prettyhtml.c 4KB
fake_ansi.h 2KB
man
man3
GNUmakefile 897B
element.3 2KB
Comment_Node.3 2KB
HTML_Node.3 9KB
Content_Node.3 7KB
Text_Node.3 3KB
Element_Node.3 4KB
GNUmakefile 981B
Element_Node.h 2KB
util.h 2KB
test.html 464B
mmap_file.h 4KB
html.c 12KB
Text_Node.h 2KB
HTML_Node.c 8KB
my_set.h 2KB
HTML_Node_iterator.c 3KB
elements.h 3KB
README 767B
mmap_file.c 5KB
INSTALL.README 3KB
util.c 3KB
Element_Node.c 5KB
install-sh 3KB
mod
HTML
MANIFEST 350B
Makefile.PL 849B
Tree
Tree.pm 15KB
perl_predicate.c 2KB
perl_predicate.h 2KB
blessed.h 2KB
typemap 2KB
MANIFEST 234B
util.h 3KB
container4perl.h 2KB
array2tree.c 4KB
perl_visitor.h 2KB
string4perl.h 1KB
Tree.xs 19KB
Makefile.PL 1KB
array_visitor.c 5KB
perl_visitor.c 4KB
array_visitor.h 2KB
test.pl 9KB
sv2tree.c 2KB
managed_ptr.h 3KB
Apache
MANIFEST 166B
HTML
MANIFEST 133B
ClassParser
MANIFEST 52B
ClassParser.pm 38KB
Makefile.PL 922B
test.pl 263B
Changes 138B
Makefile.PL 904B
Makefile.PL 880B
Content_Node.c 8KB
Copying 18KB
less.h 2KB
Comment_Node.h 2KB
Content_Node.h 4KB
elements.c 9KB
HTML_Tree.h 1016B
Changes 36KB
HTML_Node.h 3KB
Text_Node.c 3KB
HTML_Node_iterator.h 4KB
共 77 条
- 1
资源评论
APei
- 粉丝: 64
- 资源: 1万+
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功