/*
* @(#)ParserImpl.java 1.11 2000/08/16
*
*/
package org.w3c.tidy;
/**
*
* HTML Parser implementation
*
* (c) 1998-2000 (W3C) MIT, INRIA, Keio University
* See Tidy.java for the copyright notice.
* Derived from <a href="http://www.w3.org/People/Raggett/tidy">
* HTML Tidy Release 4 Aug 2000</a>
*
* @author Dave Raggett <dsr@w3.org>
* @author Andy Quick <ac.quick@sympatico.ca> (translation to Java)
* @version 1.0, 1999/05/22
* @version 1.0.1, 1999/05/29
* @version 1.1, 1999/06/18 Java Bean
* @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
* @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
* @version 1.4, 1999/09/04 DOM support
* @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
* @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
* @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
* @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
* @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
* @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
* @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
*/
public class ParserImpl {
//private static int SeenBodyEndTag; /* AQ: moved into lexer structure */
/**
 * Hands the content of <code>node</code> over to the parser attached to
 * its tag, after adjusting the lexer's whitespace flags.
 *
 * <ul>
 * <li>If the tag's model lacks <code>Dict.CM_INLINE</code>,
 *     <code>lexer.insertspace</code> is cleared.</li>
 * <li>If the tag's model has <code>Dict.CM_EMPTY</code>,
 *     <code>lexer.waswhite</code> is cleared and nothing is parsed.</li>
 * <li>Otherwise <code>node.tag.parser</code> is invoked, unless it is
 *     <code>null</code> or the node is a <code>Node.StartEndTag</code>.</li>
 * </ul>
 *
 * @param lexer the lexer whose flags are updated and which is passed on
 *              to the tag's parser
 * @param node  the element whose content is to be parsed
 * @param mode  the mode value forwarded unchanged to the tag's parser
 */
private static void parseTag(Lexer lexer, Node node, short mode)
{
    // Local fix by GLP 2000-12-21: insertspace must also be reset for tags
    // that are both non-inline and empty (base, link, meta, isindex, hr,
    // area). The original code returned for CM_EMPTY tags *before* looking
    // at CM_INLINE, so those tags never reset insertspace. Restore the
    // original ordering once the fix is made in Tidy itself.
    final boolean inlineTag = (node.tag.model & Dict.CM_INLINE) != 0;
    if (!inlineTag)
    {
        lexer.insertspace = false;
    }

    final boolean emptyTag = (node.tag.model & Dict.CM_EMPTY) != 0;
    if (emptyTag)
    {
        // Empty elements have no content to parse.
        lexer.waswhite = false;
        return;
    }

    // Only dispatch when the tag has a parser and the node is not a
    // StartEndTag.
    if (node.tag.parser != null && node.type != Node.StartEndTag)
    {
        node.tag.parser.parse(lexer, node, mode);
    }
}
/**
 * Moves a start tag that turned up inside <code>element</code> into the
 * document's head element.
 *
 * For a <code>Node.StartTag</code> or <code>Node.StartEndTag</code> this
 * reports <code>Report.TAG_NOT_ALLOWED_IN</code>, climbs from
 * <code>element</code> through its parents to the node whose tag is
 * <code>tt.tagHtml</code>, appends <code>node</code> to the first child of
 * that node whose tag is <code>tt.tagHead</code> (if there is one), and
 * then parses <code>node</code> when its tag has a parser. Any other node
 * type is only reported with <code>Report.DISCARDING_UNEXPECTED</code>.
 *
 * NOTE(review): the upward walk has no null check, so it relies on
 * <code>element</code> having an ancestor (or being itself) tagged html.
 *
 * @param lexer   the lexer supplying the tag table and used for reporting
 * @param element the element in which <code>node</code> was encountered
 * @param node    the misplaced node
 */
private static void moveToHead(Lexer lexer, Node element, Node node)
{
    TagTable tags = lexer.configuration.tt;

    boolean isStart = node.type == Node.StartTag || node.type == Node.StartEndTag;
    if (!isStart)
    {
        Report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
        return;
    }

    Report.warning(lexer, element, node, Report.TAG_NOT_ALLOWED_IN);

    // Climb to the enclosing html element.
    Node root = element;
    while (root.tag != tags.tagHtml)
    {
        root = root.parent;
    }

    // Find the first head child of html, if any, and append the node to it.
    Node candidate = root.content;
    while (candidate != null && candidate.tag != tags.tagHead)
    {
        candidate = candidate.next;
    }
    if (candidate != null)
    {
        Node.insertNodeAtEnd(candidate, node);
    }

    // The node is parsed whether or not a head element was found.
    if (node.tag.parser != null)
    {
        parseTag(lexer, node, Lexer.IgnoreWhitespace);
    }
}
public static class ParseHTML implements Parser {
public void parse( Lexer lexer, Node html, short mode )
{
Node node, head;
Node frameset = null;
Node noframes = null;
lexer.configuration.XmlTags = false;
lexer.seenBodyEndTag = 0;
TagTable tt = lexer.configuration.tt;
for (;;)
{
node = lexer.getToken(Lexer.IgnoreWhitespace);
if (node == null)
{
node = lexer.inferredTag("head");
break;
}
if (node.tag == tt.tagHead)
break;
if (node.tag == html.tag && node.type == Node.EndTag)
{
Report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
continue;
}
/* deal with comments etc. */
if (Node.insertMisc(html, node))
continue;
lexer.ungetToken();
node = lexer.inferredTag("head");
break;
}
head = node;
Node.insertNodeAtEnd(html, head);
getParseHead().parse(lexer, head, mode);
for (;;)
{
node = lexer.getToken(Lexer.IgnoreWhitespace);
if (node == null)
{
if (frameset == null) /* create an empty body */
node = lexer.inferredTag("body");
return;
}
/* robustly handle html tags */
if (node.tag == html.tag)
{
if (node.type != Node.StartTag && frameset == null)
Report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
continue;
}
/* deal with comments etc. */
if (Node.insertMisc(html, node))
continue;
/* if frameset document coerce <body> to <noframes> */
if (node.tag == tt.tagBody)
{
if (node.type != Node.StartTag)
{
Report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
continue;
}
if (frameset != null)
{
lexer.ungetToken();
if (noframes == null)
{
noframes = lexer.inferredTag("noframes");
Node.insertNodeAtEnd(frameset, noframes);
Report.warning(lexer, html, noframes, Report.INSERTING_TAG);
}
parseTag(lexer, noframes, mode);
continue;
}
break; /* to parse body */
}
/* flag an error if we see more than one frameset */
if (node.tag == tt.tagFrameset)
{
if (node.type != Node.StartTag)
{
Report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
continue;
}
if (frameset != null)
Report.error(lexer, html, node, Report.DUPLICATE_FRAMESET);
else
frameset = node;
Node.insertNodeAtEnd(html, node);
parseTag(lexer, node, mode);
/*
see if it includes a noframes element so
that we can merge subsequent noframes elements
*/
for (node = frameset.content; node != null; node = node.next)
{
if (node.tag == tt.tagNoframes)
noframes = node;
}
continue;
}
/* if not a frameset document coerce <noframes> to <body> */
if (node.tag == tt.tagNoframes)
{
if (node.type != Node.StartTag)
{
Report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
continue;
}
if (frameset == null)
{
Report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
node = lexer.inferredTag("body");
break;
}
if (noframes == null)
{
noframes = node;
Node.insertNodeAtEnd(frameset, noframes);
}
parseTag(lexer, noframes, mode);
没有合适的资源?快使用搜索试试~ 我知道了~
jtidy-04aug2000r7-dev.zip_ jtidy-r938-sources_JTidy-lizi_jtidy _
共214个文件
html:118个
java:80个
txt:3个
1.该资源内容由用户上传,如若侵权请联系客服进行举报
2.虚拟产品一经售出概不退款(资源遇到问题,请及时私信上传者)
2.虚拟产品一经售出概不退款(资源遇到问题,请及时私信上传者)
版权申诉
0 下载量 140 浏览量
2022-09-14
15:28:20
上传
评论
收藏 702KB ZIP 举报
温馨提示
JTidy的Jar包,用于清洗Html网页并可以将其转换为相应的Xml或是Xhtml文件。
资源推荐
资源详情
资源评论
收起资源包目录
jtidy-04aug2000r7-dev.zip_ jtidy-r938-sources_JTidy-lizi_jtidy _ (214个子文件)
build.bat 1KB
build.bat 1KB
stylesheet.css 1KB
Makefile.dos 36KB
jtidy.fil 9KB
tidy.gif 244B
index-all.html 251KB
Tidy.html 81KB
Report.html 56KB
Lexer.html 54KB
DOMAttrImpl.html 51KB
Node.html 49KB
Node.html 43KB
DOMNodeImpl.html 43KB
DOMDocumentImpl.html 43KB
TagTable.html 43KB
DOMElementImpl.html 42KB
Document.html 40KB
Configuration.html 38KB
Element.html 35KB
ParserImpl.html 30KB
Dict.html 28KB
PPrint.html 28KB
DOMCharacterDataImpl.html 26KB
DOMDocumentTypeImpl.html 24KB
AttributeTable.html 23KB
DOMTextImpl.html 22KB
DOMCDATASectionImpl.html 21KB
CharacterData.html 21KB
DOMProcessingInstructionImpl.html 20KB
package-summary.html 20KB
DOMCommentImpl.html 20KB
DOMException.html 19KB
DOMAttrMapImpl.html 19KB
CheckAttribsImpl.html 19KB
overview-tree.html 19KB
package-tree.html 19KB
NamedNodeMap.html 18KB
StreamIn.html 18KB
AttVal.html 18KB
Attr.html 17KB
DocumentType.html 16KB
Clean.html 16KB
AttrCheckImpl.html 14KB
Text.html 14KB
Entity.html 14KB
CDATASection.html 13KB
DOMImplementation.html 13KB
ProcessingInstruction.html 12KB
StreamInImpl.html 12KB
Notation.html 12KB
DocumentFragment.html 11KB
Attribute.html 11KB
DOMNodeListByTagNameImpl.html 11KB
Comment.html 11KB
Style.html 11KB
EntityTable.html 11KB
serialized-form.html 11KB
EntityReference.html 11KB
Out.html 10KB
StyleProp.html 10KB
DOMNodeListImpl.html 10KB
DOMExceptionImpl.html 10KB
OutImpl.html 10KB
IStack.html 10KB
allclasses-frame.html 10KB
package-summary.html 9KB
Entity.html 9KB
MutableObject.html 9KB
AttrCheckImpl.CheckScript.html 9KB
AttrCheckImpl.CheckName.html 9KB
AttrCheckImpl.CheckBool.html 9KB
AttrCheckImpl.CheckValign.html 9KB
AttrCheckImpl.CheckUrl.html 9KB
AttrCheckImpl.CheckAlign.html 9KB
AttrCheckImpl.CheckId.html 9KB
CheckAttribsImpl.CheckCaption.html 9KB
CheckAttribsImpl.CheckSCRIPT.html 9KB
CheckAttribsImpl.CheckTABLE.html 9KB
CheckAttribsImpl.CheckTableCell.html 9KB
CheckAttribsImpl.CheckSTYLE.html 9KB
CheckAttribsImpl.CheckAREA.html 9KB
CheckAttribsImpl.CheckAnchor.html 9KB
CheckAttribsImpl.CheckLINK.html 9KB
CheckAttribsImpl.CheckHTML.html 9KB
CheckAttribsImpl.CheckMap.html 9KB
ParserImpl.ParseNoFrames.html 9KB
ParserImpl.ParseColGroup.html 9KB
ParserImpl.ParseFrameSet.html 9KB
CheckAttribsImpl.CheckIMG.html 9KB
ParserImpl.ParseRowGroup.html 9KB
ParserImpl.ParseOptGroup.html 9KB
ParserImpl.ParseTableTag.html 9KB
CheckAttribsImpl.CheckHR.html 9KB
ParserImpl.ParseDefList.html 9KB
ParserImpl.ParseScript.html 9KB
ParserImpl.ParseSelect.html 9KB
ParserImpl.ParseInline.html 9KB
ParserImpl.ParseText.html 9KB
ParserImpl.ParseList.html 9KB
共 214 条
- 1
- 2
- 3
资源评论
我虽横行却不霸道
- 粉丝: 72
- 资源: 1万+
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功