package org.jsoup.parser;
import org.jsoup.helper.StringUtil;
import org.jsoup.nodes.*;
import java.util.ArrayList;
/**
* The Tree Builder's current state. Each state embodies the processing for the state, and transitions to other states.
*/
enum HtmlTreeBuilderState {
Initial {
    /**
     * Handles the tokens seen before anything else in the document. Whitespace is ignored, comments
     * are inserted, and a doctype token becomes a {@link DocumentType} node appended to the document.
     * Every other token moves the builder to {@code BeforeHtml} and is processed again from there.
     *
     * @param t the token to handle
     * @param tb the tree builder being driven
     * @return {@code true} if the token was consumed here, otherwise whatever re-processing returns
     */
    boolean process(Token t, HtmlTreeBuilder tb) {
        if (isWhitespace(t)) {
            return true; // ignore whitespace
        }
        if (t.isComment()) {
            tb.insert(t.asComment());
            return true;
        }
        if (!t.isDoctype()) {
            // todo: check not iframe srcdoc
            tb.transition(BeforeHtml);
            return tb.process(t); // re-process the same token in the new state
        }

        // todo: parse error check on expected doctypes
        // todo: quirk state check on doctype ids
        Token.Doctype doctypeToken = t.asDoctype();
        String normalizedName = tb.settings.normalizeTag(doctypeToken.getName());
        DocumentType doctypeNode = new DocumentType(
            normalizedName,
            doctypeToken.getPubSysKey(),
            doctypeToken.getPublicIdentifier(),
            doctypeToken.getSystemIdentifier(),
            tb.getBaseUri());
        tb.getDocument().appendChild(doctypeNode);
        if (doctypeToken.isForceQuirks()) {
            tb.getDocument().quirksMode(Document.QuirksMode.quirks);
        }
        tb.transition(BeforeHtml);
        return true;
    }
},
BeforeHtml {
    /**
     * Handles tokens that arrive before an {@code html} element has been inserted.
     * <ul>
     * <li>A doctype is reported as an error and rejected.</li>
     * <li>A comment is inserted; whitespace is ignored.</li>
     * <li>An {@code html} start tag is inserted and the builder moves to {@code BeforeHead}.</li>
     * <li>An end tag other than {@code head}, {@code body}, {@code html} or {@code br} is reported as
     * an error and rejected.</li>
     * <li>Everything else causes an {@code html} start tag to be inserted first, after which the token
     * is processed again.</li>
     * </ul>
     *
     * @param t the token to handle
     * @param tb the tree builder being driven
     * @return {@code false} when the token was rejected with an error, {@code true} when it was
     * consumed here, otherwise the result of re-processing it
     */
    boolean process(Token t, HtmlTreeBuilder tb) {
        if (t.isDoctype()) {
            tb.error(this);
            return false;
        }
        if (t.isComment()) {
            tb.insert(t.asComment());
            return true;
        }
        if (isWhitespace(t)) {
            return true; // ignore whitespace
        }
        if (t.isStartTag() && t.asStartTag().normalName().equals("html")) {
            tb.insert(t.asStartTag());
            tb.transition(BeforeHead);
            return true;
        }
        if (t.isEndTag()) {
            boolean impliesHtml = StringUtil.in(t.asEndTag().normalName(), "head", "body", "html", "br");
            if (!impliesHtml) {
                tb.error(this);
                return false;
            }
        }
        return insertHtmlAndReprocess(t, tb);
    }

    /**
     * Inserts an {@code html} start tag, switches to {@code BeforeHead}, and processes the token again.
     *
     * @param t the token that triggered the insertion
     * @param tb the tree builder being driven
     * @return the result of re-processing {@code t}
     */
    private boolean insertHtmlAndReprocess(Token t, HtmlTreeBuilder tb) {
        tb.insertStartTag("html");
        tb.transition(BeforeHead);
        return tb.process(t);
    }
},
BeforeHead {
    /**
     * Handles tokens that arrive after the {@code html} element but before a {@code head} element.
     * <ul>
     * <li>Whitespace is ignored; a comment is inserted.</li>
     * <li>A doctype is reported as an error and rejected.</li>
     * <li>An {@code html} start tag is delegated to {@code InBody} without changing state.</li>
     * <li>A {@code head} start tag is inserted, recorded as the head element, and the builder moves to
     * {@code InHead}.</li>
     * <li>An end tag other than {@code head}, {@code body}, {@code html} or {@code br} is reported as
     * an error and rejected.</li>
     * <li>Everything else causes a {@code head} start tag to be processed first, after which the token
     * is processed again.</li>
     * </ul>
     *
     * @param t the token to handle
     * @param tb the tree builder being driven
     * @return {@code false} when the token was rejected with an error, {@code true} when it was
     * consumed here, otherwise the result of the delegated or repeated processing
     */
    boolean process(Token t, HtmlTreeBuilder tb) {
        if (isWhitespace(t)) {
            return true;
        }
        if (t.isComment()) {
            tb.insert(t.asComment());
            return true;
        }
        if (t.isDoctype()) {
            tb.error(this);
            return false;
        }
        if (t.isStartTag()) {
            String tagName = t.asStartTag().normalName();
            if (tagName.equals("html")) {
                return InBody.process(t, tb); // does not transition
            }
            if (tagName.equals("head")) {
                Element headElement = tb.insert(t.asStartTag());
                tb.setHeadElement(headElement);
                tb.transition(InHead);
                return true;
            }
        }
        if (t.isEndTag() && !StringUtil.in(t.asEndTag().normalName(), "head", "body", "html", "br")) {
            tb.error(this);
            return false;
        }
        // Remaining tokens: process a head start tag first, then run this token again.
        tb.processStartTag("head");
        return tb.process(t);
    }
},
InHead {
    /**
     * Handles tokens while the builder is inside the {@code head} element. Whitespace and comments are
     * inserted, a doctype is reported as an error and rejected, start and end tags are dispatched to
     * their own handlers, and any other token type closes the head and is processed again.
     *
     * @param t the token to handle
     * @param tb the tree builder being driven
     * @return {@code false} when the token was rejected with an error, {@code true} when it was
     * consumed here, otherwise the result of the delegated or repeated processing
     */
    boolean process(Token t, HtmlTreeBuilder tb) {
        if (isWhitespace(t)) {
            tb.insert(t.asCharacter());
            return true;
        }
        switch (t.type) {
            case Comment:
                tb.insert(t.asComment());
                return true;
            case Doctype:
                tb.error(this);
                return false;
            case StartTag:
                return inHeadStartTag(t, tb);
            case EndTag:
                return inHeadEndTag(t, tb);
            default:
                return anythingElse(t, tb);
        }
    }

    /**
     * Handles a start tag seen inside the head.
     *
     * @param t the start tag token
     * @param tb the tree builder being driven
     * @return {@code false} for a nested {@code head} start tag (reported as an error), {@code true}
     * when the tag was handled here, otherwise the result of the delegated or repeated processing
     */
    private boolean inHeadStartTag(Token t, HtmlTreeBuilder tb) {
        Token.StartTag start = t.asStartTag();
        String name = start.normalName();
        if (name.equals("html")) {
            return InBody.process(t, tb);
        } else if (StringUtil.in(name, "base", "basefont", "bgsound", "command", "link")) {
            Element el = tb.insertEmpty(start);
            // jsoup special: update base the first time it is seen
            if (name.equals("base") && el.hasAttr("href"))
                tb.maybeSetBaseUri(el);
        } else if (name.equals("meta")) {
            tb.insertEmpty(start);
            // todo: charset switches
        } else if (name.equals("title")) {
            handleRcData(start, tb);
        } else if (StringUtil.in(name, "noframes", "style")) {
            handleRawtext(start, tb);
        } else if (name.equals("noscript")) {
            // else if noscript && scripting flag = true: rawtext (jsoup doesn't run script, to handle as noscript)
            tb.insert(start);
            tb.transition(InHeadNoscript);
        } else if (name.equals("script")) {
            // skips some script rules as won't execute them
            tb.tokeniser.transition(TokeniserState.ScriptData);
            tb.markInsertionMode();
            tb.transition(Text);
            tb.insert(start);
        } else if (name.equals("head")) {
            tb.error(this);
            return false;
        } else {
            return anythingElse(t, tb);
        }
        return true;
    }

    /**
     * Handles an end tag seen inside the head. A {@code head} end tag pops the builder's stack and
     * moves to {@code AfterHead}; {@code body}, {@code html} and {@code br} end tags close the head
     * and are processed again; any other end tag is reported as an error and rejected.
     *
     * @param t the end tag token
     * @param tb the tree builder being driven
     * @return {@code false} when the tag was rejected with an error, {@code true} when it was handled
     * here, otherwise the result of re-processing it
     */
    private boolean inHeadEndTag(Token t, HtmlTreeBuilder tb) {
        String name = t.asEndTag().normalName();
        if (name.equals("head")) {
            tb.pop();
            tb.transition(AfterHead);
            return true;
        }
        if (StringUtil.in(name, "body", "html", "br")) {
            return anythingElse(t, tb);
        }
        tb.error(this);
        return false;
    }

    /**
     * Processes a {@code head} end tag and then processes the token again.
     *
     * @param t the token that could not be handled inside the head
     * @param tb the tree builder being driven
     * @return the result of re-processing {@code t}
     */
    private boolean anythingElse(Token t, HtmlTreeBuilder tb) {
        tb.processEndTag("head");
        return tb.process(t);
    }
},
InHeadNoscript {
boolean process(Token t, HtmlTreeBuilder tb) {
if (t.isDoctype()) {
tb.error(this);
} else if (t.isStartTag() && t.asStartTag().normalName().equals("html")) {
return tb.process(t, InBody);
} else if (t.isEndTag() && t.asEndTag().normalName().equals("noscript")) {
tb.pop();
tb.transition(InHead);
} else if (isWhitespace(t) || t.isComment() || (t.isStartTag() && StringUtil.in(t.asStartTag().normalName(),
"basefont", "bgsound", "link", "meta", "noframes", "style"))) {
return tb.process(t, InHead);
} else if (t.isEndTag() && t.asEndTag().normalName().equals("br")) {
return anythingElse(t, tb);
} else if ((t.isStartTag() && StringUtil.in(t.asStartTag().normalName(), "head", "noscript")) || t.isEndTag()) {
tb.error(this);
return false;
} else {
return anythingElse(t, tb);
}
return true;
}
private boolean anythingElse(Token t, HtmlTreeBuilder tb) {
tb.error(this);
tb.insert(new Token.Character().data(t.toStrin