import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.*;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.util.DefaultParserFeedback;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import javax.activation.DataHandler;
import javax.activation.DataSource;
import javax.activation.MimetypesFileTypeMap;
import javax.mail.Message;
import javax.mail.MessagingException;
import javax.mail.Multipart;
import javax.mail.Session;
import javax.mail.internet.InternetAddress;
import javax.mail.internet.MimeBodyPart;
import javax.mail.internet.MimeMessage;
import javax.mail.internet.MimeMultipart;
import javax.mail.internet.MimePartDataSource;
/**
*
* MHT文件解析类
*
*/
@SuppressWarnings("unchecked")
public class Html2MHTCompiler {
private URL strWeb = null;
/** 网页地址 */
private String strText = null;
/** 网页文本内容 */
private String strFileName = null;
/** 本地文件名 */
private String strEncoding = null;
/** 网页编码 */
// MHT格式附加信息
private String from = "thinkgem@gmail.com";
private String to;
private String subject;
private String cc;
private String bcc;
private String smtp = "localhost";
public static void main(String[] args) {
/* String strUrl = "http://192.168.1.2:8080/ibc/paper/?tid=29&pid=29&ptid=&s=&f=&ps=π=";
String strEncoding = "utf-8";
String strText1 = JQuery.getHtmlText(strUrl, strEncoding);
String strText2 = "<img src=\"http://www.imathas.com/cgi-bin/mimetex.cgi?sqrt{2}\"/><meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\" /><link rel=\"stylesheet\" type=\"text/css\" href=\"http://192.168.1.2:8080/ibc/theme/default/style.css\" /><P><div class=paper_list>sdfsdf<div class=paper>dfkjsldjfl<table><tr><td>abc</td><td>abc</td></tr><tr><td>abc</td><td>abc</td></tr></table></div></div><IMG SRC=\"http://192.168.1.13/cc.jpg\"/><SPAN>sdfsdf</SPAN></P><p><span style=\"font-size: 10pt; color:#f00;\"><font face=\"宋体\">在下列各溶液中,离子一定能大量共存的是<span lang=\"EN-US\"> ( )<o:p></o:p></span></font></span></p><p><font face=\"宋体\"><span lang=\"EN-US\" style=\"font-size: 10pt\">A</span><span style=\"font-size: 10pt\">.强碱性溶液中:<span lang=\"EN-US\">K</span><sup>+</sup>、<span lang=\"EN-US\">S<sup>2-</sup></span>、<span lang=\"EN-US\">ClO</span><sup>-</sup>、<span lang=\"EN-US\">SO<sub>4</sub><sup>2</sup>< /span><sup>-</sup><span lang=\"EN-US\"> <o:p></o:p></span></span></font></p><p><font face=\"宋体\"><span lang=\"EN-US\" style=\"font-size: 10pt\">B</span><span style=\"font-size: 10pt\">.含有<span lang=\"EN-US\">0.1mol</span></span></font><span style=\"font-family: "MS Mincho"; font-size: 10pt; mso-bidi-font-family: 'MS Mincho'\">?</span><font face=\"宋体\"><span lang=\"EN-US\" style=\"font-size: 10pt\">L</span><sup><span style=\"font-size: 10pt\">-<span lang=\"EN-US\">1 </span></span></sup><span lang=\"EN-US\" style=\"font-size: 10pt\">Fe<sup>3</sup></span><sup><span style=\"font-size: 10pt\">+</span></sup><span style=\"font-size: 10pt\">的溶液中:<span lang=\"EN-US\">K</span><sup>+</sup>、<span lang=\"EN-US\">Mg<sup>2</sup></span><sup>+& lt;/sup>、<span lang=\"EN-US\">I</span><sup>-</sup>、<span lang=\"EN-US\">NO<sub>3</sub></span><sup>-& lt;/sup><span lang=\"EN-US\"><o:p></o:p></span></span></font></p><p><font face=\"宋体\"><span lang=\"EN-US\" style=\"font-size: 10pt\">C</span><span style=\"font-size: 10pt\">.无色溶液中:<span lang=\"EN-US\">Na</span><sup>+</sup>、<span lang=\"EN-US\">K</span><sup>+</sup>、<span lang=\"EN-US\">CO<sub>3</sub><sup>2</sup>< /span><sup>-</sup>、<span lang=\"EN-US\">Cu<sup>2+</sup><o:p></o:p></span></span></font></p><p><font face=\"宋体\"><span lang=\"EN-US\" style=\"font-size: 10pt\">D</span><span style=\"font-size: 10pt\">.室温下,<span lang=\"EN-US\">pH</span>=<span lang=\"EN-US\">1</span>的溶液中:<span lang=\"EN-US\">Na</span><sup>+</sup>、<span lang=\"EN-US\">Fe<sup>3</sup></span><sup>+& lt;/sup>、<span lang=\"EN-US\">NO<sub>3</sub></span><sup>-& lt;/sup>、<span lang=\"EN-US\">SO<sub>4</sub><sup>2</sup>< /span><sup>-</sup><span lang=\"EN-US\"> <o:p></o:p></span></span></font></p>` ( sqrt{2} )/(2) `<p> </p><script type=\"text/javascript\" src=\"http://192.168.1.2:8080/ibc/manage/js/ASCIIMathMLwFallback2.js\"></script>";
Html2MHTCompiler h2t = new Html2MHTCompiler(strText2, strUrl, strEncoding, "c:\\test.mht"); */
//h2t.compile();
Html2MHTCompiler.mht2html("D:\\word\\resume.doc", "D:\\word\\HAHA.html");
}
/**
*
*
* 方法说明:初始化
*
*
* 输入参数:strText 网页文本内容; strUrl 网页地址; strEncoding 网页编码; strFileName 本地文件名
*
*
* 返回类型:
*
*/
public Html2MHTCompiler(String strText, String strUrl, String strEncoding,
String strFileName) {
try {
strWeb = new URL(strUrl);
} catch (MalformedURLException e) {
e.printStackTrace();
return;
}
this.strText = strText;
this.strEncoding = strEncoding;
this.strFileName = strFileName;
}
/**
*
*
* 方法说明:执行下载操作
*
*
* 输入参数:
*
*
* 返回类型:
*
*/
public boolean compile() {
if (strWeb == null || strText == null || strFileName == null
|| strEncoding == null)
return false;
HashMap urlMap = new HashMap();
NodeList nodes = new NodeList();
try {
Parser parser = createParser(strText);
parser.setEncoding(strEncoding);
// nodes = parser.parse(null);
} catch (ParserException e) {
e.printStackTrace();
}
extractAllScriptNodes(nodes);
ArrayList urlScriptList = extractAllScriptNodes(nodes, urlMap);
ArrayList urlImageList = extractAllImageNodes(nodes, urlMap);
for (Iterator iter = urlMap.entrySet().iterator(); iter.hasNext();) {
Map.Entry entry = (Map.Entry) iter.next();
String key = (String) entry.getKey();
String val = (String) entry.getValue();
strText = JHtmlClear.replace(strText, val, key);
}
try {
createMhtArchive(strText, urlScriptList, urlImageList);
} catch (Exception e) {
e.printStackTrace();
return false;
}
return true;
}
/**
*
*
* 方法说明:建立HTML parser
*
*
* 输入参数:inputHTML 网页文本内容