package test;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.web.util.HtmlUtils;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Converts HTML content into Word documents; see {@code writeWordFile}.
 */
public class HtmlToWord {
static String savePath = "D:/Confluence";// directory the generated files are saved to
static String scanPath = "C:/Users/Administrator/Desktop/Confluence-space-export-094133-228.html/service/";// folder that contains the files to scan
/**
 * Reads each given input stream as UTF-8 text and returns everything joined into one string.
 * Text is consumed line by line and the line terminators are not re-inserted, so consecutive
 * lines end up directly adjacent in the result. The streams are not closed here, and
 * exceptions are not handled but propagated to the caller.
 *
 * @param ises the streams to read, in order; may be {@code null}
 * @return the concatenated text, or {@code null} when {@code ises} itself is {@code null}
 * @throws IOException if reading any of the streams fails
 */
private String getContent(InputStream... ises) throws IOException {
    if (ises == null) {
        return null;
    }
    StringBuilder text = new StringBuilder();
    for (InputStream stream : ises) {
        BufferedReader reader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
        for (String current = reader.readLine(); current != null; current = reader.readLine()) {
            text.append(current);
        }
    }
    return text.toString();
}
/**
 * Writes the given HTML as a Word document, including its images, below {@code savePath}.
 * <p>
 * Steps: unescape the HTML, replace every {@code <img>} tag found by {@code getImgStr} with a
 * {@code ${imgReplaceN}} placeholder, load each image (HTTP/HTTPS download or file below
 * {@code scanPath}), write the remaining HTML into {@code temp.doc}, hand that file to
 * {@code JacobUtil.wordConveter} (presumably producing {@code temp.docx} - confirm in
 * JacobUtil), let {@code OfficeUtil.generateWord} build the document from {@code temp.docx}
 * and the image map, and finally save it under the name taken from the "title" entry of
 * {@code getImgStr}, cleaned by {@code RegExString}.
 * <p>
 * Nothing is done when {@code savePath} is empty. Failures while loading an image or while
 * writing the files are only printed to stderr.
 *
 * @param content the HTML text to convert (may contain HTML entities)
 * @return always the string {@code "save is OK"}, even if an error was printed
 */
public static String writeWordFile(String content) {
    String path = savePath;
    Map<String, Object> param = new HashMap<String, Object>();
    if (!"".equals(path)) {
        File fileDir = new File(path);
        if (!fileDir.exists()) {
            fileDir.mkdirs();
        }
        content = HtmlUtils.htmlUnescape(content);
        HashMap<String, Object> imgsornm = getImgStr(content);
        @SuppressWarnings("unchecked")
        List<HashMap<String, String>> imgs = (List<HashMap<String, String>>) imgsornm.get("list");
        int count = 0;
        for (HashMap<String, String> img : imgs) {
            count++;
            String placeholder = "${imgReplace" + count + "}";
            // Replace the img tag written with a "/>" ending ...
            content = content.replace(img.get("img"), placeholder);
            // ... and the same tag written with a plain ">" ending.
            content = content.replace(img.get("img1"), placeholder);
            Map<String, Object> header = new HashMap<String, Object>();
            InputStream inputStream = openImageStream(img.get("src"));
            try {
                int[] size = resolveImageSize(img.get("width"), img.get("height"));
                header.put("width", size[0]);
                header.put("height", size[1]);
                header.put("type", "jpg");
                header.put("content", OfficeUtil.inputStream2ByteArray(inputStream, true));
            } catch (Exception e) {
                e.printStackTrace();
            }
            param.put(placeholder, header);
        }
        try {
            // Write the HTML into a .doc container first; it is converted to .docx afterwards.
            String tempDoc = savePath + "/temp.doc";
            byte[] by = content.getBytes("UTF-8");
            ByteArrayInputStream bais = new ByteArrayInputStream(by);
            POIFSFileSystem poifs = new POIFSFileSystem();
            DirectoryEntry directory = poifs.getRoot();
            directory.createDocument("WordDocument", bais);
            FileOutputStream ostream = new FileOutputStream(tempDoc);
            try {
                poifs.writeFilesystem(ostream);
            } finally {
                // Close even when writing fails so the file handle is not leaked.
                ostream.close();
            }
            JacobUtil.wordConveter(tempDoc);
            // Temporary .docx file that the final document is generated from.
            CustomXWPFDocument doc = OfficeUtil.generateWord(param, savePath + "/temp.docx");
            System.out.println();
            // The title (h1 text) is used as file name; strip characters that are not allowed in it.
            String namestr = (String) imgsornm.get("title");
            String nm = RegExString(namestr);
            // Final Word file with the images embedded.
            FileOutputStream fopts = new FileOutputStream(savePath + "/" + nm + ".docx");
            try {
                doc.write(fopts);
            } finally {
                fopts.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    return "save is OK";
}

/** Opens the image behind an img "src": downloads HTTP/HTTPS sources, otherwise reads below scanPath; null on failure. */
private static InputStream openImageStream(String imagePath) {
    if (isRemoteImage(imagePath)) {
        try {
            URL url = new URL(imagePath);
            HttpURLConnection httpURLConnection = (HttpURLConnection) url.openConnection();
            httpURLConnection.setConnectTimeout(3000); // connect timeout in milliseconds
            httpURLConnection.setDoInput(true); // the response body is going to be read
            httpURLConnection.setRequestMethod("GET");
            if (httpURLConnection.getResponseCode() == 200) {
                return httpURLConnection.getInputStream();
            }
            httpURLConnection.disconnect();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }
    try {
        return new FileInputStream(scanPath + imagePath);
    } catch (FileNotFoundException e) {
        e.printStackTrace();
        return null;
    }
}

/** Tells whether an img "src" starts with http:// or https:// (case-insensitive). */
private static boolean isRemoteImage(String src) {
    return src != null
            && (src.regionMatches(true, 0, "http://", 0, 7)
            || src.regionMatches(true, 0, "https://", 0, 8));
}

/** Returns {width, height} parsed from the img attributes, or 400x300 when either is missing or not numeric. */
private static int[] resolveImageSize(String width, String height) {
    if (width != null && height != null) {
        try {
            return new int[] {(int) Double.parseDouble(width), (int) Double.parseDouble(height)};
        } catch (NumberFormatException ignored) {
            // Values such as "100%" or "400px" cannot be parsed; use the default size instead.
        }
    }
    return new int[] {400, 300};
}
//获取html中的图片元素信息
public static HashMap<String, Object> getImgStr(String htmlStr) {
HashMap<String, Object> rnmap = new HashMap<String, Object>();
List<HashMap<String, String>> pics = new ArrayList<HashMap<String, String>>();
Document doc = Jsoup.parse(htmlStr);
Elements imgs = doc.select("img");
//读取html中h1标签用来作为文件名开始-------
Elements title = doc.select("h1");
String text = title.text();
HashMap<String, String> mapnm = new HashMap<String, String>();
rnmap.put("title", text);
//读取html中h1标签用来作为文件名结束-------
for (Element img : imgs) {
System.out.println(img.attr("width").length());
HashMap<String, String> map = new HashMap<String, String>();
if (!"".equals(img.attr("width")) /*&& img.attr("width").length() >= 2*/) {
map.put("width", img.attr("width").substring(0, img.attr("width").length()));
}
if (!"".equals(img.attr("height")) /*&& img.attr("height").length() >= 2*/) {
map.put("height", img.attr("height").substring(0, img.attr("height").length()));
}
map.put("img", img.toString().substring(0, img.toString().length() - 1) + "/>");
map.put("img1", img.toString());
map.put("src", img.attr("src"));
pics.add(map);
}
没有合适的资源?快使用搜索试试~ 我知道了~
html转Word(可批量转化但速度较慢,支持图片显示,图片在本地或网络都行)
共35个文件
jar:21个
class:4个
java:4个
1星 需积分: 49 46 下载量 12 浏览量
2019-05-08
15:53:13
上传
评论 3
收藏 34.75MB ZIP 举报
温馨提示
HTML 转 Word:之前找过一些工具,但支持得不够全,于是在原有基础上添加了部分通用功能。
资源推荐
资源详情
资源评论
收起资源包目录
test.zip (35个子文件)
test
bin
test
CustomXWPFDocument.class 5KB
JacobUtil.class 3KB
OfficeUtil.class 7KB
HtmlToWord.class 9KB
.settings
org.eclipse.core.resources.prefs 57B
org.eclipse.jdt.core.prefs 598B
src
test
HtmlToWord.java 9KB
OfficeUtil.java 7KB
CustomXWPFDocument.java 4KB
JacobUtil.java 4KB
.project 380B
WebRoot
META-INF
MANIFEST.MF 39B
WEB-INF
classes
lib
org.apache.poi.xwpf.converter.core-1.0.1.jar 150KB
poi-scratchpad-3.14.jar 1.26MB
xmlbeans-2.6.0.jar 2.6MB
commons-codec-1.10.jar 278KB
poi-3.14.jar 2.41MB
commons-collections4-4.1.jar 734KB
spring-core-3.2.6.RELEASE.jar 850KB
poi-ooxml-3.17.jar 1.41MB
poi-3.17.jar 2.58MB
jsoup-1.11.3.jar 386KB
ooxml-schemas-1.3.jar 14.84MB
poi-ooxml-3.14.jar 1.23MB
poi-ooxml-schemas-3.17.jar 5.65MB
jacob.jar 48KB
poi-ooxml-schemas-3.14.jar 5.65MB
xdocreport-1.0.6.jar 1.09MB
spring-web-3.2.6.RELEASE.jar 613KB
org.apache.poi.xwpf.converter.xhtml-1.0.0.jar 49KB
curvesapi-1.03.jar 90KB
poi-scratchpad-3.17.jar 1.33MB
stax-api-1.0.1.jar 26KB
index.jsp 834B
.classpath 2KB
共 35 条
- 1
资源评论
- qq_34461063 2020-03-01 骗人的,从别的地方复制的。
qq_39624196
- 粉丝: 0
- 资源: 1
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功