package com.wangcc.wangccocrdemo001.ocr;
import com.spire.pdf.PdfDocument;
import com.spire.pdf.PdfPageBase;
import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Reads every split PDF file in the input directory and appends the extracted
 * text to a single txt file.
 * @author wangcc
 * @createTime 2021-08-31 23:51:00
 */
public class ReadAllSplitFile {
    /** Directory (including the trailing separator) that holds the split PDF files. */
    public static String fileName = "C:\\Users\\wangchenchen\\Desktop\\boot-structure\\outFile\\outPDFByMore\\";
    /** Text file to which the extracted text of every PDF is appended. */
    public static String outPath = "C:\\Users\\wangchenchen\\Desktop\\boot-structure\\outFile\\readPdfFile.txt";

    /** Matches a chapter-style heading line; compiled once instead of on every call. */
    private static final Pattern CHAPTER_HEADING =
            Pattern.compile("(^(\\s*)第)(.{1,9})[章节卷集部篇回](\\s{1,10})(.{1,20})(\\s{1,10})");
    /** Finds a run of one to four digits anywhere in a line. */
    private static final Pattern CONTAINS_DIGITS =
            Pattern.compile("(\\s{0,10})([0-9][0-9]?[0-9]?[0-9]?)");
    /** A run of 5 to 9 whitespace characters, collapsed to a single space on output. */
    private static final Pattern WIDE_GAP = Pattern.compile("\\s{5,9}");

    /**
     * Lists the PDF files, orders them by the number embedded in their names,
     * removes any previous output file and appends each file's text to {@link #outPath}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        List<String> pdfFileNameList = new ArrayList<>();
        for (File file : readAllFile()) {
            pdfFileNameList.add(file.getName());
        }
        // Integer.compare-based ordering; plain subtraction could overflow.
        pdfFileNameList.sort(Comparator.comparingInt(ReadAllSplitFile::extractNumber));

        // readFile appends, so start from a clean output file.
        File out = new File(outPath);
        if (out.exists() && !out.delete()) {
            System.err.println("Could not delete existing output file: " + outPath);
        }

        for (String name : pdfFileNameList) {
            try {
                readFile(name);
            } catch (IOException e) {
                // Keep going with the remaining files.
                e.printStackTrace();
            }
        }
    }

    /**
     * Collects the regular files directly inside the {@link #fileName} directory.
     *
     * @return the files found; an empty list (never {@code null}) when the
     *         directory does not exist or cannot be listed
     */
    public static List<File> readAllFile() {
        List<File> fileList = new ArrayList<>();
        // Use the same directory readFile loads from, so the two cannot drift apart.
        File[] files = new File(fileName).listFiles();
        if (files == null) {
            return fileList;
        }
        for (File f : files) {
            if (f.isFile()) {
                fileList.add(f);
            }
        }
        return fileList;
    }

    /**
     * Sorts files by the number embedded in their names.
     *
     * @param fileList files to sort; must not be {@code null}
     * @param orderStr sort order, {@code "asc"} or {@code "desc"} (case-insensitive)
     * @return a new sorted list, or {@code fileList} itself, unchanged, when
     *         {@code orderStr} is {@code null} or not a recognised order
     */
    public static List<File> sortFileByName(List<File> fileList, final String orderStr) {
        // Constant-first comparison is null-safe; the original dereferenced orderStr directly.
        boolean ascending = "asc".equalsIgnoreCase(orderStr);
        boolean descending = "desc".equalsIgnoreCase(orderStr);
        if (!ascending && !descending) {
            return fileList;
        }
        Comparator<File> byNumber = Comparator.comparingInt(f -> extractNumber(f.getName()));
        List<File> sorted = new ArrayList<>(fileList);
        sorted.sort(ascending ? byNumber : byNumber.reversed());
        return sorted;
    }

    /**
     * Extracts the digits of {@code name}, concatenated, as an int.
     *
     * @param name text to scan, typically a file name
     * @return the parsed number, or 0 when {@code name} is {@code null}, contains
     *         no digits, or the digits do not fit in an int
     */
    public static int extractNumber(String name) {
        if (name == null) {
            return 0;
        }
        String digits = name.replaceAll("[^\\d]", "");
        try {
            return Integer.parseInt(digits);
        } catch (NumberFormatException e) {
            return 0;
        }
    }

    /**
     * Extracts the text of one PDF in the {@link #fileName} directory, filters it
     * page by page, prints the result and appends it to {@link #outPath}.
     *
     * @param path file name of the PDF, relative to {@link #fileName}
     * @throws IOException if the extracted text cannot be read or the output
     *         file cannot be written
     */
    public static void readFile(String path) throws IOException {
        StringBuilder sb = new StringBuilder();
        PdfDocument pdf = new PdfDocument();
        try {
            pdf.loadFromFile(fileName + path);
            int pageCount = pdf.getPages().getCount();
            // Walk the pages and collect the filtered text of each one.
            for (int i = 0; i < pageCount; i++) {
                PdfPageBase page = pdf.getPages().get(i);
                appendPageText(page.extractText(true), sb);
            }
        } finally {
            // Release the document even when loading or extraction fails.
            pdf.close();
        }

        String text = sb.toString();
        System.out.println(text);
        // Append mode: main deletes the file once, then every PDF adds to it.
        try (FileWriter writer = new FileWriter(outPath, true)) {
            writer.write(text);
        }
    }

    /** Appends the lines of one page's text that pass the filter to {@code sb}. */
    private static void appendPageText(String pageText, StringBuilder sb) throws IOException {
        if (pageText == null) {
            return;
        }
        // Read the String directly; a byte round trip through the platform
        // charset could corrupt characters that charset cannot encode.
        try (BufferedReader br = new BufferedReader(new StringReader(pageText))) {
            boolean firstLine = true;
            String line;
            while ((line = br.readLine()) != null) {
                // The first line of a page is kept only if it looks like a chapter
                // heading; later lines are not subject to that check.
                boolean candidate = !firstLine || CHAPTER_HEADING.matcher(line).find();
                firstLine = false;
                // NOTE(review): this drops every line containing a digit anywhere,
                // not only lines ending in one — confirm that is intended.
                if (candidate && !line.isEmpty() && !CONTAINS_DIGITS.matcher(line).find()) {
                    sb.append(WIDE_GAP.matcher(line).replaceAll(" ")).append('\n');
                }
            }
        }
    }
}
// Residue from the page this file was copied from: "Comments 0"