用 PDFBox 的 jar 包来解析 PDF:
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.OutputStreamWriter;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;
/**
 * Extracts the plain text of a PDF file using PDFBox's {@code PDFTextStripper}.
 */
public class Pdf2text {

    /** Charset used for the in-memory encode/decode round trip; can represent any character. */
    private static final String TEXT_ENCODING = "UTF-8";

    /** PDF that {@link #main(String[])} processes when no path is given on the command line. */
    private static final String DEFAULT_PDF_PATH = "E:/600536_2008_zzy.pdf";

    /**
     * Reads the given PDF and returns its text content.
     *
     * <p>This method is best-effort: if loading or text extraction fails, the stack trace is
     * printed to standard error and an empty string is returned instead of propagating the
     * failure. On success it also prints the file name and the UTF-8 byte length of the
     * extracted text to standard output.
     *
     * @param f the PDF file to read
     * @return the extracted text, or an empty string if extraction failed
     * @throws Exception declared for compatibility with existing callers; failures raised
     *         while loading or extracting are caught and reported inside this method
     */
    public static String getTxt(File f) throws Exception {
        String ts = "";
        PDDocument pdfdocument = null;
        try {
            pdfdocument = PDDocument.load(f);
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            // Encode and decode with the same explicit charset so characters are not
            // replaced by '?' when the platform default charset cannot represent them.
            OutputStreamWriter writer = new OutputStreamWriter(out, TEXT_ENCODING);
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.writeText(pdfdocument.getDocument(), writer);
            // Closing the writer flushes its encoder buffer into 'out' before we read it.
            writer.close();
            byte[] contents = out.toByteArray();
            ts = new String(contents, TEXT_ENCODING);
            System.out.println(f.getName() + "length is:" + contents.length + "\n");
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Release the document even when extraction failed part-way through.
            if (pdfdocument != null) {
                try {
                    pdfdocument.close();
                } catch (Exception closeFailure) {
                    closeFailure.printStackTrace();
                }
            }
        }
        // Returned after the try statement rather than from 'finally', so that an Error
        // thrown above is not silently discarded.
        return ts;
    }

    /**
     * Prints the text of a PDF to standard output.
     *
     * @param args optional; {@code args[0]} is the path of the PDF to read. When absent,
     *             the built-in default path is used.
     */
    public static void main(String[] args) {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_PDF_PATH;
        File file = new File(path);
        try {
            System.out.println(Pdf2text.getTxt(file));
        } catch (Exception e) {
            // getTxt declares a checked exception, so it has to be handled here.
            e.printStackTrace();
        }
    }
}
======================
Word、Excel 和 PPT 都用 POI 的 jar 包来解析:
import java.io.File;
import org.apache.poi.POITextExtractor;
import org.apache.poi.extractor.ExtractorFactory;
public class DocxParser {
/**
* @param args