import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.POITextExtractor;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
public class StartPoi {
/**
 * Extracts the text of a document, picking the reader from the file extension.
 *
 * <p>The extension is everything after the last {@code '.'} of the path and is
 * compared case-insensitively against doc, docx, xls, xlsx, pdf, ppt and pptx.
 *
 * @param strFilePath path of the file to read
 * @return whatever the matching reader returns, or the literal message
 *         "invalid file type!" followed by a newline when no reader matches
 */
public static String textToreader(String strFilePath) {
    final int extensionStart = strFilePath.lastIndexOf(".") + 1;
    final String extension = strFilePath.substring(extensionStart);

    if (extension.equalsIgnoreCase("doc")) {
        return readWORD(strFilePath);
    }
    if (extension.equalsIgnoreCase("docx")) {
        return readWORDX(strFilePath);
    }
    if (extension.equalsIgnoreCase("xls")) {
        return readEXCEL(strFilePath);
    }
    if (extension.equalsIgnoreCase("xlsx")) {
        return readEXCELX(strFilePath);
    }
    if (extension.equalsIgnoreCase("pdf")) {
        return readPDF(strFilePath);
    }
    if (extension.equalsIgnoreCase("ppt")) {
        return readPPT(strFilePath);
    }
    if (extension.equalsIgnoreCase("pptx")) {
        return readPPTX(strFilePath);
    }
    // No reader is registered for this extension.
    return "invalid file type!\n";
}
//xml or html
/**
 * Reads a text file (intended for XML or HTML) and returns its lines concatenated.
 *
 * <p>Bytes are decoded with the platform default charset. Line terminators are
 * not preserved: every line is appended directly after the previous one.
 *
 * @param strFilePath path of the file to read
 * @return the file content without line breaks, or {@code "error"} when the file
 *         cannot be opened or read
 */
public static String readXMLorHTML(String strFilePath) {
    // try-with-resources closes the stream and reader on every path and never
    // dereferences a reader that was not created (e.g. when the file is missing).
    try (FileInputStream in = new FileInputStream(strFilePath);
            BufferedReader reader = new BufferedReader(new InputStreamReader(in))) {
        StringBuilder text = new StringBuilder();
        String line;
        while ((line = reader.readLine()) != null) {
            text.append(line);
        }
        return text.toString();
    } catch (IOException ignored) {
        // Failures are reported to the caller through the "error" sentinel.
        return "error";
    }
}
// txt
/**
 * Reads a GBK-encoded text file and returns its lines concatenated.
 *
 * <p>Line terminators are not preserved: every line is appended directly after
 * the previous one.
 *
 * @param strFilePath path of the file to read
 * @return the decoded file content without line breaks, or {@code "error"} when
 *         the file cannot be opened or read, or the "gbk" charset is unavailable
 */
public static String readTXT(String strFilePath) {
    // The stream is its own resource so it is still closed if creating the
    // reader fails (an unsupported charset surfaces as an IOException subclass).
    try (FileInputStream in = new FileInputStream(strFilePath);
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, "gbk"))) {
        StringBuilder text = new StringBuilder();
        String line;
        while ((line = reader.readLine()) != null) {
            text.append(line);
        }
        return text.toString();
    } catch (IOException ignored) {
        // Failures are reported to the caller through the "error" sentinel.
        return "error";
    }
}
// doc
/**
 * Extracts the text of a legacy Word (.doc) file.
 *
 * <p>Two or more consecutive {@code \r\n} sequences are collapsed into one, then
 * two or more consecutive {@code \n} characters are collapsed into one.
 *
 * @param strFilePath path of the .doc file
 * @return the extracted text, or {@code "error"} if anything in the extraction
 *         step throws
 */
public static String readWORD(String strFilePath) {
    FileInputStream input = null;
    WordExtractor extractor = null;
    String result = "error";
    try {
        input = new FileInputStream(strFilePath);
        extractor = new WordExtractor(input);
        String collapsed = extractor.getText()
                .replaceAll("(\\r\\n){2,}", "\r\n")
                .replaceAll("(\\n){2,}", "\n");
        input.close();
        // Only reached when reading and the early stream close both succeeded.
        result = collapsed;
    } catch (Exception e) {
        // Swallowed on purpose: the caller sees the "error" sentinel instead.
    } finally {
        try {
            if (extractor != null) {
                extractor.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        try {
            if (input != null) {
                input.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    return result;
}
// docx
/**
 * Extracts the text of a Word OOXML (.docx) file.
 *
 * @param strFilePath path of the .docx file
 * @return the extracted text, or {@code "error"} if opening, parsing, reading or
 *         the first close of the extractor throws
 */
public static String readWORDX(String strFilePath) {
    FileInputStream input = null;
    XWPFWordExtractor extractor = null;
    String result = "error";
    try {
        input = new FileInputStream(strFilePath);
        extractor = new XWPFWordExtractor(new XWPFDocument(input));
        String text = extractor.getText();
        extractor.close();
        // Only reached when both the read and the close above succeeded.
        result = text;
    } catch (Exception e) {
        // Swallowed on purpose: the caller sees the "error" sentinel instead.
    } finally {
        try {
            if (extractor != null) {
                extractor.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        try {
            if (input != null) {
                input.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    return result;
}
//xls
/**
 * Extracts the text of a legacy Excel (.xls) workbook.
 *
 * <p>The extractor is configured with {@code setFormulasNotResults(false)} and
 * {@code setIncludeSheetNames(true)} before the text is read.
 *
 * @param strFilePath path of the .xls file
 * @return the extracted text, or {@code "error"} if opening, parsing, reading or
 *         the first close of the extractor throws
 */
public static String readEXCEL(String strFilePath) {
    FileInputStream input = null;
    ExcelExtractor extractor = null;
    String result = "error";
    try {
        input = new FileInputStream(strFilePath);
        HSSFWorkbook workbook = new HSSFWorkbook(new POIFSFileSystem(input));
        extractor = new ExcelExtractor(workbook);
        extractor.setFormulasNotResults(false);
        extractor.setIncludeSheetNames(true);
        String text = extractor.getText();
        extractor.close();
        // Only reached when both the read and the close above succeeded.
        result = text;
    } catch (Exception e) {
        // Swallowed on purpose: the caller sees the "error" sentinel instead.
    } finally {
        try {
            if (extractor != null) {
                extractor.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        try {
            if (input != null) {
                input.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    return result;
}
// xlsx
/**
 * Extracts the text of an Excel OOXML (.xlsx) workbook.
 *
 * @param strFilePath path of the .xlsx file
 * @return the extracted text, or {@code "error"} if opening, parsing or reading
 *         the workbook throws
 */
public static String readEXCELX(String strFilePath) {
    FileInputStream fs = null;
    XSSFExcelExtractor extractor = null;
    try {
        fs = new FileInputStream(strFilePath);
        extractor = new XSSFExcelExtractor(new XSSFWorkbook(fs));
        // Cleanup happens once, in the finally block below.
        return extractor.getText();
    } catch (Exception ignored) {
        // Failures are reported to the caller through the "error" sentinel.
    } finally {
        // Extractor first, then the stream, each in its own try so that a
        // failure closing one can never skip closing the other.
        if (extractor != null) {
            try {
                extractor.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (fs != null) {
            try {
                fs.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return "error";
}
// pdf
/**
 * Extracts the text of a PDF file with a {@code PDFTextStripper}.
 *
 * @param strFilePath path of the .pdf file
 * @return the extracted text, or {@code "error"} if loading, text extraction or
 *         the first close of the document throws
 */
public static String readPDF(String strFilePath) {
    File pdfFile = new File(strFilePath);
    PDDocument document = null;
    String result = "error";
    try {
        document = PDDocument.load(pdfFile);
        PDFTextStripper stripper = new PDFTextStripper();
        String text = stripper.getText(document);
        document.close();
        // Only reached when both the extraction and the close above succeeded.
        result = text;
    } catch (Exception e) {
        // Swallowed on purpose: the caller sees the "error" sentinel instead.
    } finally {
        try {
            if (document != null) {
                document.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    return result;
}
// ppt
/**
 * Extracts the text of a PowerPoint (.ppt) file using the extractor returned by
 * {@code ExtractorFactory.createExtractor} for the opened stream.
 *
 * @param strFilePath path of the .ppt file
 * @return the extracted text, or {@code "error"} if opening, extractor creation,
 *         reading or the first close of the extractor throws
 */
public static String readPPT(String strFilePath) {
    FileInputStream input = null;
    POITextExtractor textExtractor = null;
    String result = "error";
    try {
        input = new FileInputStream(strFilePath);
        textExtractor = ExtractorFactory.createExtractor(input);
        String text = textExtractor.getText();
        textExtractor.close();
        // Only reached when both the read and the close above succeeded.
        result = text;
    } catch (Exception e) {
        // Swallowed on purpose: the caller sees the "error" sentinel instead.
    } finally {
        try {
            if (textExtractor != null) {
                textExtractor.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        try {
            if (input != null) {
                input.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    return result;
}
// pptx
public static String readPPTX(String strFilePath) {
FileInputStream fs = null;
POITextExtractor extractor = null;
//String content = "";
try {
fs = new FileInputStream(strFilePath);
extractor = ExtractorFactory.createExtractor(fs);
String str = extractor.getText();
//content = changeCharSet(str, "UTF-8");
extractor.close();
return str;
} catch (Exception e) {
//e.printStackTrace();
} finally {
if (extractor != null) {
try {
extractor.close();
} catch (IOEx