package com.sanluan.einvoice.service;
import java.awt.Rectangle;
import java.io.File;
import java.io.IOException;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.PDFTextStripperByArea;
/**
* Class dedicated to recognizing electronic invoices (e-invoices) and extracting their data from PDF files.
*
* @author arthurlee
*
*/
public class PdfInvoiceExtractor {
public static Invoice extract(File file) throws IOException {
Invoice invoice = new Invoice();
PDDocument doc = PDDocument.load(file);
PDPage firstPage = doc.getPage(0);
int pageWidth = Math.round(firstPage.getCropBox().getWidth());
PDFTextStripper textStripper = new PDFTextStripper();
textStripper.setSortByPosition(true);
String fullText = textStripper.getText(doc);
if (firstPage.getRotation() != 0) {
pageWidth = Math.round(firstPage.getCropBox().getHeight());
}
String allText = replace(fullText).replaceAll("(", "(").replaceAll(")", ")").replaceAll("¥", "¥");
{
String reg = "机器编号:(?<machineNumber>\\d{12})|发票代码:(?<code>\\d{12})|发票号码:(?<number>\\d{8})|:(?<date>\\d{4}年\\d{2}月\\d{2}日)"
+ "|校验码:(?<checksum>\\d{20}|\\S{4,})";
Pattern pattern = Pattern.compile(reg);
Matcher matcher = pattern.matcher(allText);
while (matcher.find()) {
if (matcher.group("machineNumber") != null) {
invoice.setMachineNumber(matcher.group("machineNumber"));
} else if (matcher.group("code") != null) {
invoice.setCode(matcher.group("code"));
} else if (matcher.group("number") != null) {
invoice.setNumber(matcher.group("number"));
} else if (matcher.group("date") != null) {
invoice.setDate(matcher.group("date"));
} else if (matcher.group("checksum") != null) {
invoice.setChecksum(matcher.group("checksum"));
}
}
}
{
String reg = "合计¥?(?<amount>[^ \\f\\n\\r\\t\\v\\*]*)(?:¥?(?<taxAmount>\\S*)|\\*+)\\s";
Pattern pattern = Pattern.compile(reg);
Matcher matcher = pattern.matcher(allText);
if (matcher.find()) {
try {
invoice.setAmount(new BigDecimal(matcher.group("amount")));
} catch (Exception e) {
}
try {
invoice.setTaxAmount(new BigDecimal(matcher.group("taxAmount")));
} catch (Exception e) {
invoice.setTaxAmount(new BigDecimal(0));
}
}
}
if (null == invoice.getAmount()) {
String reg = "合\\u0020*计\\u0020*¥?(?<amount>[^ ]*)\\u0020+¥?(?:(?<taxAmount>\\S*)|\\*+)\\s";
Pattern pattern = Pattern.compile(reg);
Matcher matcher = pattern.matcher(fullText);
if (matcher.find()) {
try {
invoice.setAmount(new BigDecimal(matcher.group("amount")));
} catch (Exception e) {
invoice.setAmount(new BigDecimal(0));
}
try {
invoice.setTaxAmount(new BigDecimal(matcher.group("taxAmount")));
} catch (Exception e) {
invoice.setTaxAmount(new BigDecimal(0));
}
}
}
{
String reg = "价税合计\\u0028大写\\u0029(?<amountString>\\S*)\\u0028小写\\u0029¥?(?<amount>\\S*)\\s";
Pattern pattern = Pattern.compile(reg);
Matcher matcher = pattern.matcher(allText);
if (matcher.find()) {
invoice.setTotalAmountString(matcher.group("amountString"));
try {
invoice.setTotalAmount(new BigDecimal(matcher.group("amount")));
} catch (Exception e) {
invoice.setTotalAmount(new BigDecimal(0));
}
}
}
{
String reg = "收款人:(?<payee>\\S*)复核:(?<reviewer>\\S*)开票人:(?<drawer>\\S*)销售方";
Pattern pattern = Pattern.compile(reg);
Matcher matcher = pattern.matcher(allText);
if (matcher.find()) {
invoice.setPayee(matcher.group("payee"));
invoice.setReviewer(matcher.group("reviewer"));
invoice.setDrawer(matcher.group("drawer"));
}
if (allText.indexOf("通行费") > 0 && allText.indexOf("车牌号") > 0) {
invoice.setType("通行费");
}
Pattern type00Pattern = Pattern.compile("(?<p>\\S*)通发票");
Matcher m00 = type00Pattern.matcher(allText);
if (m00.find()) {
invoice.setTitle(m00.group("p").replaceAll("(?:国|统|一|发|票|监|制)", "") + "通发票");
if (null == invoice.getType()) {
invoice.setType("普通发票");
}
} else {
Pattern type01Pattern = Pattern.compile("(?<p>\\S*)用发票");
Matcher m01 = type01Pattern.matcher(allText);
if (m01.find()) {
invoice.setTitle(m01.group("p").replaceAll("(?:国|统|一|发|票|监|制)", "") + "用发票");
if (null == invoice.getType()) {
invoice.setType("专用发票");
}
}
}
}
PDFKeyWordPosition kwp = new PDFKeyWordPosition();
Map<String, List<Position>> positionListMap = kwp
.getCoordinate(Arrays.asList("机器编号", "税率", "价税合计", "合计", "开票日期", "规格型号", "车牌号", "开户行及账号", "密", "码", "区"), doc);
PDFTextStripperByArea stripper = new PDFTextStripperByArea();
stripper.setSortByPosition(true);
PDFTextStripperByArea detailStripper = new PDFTextStripperByArea();
detailStripper.setSortByPosition(true);
{
Position machineNumber;
if (positionListMap.get("机器编号").size() > 0) {
machineNumber = positionListMap.get("机器编号").get(0);
} else {
machineNumber = positionListMap.get("开票日期").get(0);
machineNumber.setY(machineNumber.getY() + 30);
}
Position taxRate = positionListMap.get("税率").get(0);
Position totalAmount = positionListMap.get("价税合计").get(0);
Position amount = positionListMap.get("合计").get(0);
Position model = null;
if (!positionListMap.get("规格型号").isEmpty()) {
model = positionListMap.get("规格型号").get(0);
} else {
model = positionListMap.get("车牌号").get(0);
model.setX(model.getX() - 15);
}
List<Position> account = positionListMap.get("开户行及账号");
Position buyer;
Position seller;
if (account.size() < 2) {
buyer = new Position(51, 122);
seller = new Position(51, 341);
} else {
buyer = account.get(0);
seller = account.get(1);
}
int maqX = 370;
List<Position> mi = positionListMap.get("密");
List<Position> ma = positionListMap.get("码");
List<Position> qu = positionListMap.get("区");
for (int i = 0; i < mi.size();
- 1
- 2
- 3
前往页