Python批量提取PDF发票信息保存至Excel文件并对文件重命名

共2个文件

exe：1个

py：1个

python

开发语言

需积分: 41 177 浏览量 2022-02-12 15:36:04 上传评论 2 收藏 45.92MB ZIP 举报

资源详情

资源评论

资源推荐

收起资源包目录

dist.zip （2个子文件）

发票信息提取保存Excel.exe 46.25MB

发票信息提取保存Excel.py 6KB

# Batch-extract fields from PDF invoices, save them to an Excel sheet and
# rename every invoice file to "<company>-<amount>.pdf".
# See the author's articles for documentation:
# https://juejin.cn/user/1495767474780568
import os
import re
from typing import List, Optional, Tuple

import pdfplumber
import xlwt
from gooey import Gooey, GooeyParser

# Patterns are compiled once here instead of once per processed file.
_TITLE_NORMAL_RE = re.compile(r'[\u4e00-\u9fa5]+电子普通发票.*?')
_TITLE_SPECIAL_RE = re.compile(r'[\u4e00-\u9fa5]+专用发票.*?')
_NUMBER_RE = re.compile(r'发票号码(.*\d+)')
_DATE_RE = re.compile(r'开票日期(.*)')
_BUYER_RE = re.compile(r'名\s*称\s*[:：]\s*([\u4e00-\u9fa5]+)')
_TAX_ID_RE = re.compile(r'纳税人识别号\s*[:：]\s*([a-zA-Z0-9]+)')
_PRICE_RE = re.compile(r'小写.*(.*[0-9.]+)')
_AMOUNT_RE = re.compile(r'[0-9.]+')
_COMPANY_RE = re.compile(r'名.*称\s*[:：]\s*([\u4e00-\u9fa5]+)')

# Generic company-name parts that are stripped before building a file name.
_COMPANY_NOISE = ('集团', '科技', '投资', '有限公司', '分公司', '公司')

# (keyword, short name) pairs, applied in order: when the keyword occurs in
# the company name the whole name is replaced by the short name.
_COMPANY_ALIASES = (
    ('滴滴', '滴滴'),
    ('中国石油', '中石油'),
    ('中石化', '中石化'),
    ('中国石化', '中国石化'),
    ('中国移动', '中国移动'),
    ('中国联通', '中国联通'),
)


def search(bt, text: str) -> Optional[str]:
    """Return the cleaned value of the first match of ``bt`` in ``text``.

    ``bt`` is a pattern string or compiled pattern. The whole match (not a
    capture group) is passed through :func:`re_block`. Returns ``None`` when
    nothing matches.
    """
    match = re.search(bt, text)
    if match is None:
        return None
    return re_block(match[0])


def _same_file_path(path_a: str, path_b: str) -> bool:
    """Return True when both paths name the same file system location."""
    norm_a = os.path.normcase(os.path.abspath(path_a))
    norm_b = os.path.normcase(os.path.abspath(path_b))
    return norm_a == norm_b


def _free_target_path(directory: str, file_name: str, current_path: str) -> str:
    """Return a path in ``directory`` that does not clobber another file.

    ``file_name`` is used as-is when it is free or already is
    ``current_path``; otherwise ``-1``, ``-2``, ... is appended to the stem.
    """
    stem, ext = os.path.splitext(file_name)
    candidate = os.path.join(directory, file_name)
    counter = 1
    while os.path.exists(candidate) and not _same_file_path(candidate, current_path):
        candidate = os.path.join(directory, '{}-{}{}'.format(stem, counter, ext))
        counter += 1
    return candidate


def re_name(full_path: str, company, price) -> Optional[str]:
    """Rename an invoice file to ``<short company>-<price>.pdf``.

    Args:
        full_path: Current path of the PDF file.
        company: Issuing company name; a falsy value skips the rename.
        price: Invoice amount, converted with ``str()``.

    Returns:
        The new base file name, or ``None`` when ``company`` is falsy and
        nothing was renamed.
    """
    if not company:
        return None

    for noise in _COMPANY_NOISE:
        company = company.replace(noise, '')
    for keyword, short_name in _COMPANY_ALIASES:
        if keyword in company:
            company = short_name

    new_file_name = company + '-' + str(price) + '.pdf'
    # os.path.dirname copes with both separators; slicing at rfind('\\')
    # chopped the last character off paths that contain no backslash.
    directory = os.path.dirname(full_path)
    # Two invoices with the same company and amount must not overwrite each
    # other (POSIX) or abort the whole run (Windows).
    new_name = _free_target_path(directory, new_file_name, full_path)
    if not _same_file_path(new_name, full_path):
        os.rename(full_path, new_name)
    return os.path.basename(new_name)


def re_block(text: str) -> str:
    """Strip layout characters from ``text`` and return the value part.

    Spaces (ASCII and full-width) and closing parentheses are removed;
    full-width colons and yen signs are turned into ``:``. When the result
    contains a ``:``, the segment right after the first one is returned,
    otherwise the cleaned text itself.
    """
    for junk in (' ', '\u3000', '）', ')'):
        text = text.replace(junk, '')
    for separator in ('：', '¥', '￥'):
        text = text.replace(separator, ':')
    parts = text.split(':')
    if len(parts) > 1:
        return parts[1]
    return text


def get_pdf(dir_path: str) -> List[str]:
    """Recursively collect the paths of all PDF files below ``dir_path``."""
    return [
        os.path.join(root, name)
        for root, _sub_dirs, file_names in os.walk(dir_path)
        for name in file_names
        # Case-insensitive so that "SCAN.PDF" is picked up as well.
        if name.lower().endswith('.pdf')
    ]


titles = ['发票标题', '发票号码', '开票日期', '公司', '纳税人识别号', '发票金额', '开票公司', '文件名称']
# Bold font style used for the header row.
font = xlwt.Font()
font.bold = True
style1 = xlwt.XFStyle()
style1.font = font


def _read_first_page_text(full_path: str) -> str:
    """Return the text of the first page of the PDF, or '' if there is none."""
    with pdfplumber.open(full_path) as pdf:
        if not pdf.pages:
            return ''
        # extract_text() may yield None for pages without a text layer
        # (e.g. scanned images), depending on the pdfplumber version.
        return pdf.pages[0].extract_text() or ''


def _parse_invoice(pdf_text: str) -> Tuple[list, str, str, float]:
    """Extract the spreadsheet row and rename data from invoice text.

    Returns:
        ``(row, issuer, price_num, amount)`` where ``row`` holds the first
        seven column values, ``issuer`` is the issuing company ('' when not
        found), ``price_num`` is the amount as text ('' when not found) and
        ``amount`` is the numeric amount (0.0 when missing or malformed).
    """
    title = search(_TITLE_NORMAL_RE, pdf_text)
    special_title = search(_TITLE_SPECIAL_RE, pdf_text)
    if special_title:
        print(special_title)

    row = [
        # Special VAT invoices have no "电子普通发票" title; fall back to
        # their own title instead of leaving the cell empty.
        title or special_title,
        search(_NUMBER_RE, pdf_text),
        search(_DATE_RE, pdf_text),
        search(_BUYER_RE, pdf_text),
        search(_TAX_ID_RE, pdf_text),
    ]

    price = search(_PRICE_RE, pdf_text)
    price_num = ''
    amount = 0.0
    if price:
        # Keep '' rather than None so the file name never contains "None".
        price_num = search(_AMOUNT_RE, price) or ''
        if price_num:
            try:
                amount = float(price_num)
            except ValueError:
                # Malformed number such as "1.2.3": leave it out of the total.
                amount = 0.0
    row.append(price)

    # The last "名称" entry on the page is taken as the issuing company.
    issuers = _COMPANY_RE.findall(pdf_text)
    issuer = re_block(issuers[-1]) if issuers else ''
    row.append(issuer)
    return row, issuer, price_num, amount


def read(dir_pdf_path: str) -> None:
    """Process every PDF invoice below ``dir_pdf_path``.

    The extracted fields are written to ``发票提取.xls`` inside
    ``dir_pdf_path``, each invoice file is renamed via :func:`re_name`, and
    the summed amount is printed. PDFs whose first page does not contain
    ``发票`` are skipped.
    """
    filenames = get_pdf(dir_pdf_path)
    if not filenames:
        print('无pdf文件')
        return

    wbk = xlwt.Workbook()
    sheet = wbk.add_sheet('sheet 1')
    for col, title in enumerate(titles):
        sheet.write(0, col, title, style1)

    total_price = 0.0
    # Separate row counter so skipped files do not leave blank rows.
    row_index = 1
    for full_path in filenames:
        print(full_path)
        pdf_text = _read_first_page_text(full_path)
        if '发票' not in pdf_text:
            continue

        row, issuer, price_num, amount = _parse_invoice(pdf_text)
        total_price += amount
        for col, value in enumerate(row):
            sheet.write(row_index, col, value)

        # The PDF handle is closed by now; Windows cannot rename open files.
        new_path = re_name(full_path, issuer, price_num)
        if not new_path:
            new_path = full_path
        sheet.write(row_index, len(titles) - 1, new_path)
        row_index += 1

    if row_index == 1:
        print('不包含发票文件')
    print('总价格：' + str(round(total_price, 2)))
    wbk.save(os.path.join(dir_pdf_path, '发票提取.xls'))


description = '1、提取.pdf发票内容至（发票提取.xls）文件\n2、将发票重命名为公司+金额'


@Gooey(program_name="PDF发票信息提取",
       language='chinese',
       encoding='cp936',
       menu=[{
           'name': '菜单',
           'items': [{
               'type': 'AboutDialog',
               'menuTitle': '关于',
               'name': 'PDF发票信息提取',
               'description': description,
               'version': '1.0',
               'copyright': '2022',
               'website': 'https://blog.csdn.net/PromiseTo?spm=1010.2135.3001.5421',
               'developer': '651461243@qq.com',
               'license': 'MIT'
           }]
       }]
       )
def main():
    """Show the Gooey dialog, then process the chosen invoice directory."""
    parser = GooeyParser(description=description)
    # Directory chooser for the folder that holds the PDF invoices.
    parser.add_argument('目录', help='请选择包含pdf发票的文件夹', widget="DirChooser")
    args = parser.parse_args()
    print('---------------------------------------------------')
    read(args.目录)
    print('执行完成')


if __name__ == '__main__':
    main()