# 使用 Python3、PyMuPDF 和 Camelot 批量处理银行电子回单:PDF 分割与 PDF 信息提取。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# A PDF splitting tool for Bank of ChengDe receipts
# Code written by das2m using Copilot on May 22, 2025, at 14:49.
import glob
import os
import re
import uuid
from datetime import datetime
from tkinter import *
from tkinter import filedialog
from tkinter import messagebox
from tkinter import ttk

import camelot
import fitz  # PyMuPDF
try:
    from termcolor import colored
except ImportError:
    # termcolor is only used to colour the console debug banners, so the tool
    # keeps working without it.
    def colored(text, *_args, **_kwargs):
        """Return *text* unchanged when termcolor is not installed."""
        return text

# Payer account name that marks a receipt as an outgoing payment: when the
# payer cell equals this value the receipt direction is 'pay'.
ORGANIZATIONNAME = 'Organization Name'


class App:
    """Tkinter window that loads receipt PDFs and splits every page in two.

    Each page is cut into an upper and a lower half. Camelot reads the tables
    of the page, the fields found in each table are used to build a file name,
    and the matching half is saved under ``target/``.
    """

    # Path separators, characters rejected by Windows file systems and
    # control characters (extracted cell text may contain newlines).
    _UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')

    # Four dash-separated groups of four hexadecimal digits.
    _VERIFICATION_CODE = (
        r'\b[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}\b'
    )

    # Order in which the extracted fields appear in the output file name.
    _FILENAME_FIELDS = (
        'date',
        'transaction_number',
        'purpose',
        'payer_name',
        'recipient_name',
        'amount',
        'direction',
        'print_count',
    )

    def __init__(self, root):
        """Build the three frames of the window inside *root*.

        Args:
            root: The Tk root window that hosts the application.
        """
        self.root = root
        self.root.title("PDF Split")
        self.root.geometry("800x600")

        # TKinter Frame 1: Place 2 buttons
        self.frame_1st = Frame(root)
        self.load_button = Button(self.frame_1st, text="Load", command=self.load_files)
        self.split_button = Button(self.frame_1st, text="Split", command=self.split)
        self.frame_1st.pack(side="top", anchor="nw", padx=10, pady=10)
        self.load_button.pack(side="left")
        self.split_button.pack(side="left")

        # TKinter Frame 2: Place table
        self.frame_2nd = Frame(root)
        columns = ["Index", "Filename", "Total Pages"]
        self.table = ttk.Treeview(
            self.frame_2nd,  # father container
            columns=columns,
            show="headings",  # hide the first column
            selectmode='browse',
            height=10,  # lines count
        )
        self.table.heading("Index", text='序号')
        self.table.heading("Filename", text='文件名')
        self.table.heading("Total Pages", text='总页数')
        self.table.column("Index", width=50)
        self.table.column("Filename", width=650)
        self.table.column("Total Pages", width=100)
        self.frame_2nd.pack(fill=BOTH, expand=1, padx=10, pady=10)
        self.table.pack()

        # TKinter Frame 3: Output information
        self.frame_3rd = Frame(root)
        self.status_bar_label = Label(self.frame_3rd, text="Output:")
        self.status_bar_text = Text(self.frame_3rd)
        self.frame_3rd.pack(fill=BOTH, expand=1, padx=10, pady=10)
        self.status_bar_label.pack(side='top', anchor='nw')
        self.status_bar_text.pack(fill=BOTH, expand=1)

        # Paths returned by the most recent file dialog.
        self.pdfs = []

    def display_status(self, message):
        """Append *message* as a purple line to the output box and scroll to it."""
        self.status_bar_text.tag_add('purple', END)
        self.status_bar_text.tag_config('purple', foreground='purple')
        self.status_bar_text.insert(END, message, 'purple')
        self.status_bar_text.insert(END, '\n', 'purple')
        self.status_bar_text.see(END)

    def load_files(self):
        """Ask for PDF files and append one table row per selected file.

        Rows are numbered after the rows already in the table, so loading a
        second batch does not produce duplicate indexes.
        """
        self.pdfs = filedialog.askopenfilenames(
            filetypes=(("PDF files", "*.pdf"), ("All files", "*.*"))
        )
        first_index = len(self.table.get_children()) + 1
        for index, pdf in enumerate(self.pdfs, start=first_index):
            doc = fitz.open(pdf)
            try:
                page_count = doc.page_count
            finally:
                # The document is only needed for its page count.
                doc.close()
            self.table.insert("", "end", values=(index, pdf, page_count))
            self.display_status(f"File {pdf} has loaded!\n")

    def match_date(self, text):
        """Return the first ``YYYY年MM月DD日`` date in *text* as ``YYYY-MM-DD``.

        Returns:
            The reformatted date string, or ``None`` when *text* has no date.
        """
        found = re.search(r'(\d{4}年\d{2}月\d{2}日)', text)
        if found is None:
            return None
        return found.group(1).replace("年", "-").replace("月", "-").replace("日", "")

    @staticmethod
    def _first_group(pattern, cell, field):
        """Return group 1 of the first *pattern* match in *cell*.

        Raises:
            IndexError: If *pattern* does not match; the message names *field*.
        """
        found = re.search(pattern, cell)
        if found is None:
            raise IndexError(
                f"receipt field {field!r} not found in table cell {cell!r}"
            )
        return found.group(1)

    def _parse_receipt_table(self, table, *, banner, banner_color, name_col,
                             detail_col, transaction_pattern,
                             print_count_pattern):
        """Extract the receipt fields shared by the upper and lower layouts.

        Args:
            table: DataFrame produced by Camelot for one receipt.
            banner: Text printed to the console before parsing.
            banner_color: termcolor colour name for *banner*.
            name_col: Column holding the payer (row 0) and recipient (row 3)
                account names.
            detail_col: Column holding purpose (row 7), transaction number
                (row 10) and verification code / print count (row 11).
            transaction_pattern: Regex whose group 1 is the transaction number.
            print_count_pattern: Regex whose group 1 is the print count.

        Returns:
            dict with keys ``payer_name``, ``direction``, ``recipient_name``,
            ``amount``, ``purpose``, ``transaction_number``,
            ``verification_code`` (list of all matches) and ``print_count``.

        Raises:
            IndexError: If a cell is missing or a field pattern does not match.
        """
        print(colored(banner, banner_color))
        extract = self._first_group
        info = {}
        info['payer_name'] = table.iloc[0, name_col]  # Payer account name
        # NOTE(review): 'recipent' is misspelled, but it is part of the
        # generated file names, so it is kept unchanged here.
        if info['payer_name'] == ORGANIZATIONNAME:
            info['direction'] = 'pay'
        else:
            info['direction'] = 'recipent'
        info['recipient_name'] = table.iloc[3, name_col]  # Recipient account name
        # The amount sits in column 0 for both layouts.
        info['amount'] = extract(r'¥(\d*\.\d{2})\n', table.iloc[6, 0], 'amount')
        info['purpose'] = extract(
            '交易渠道\n(.*)\n', table.iloc[7, detail_col], 'purpose'
        )
        info['transaction_number'] = extract(
            transaction_pattern, table.iloc[10, detail_col], 'transaction_number'
        )
        last_cell = table.iloc[11, detail_col]
        # Kept as the list of every match, as callers of the original
        # implementation received it.
        info['verification_code'] = re.findall(self._VERIFICATION_CODE, last_cell)
        info['print_count'] = extract(print_count_pattern, last_cell, 'print_count')
        return info

    def get_table_info(self, table):
        """Parse the table of the upper receipt on a page.

        Account names are read from column 3 and the detail cells from
        column 1. Rows 1-2 and 4-5 (account number, bank branch code, bank
        name) are not extracted.

        Args:
            table: DataFrame produced by Camelot for the upper receipt.

        Returns:
            dict of receipt fields, see ``_parse_receipt_table``.

        Raises:
            IndexError: If the table does not have the expected layout.
        """
        return self._parse_receipt_table(
            table,
            banner="=====Upper table=====",
            banner_color="blue",
            name_col=3,
            detail_col=1,
            transaction_pattern=r'\n(\d+)$',
            print_count_pattern=r'打印次数 \n(\d)',
        )

    def get_table_info_down(self, table):
        """Parse the table of the lower receipt on a page.

        Account names are read from column 2 and the detail cells from
        column 0.

        Args:
            table: DataFrame produced by Camelot for the lower receipt.

        Returns:
            dict of receipt fields, see ``_parse_receipt_table``.

        Raises:
            IndexError: If the table does not have the expected layout.
        """
        return self._parse_receipt_table(
            table,
            banner="=====Lower table=====",
            banner_color="green",
            name_col=2,
            detail_col=0,
            transaction_pattern=r'\n(\d+)\n',
            print_count_pattern=r'\n(\d)\n',
        )

    @classmethod
    def _safe_filename(cls, value):
        """Return ``str(value)`` with characters unsafe in file names replaced by ``_``."""
        return cls._UNSAFE_FILENAME_CHARS.sub('_', str(value))

    @classmethod
    def _target_path(cls, info):
        """Build the ``target/...pdf`` output path from the fields in *info*."""
        parts = [cls._safe_filename(info[key]) for key in cls._FILENAME_FIELDS]
        return "target/" + "-".join(parts) + ".pdf"

    def _reset_directory(self, directory, announce=False):
        """Create *directory*, or delete the regular files it already contains.

        Args:
            directory: Relative directory name.
            announce: When true, report every deleted file in the output box.
        """
        if not os.path.exists(directory):
            os.mkdir(directory)
            return
        for file in glob.glob(f'{directory}/*'):
            # os.remove cannot delete directories; leave them alone.
            if not os.path.isfile(file):
                continue
            if announce:
                self.display_status(f"Found {file} in temp directory, Now deleting it!")
            os.remove(file)

    def _save_half(self, doc, page_number, clip, parser, table):
        """Render the *clip* area of a page into its own PDF and save it.

        Args:
            doc: The opened source document.
            page_number: Zero-based index of the page inside *doc*.
            clip: Rectangle of the source page to copy.
            parser: ``get_table_info`` or ``get_table_info_down``.
            table: DataFrame with the receipt table belonging to this half.
        """
        page = doc[page_number]
        half_doc = fitz.open()
        try:
            half_page = half_doc.new_page(
                width=page.rect.width, height=page.rect.height / 2
            )
            half_page.set_mediabox(clip)
            half_page.show_pdf_page(half_page.rect, doc, page_number, clip=clip)
            text = half_page.get_text("raw")
            # Determined per half so that a date never leaks in from a
            # previously processed receipt.
            date = self.match_date(text) if text else None
            try:
                info = parser(table)
            except IndexError as exc:
                # One unreadable receipt must not abort the whole batch.
                self.display_status(
                    f"Skipping a receipt on page {page_number + 1}: {exc}"
                )
                return
            info['date'] = date
            # Save final files with specified names
            target = self._target_path(info)
            if os.path.exists(target):
                self.display_status(f"File {target} already exists, Overwriting it!")
            half_doc.save(target)
        finally:
            half_doc.close()

    def _split_page(self, doc, page_number, source):
        """Split one page of *doc* into its upper and lower receipt.

        The page is first written on its own to ``temp/`` because Camelot
        reads tables from a file; that file is removed afterwards.
        """
        output_file = f"temp/{uuid.uuid4().hex}.pdf"
        try:
            single_page_doc = fitz.open()
            try:
                single_page_doc.insert_pdf(
                    doc, from_page=page_number, to_page=page_number
                )
                single_page_doc.save(output_file)
            finally:
                single_page_doc.close()
            self.display_status(f"Splitting page {page_number + 1} from {source}")

            # Calculate the rectangular areas for the upper and lower parts
            page_rect = doc[page_number].rect
            middle = page_rect.y1 / 2
            halves = (
                (fitz.Rect(page_rect.x0, page_rect.y0, page_rect.x1, middle),
                 self.get_table_info),
                (fitz.Rect(page_rect.x0, middle, page_rect.x1, page_rect.y1),
                 self.get_table_info_down),
            )

            # Use camelot to read the temporary file
            tables = camelot.read_pdf(output_file, pages="1", flavor='lattice')
            for index, (clip, parser) in enumerate(halves):
                if index >= tables.n or tables[index].df.empty:
                    continue
                self._save_half(doc, page_number, clip, parser, tables[index].df)
        finally:
            # Delete temporary file
            if os.path.exists(output_file):
                os.remove(output_file)

    def split(self):
        """Split every page of every listed PDF and save the halves in ``target/``.

        Regular files already present in ``target/`` and ``temp/`` are deleted
        first; both directories are created when missing.
        """
        # Clean up target and temp directories
        self._reset_directory('target')
        self._reset_directory('temp', announce=True)
        for item in self.table.get_children():
            source = self.table.item(item)["values"][1]
            doc = fitz.open(source)
            try:
                for page_number in range(len(doc)):
                    self._split_page(doc, page_number, source)
            finally:
                doc.close()
        self.display_status("Done! You can find the files in the target directory!")


# Creating tkinter window
# Open the window only when the file is executed as a script, so that
# importing the module (e.g. to reuse App) does not start the GUI.
if __name__ == "__main__":
    root = Tk()
    app = App(root)
    root.mainloop()