import PyPDF4
import pikepdf
import fitz
def jiemi(pdfpath):
    """Strip encryption from a PDF that can be opened with an empty password.

    "jiemi" is pinyin for "decrypt".

    Args:
        pdfpath: Path of the PDF to inspect. The output name is built by
            slicing off the last four characters, so a four-character
            extension such as ``.pdf`` is assumed.

    Returns:
        The path of a usable, unencrypted PDF: a new ``<name>_new<ext>``
        file when the input was encrypted, otherwise ``pdfpath`` itself
        (no copy is written in that case).

    Raises:
        Whatever ``pikepdf.open`` raises when the empty password is not
        accepted (presumably ``pikepdf.PasswordError`` -- confirm against
        the installed pikepdf version).
    """
    # Read-only access is sufficient for the encryption check, and the
    # context manager guarantees the handle is released.
    with open(pdfpath, "rb") as fp:
        is_encrypted = PyPDF4.pdf.PdfFileReader(fp).isEncrypted

    if not is_encrypted:
        # Nothing to decrypt: hand back a path that actually exists.
        return pdfpath

    new_pdfpath = pdfpath[:-4] + '_new' + pdfpath[-4:]
    # Saving through pikepdf writes the document without its encryption.
    with pikepdf.open(pdfpath, password='') as pdf:
        pdf.save(new_pdfpath)
    return new_pdfpath


# Convert every page to an image and save it
def pdf_image(pdf_name):
    """Render every page of a PDF to an image file beside the source file.

    Args:
        pdf_name: Path of the PDF. The output names are built by slicing off
            the last four characters, so a four-character extension such as
            ``.pdf`` is assumed.

    Returns:
        List of the written image paths in page order, named
        ``<name>_<1-based page number>.jpg``.
    """
    img_paths = []
    pdf = fitz.Document(pdf_name)
    try:
        # 3x zoom on both axes; the zero-degree rotation is a no-op kept
        # from the original code. The matrix is identical for every page,
        # so it is built once.
        trans = fitz.Matrix(3.0, 3.0).preRotate(0)
        for pg in range(pdf.pageCount):
            pm = pdf[pg].getPixmap(matrix=trans, alpha=False)
            # NOTE(review): the data is written with writePNG but the file
            # is named ".jpg"; the name is kept so existing consumers of
            # these paths keep working -- confirm whether it should be ".png".
            img_path = pdf_name[:-4] + '_' + str(pg + 1) + '.jpg'
            pm.writePNG(img_path)
            img_paths.append(img_path)
    finally:
        # Release the document even when rendering a page fails.
        pdf.close()
    return img_paths
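# Troubleshooting: if importing `fitz` fails with
#     ModuleNotFoundError: No module named 'frontend'
# the unrelated `fitz` package from PyPI has most likely been installed in
# place of PyMuPDF, which is what provides the `fitz` module used here. See:
#     https://blog.csdn.net/xc_zhou/article/details/102596542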
import json
import os

import fitz
# os.chdir("data")
# print(os.listdir())
from collections import namedtuple
# One piece of extracted text: its bounding box corners (x0, y0, x1, y1),
# the span it came from, and the page number it was found on.
TextUnit = namedtuple(
    'TextUnit',
    'x0 y0 x1 y1 span page_no',
)
# Folder containing this script; the input and output folders live beside it.
base_path = os.path.dirname(os.path.abspath(__file__))

# Input documents are read from "<script folder>/data". The working directory
# is switched there at import time.
data_dir_path = os.path.join(base_path, "data")
os.chdir(data_dir_path)

# Maps each entry of the data folder (name without extension) to its full path.
tasks = {
    os.path.splitext(entry)[0]: os.path.join(data_dir_path, entry)
    for entry in os.listdir(data_dir_path)
}


# Convert to txt
def _page_text(page):
    """Return the text of one page, concatenated span by span.

    A newline is inserted before a block when some text has already been
    collected and the most recently seen span ended with a space (the flag
    starts out true, so it also applies before any span has been seen).
    """
    pieces = []
    last_span_ended_with_space = True
    page_dict = json.loads(page.getText('json'))
    for block in page_dict.get('blocks', []):
        if last_span_ended_with_space and pieces:
            pieces.append("\n")
        for line in block.get('lines', []):
            for span in line.get('spans', []):
                text = span.get('text', '')
                pieces.append(text)
                last_span_ended_with_space = text.endswith(' ')
    return "".join(pieces)


def to_txt():
    """Write the text of every page of every document in ``tasks`` to disk.

    For the document at position ``index`` with key ``fname`` the pages are
    written as UTF-8 to
    ``<base_path>/txt2/<index>-<fname>/<fname>-<page>.txt``, where ``<page>``
    is the zero-based page number left-padded with zeros to three digits.
    Progress is printed as ``<index>-<total>`` before each document.

    Output folders are created on demand and existing files are overwritten,
    so the function can be re-run. The working directory is left untouched.
    """
    total = len(tasks)
    for index, fname in enumerate(tasks):
        print("{}-{}".format(index, total))

        out_dir = os.path.join(base_path, "txt2", str(index) + "-" + fname)
        os.makedirs(out_dir, exist_ok=True)

        doc = fitz.open(tasks[fname])
        try:
            for page_no, page in enumerate(doc):
                page_num = str(page_no).zfill(3)
                out_path = os.path.join(out_dir, fname + "-" + page_num + ".txt")
                with open(out_path, "wb") as out:
                    out.write(_page_text(page).encode("utf8"))
        finally:
            # Release the document even if a page fails to convert.
            doc.close()


# Convert to images
def to_img():
    """Render every page of every document in ``tasks`` to a PNG file.

    For the document at position ``index`` with key ``fname`` the pages are
    written to ``<base_path>/img/<index>-<fname>/page-<n>.png``, where ``<n>``
    is the page's own ``number`` attribute. Pages are rendered without an
    alpha channel.

    Output folders are created on demand and existing files are overwritten,
    so the function can be re-run. The working directory is left untouched.
    """
    for index, fname in enumerate(tasks):
        out_dir = os.path.join(base_path, "img", str(index) + "-" + fname)
        os.makedirs(out_dir, exist_ok=True)

        doc = fitz.open(tasks[fname])
        try:
            for page in doc:
                pix = page.getPixmap(alpha=False)
                pix.writePNG(os.path.join(out_dir, "page-%i.png" % page.number))
        finally:
            # Release the document even if a page fails to render.
            doc.close()


# Extract a page range
def to_delete():
    """Save a truncated copy of every document in ``tasks``.

    For the document at position ``index`` with key ``fname`` a new PDF is
    built from the source pages up to and including page index 9 (the first
    ten pages) and saved as
    ``<base_path>/to_pdf/<index>-<fname>/first-and-last-10.pdf``.

    Despite the file name, only the leading pages are copied; the name is
    kept as-is so existing consumers still find the file.

    Output folders are created on demand and existing files are overwritten,
    so the function can be re-run. The working directory is left untouched.
    """
    for index, fname in enumerate(tasks):
        out_dir = os.path.join(base_path, "to_pdf", str(index) + "-" + fname)
        os.makedirs(out_dir, exist_ok=True)

        doc = fitz.open(tasks[fname])
        try:
            doc2 = fitz.open()  # new, empty PDF that receives the pages
            try:
                doc2.insertPDF(doc, to_page=9)  # first 10 pages
                doc2.save(os.path.join(out_dir, "first-and-last-10.pdf"))
            finally:
                doc2.close()
        finally:
            # Release the source document even if copying or saving fails.
            doc.close()