需要使用python从具有不同PDF结构的发票PDF文件中提取特定的文本,并将输出数据存储到特定的excel列中。所有PDF文件都有不同的结构,但内容值相同。
试图解决这个问题,但不能只提取特定的文本值。
PDF文件样本:
需要提取发票ID,发行日期,主题,金额从整个PDF文件。
到目前为止我使用的脚本:
import PyPDF2
import re
pdfFileObj = open('test.pdf','rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
pageObj = pdfReader.getPage(0)
text = str(pageObj.extractText())
quotes = re.findall(r'"[^"]*"',text)
print(quotes)发布于 2020-10-04 15:58:37
您有一个非常好的pdf文档,因为您的pdf有表单字段,所以您可以直接使用它们来读取数据:
import PyPDF2
pdfFileObj = open('test.pdf', 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
fields = pdfReader.getFormTextFields()
print(fields["Invoice ID"])
print(fields["Issue Date"])
print(fields["Subject"])
print(fields["Amount Due"])编辑:I将您所请求的数据(从这里:如何使用python从PDF文件中只提取特定的文本)合并到一个小脚本中,并提供了3个解析pdf的机会(针对您的3个pdfs)。问题是你的pdfs有很多不同的地方,而且包在不同的pdfs上也有一些优势,所以我认为你必须把这些东西结合起来。问题是,你尝试所有的函数,直到得到结果。希望这对你来说是个好的开始。如果您有更多不同的pdfs,并且可能必须将所有regex (每个字段)存储在一个数组中,并在不同的函数上使用它们,那么您可能必须更改regexes,这样您就有了3个解析函数和4个regexes列表用于其中的两个函数。
import PyPDF2
import re
import os
from io import StringIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
def parse_pdf_by_regex_2(filename: str) -> dict:
output_string = StringIO()
with open(filename, 'rb') as in_file:
parser = PDFParser(in_file)
doc = PDFDocument(parser)
rsrcmgr = PDFResourceManager()
device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.create_pages(doc):
interpreter.process_page(page)
regex_invoice_no = re.compile(r"Invoice No.:\s*(\w+)\s")
regex_order_no = re.compile(r"IRN:\s*(\d+)")
regex_due_date = re.compile(r"Due Date: (\d{2}\.\d{2}\.\d{4})")
regex_total_due = re.compile(r"([\d,.]+) \n\nTotal Invoice Value\(in words\)")
try:
return {"invoice_id": re.search(regex_invoice_no, output_string.getvalue()).group(1),
"issue_date": re.search(regex_due_date, output_string.getvalue()).group(1),
"subject": re.search(regex_order_no, output_string.getvalue()).group(1),
"amount": re.search(regex_total_due, output_string.getvalue()).group(1)}
except AttributeError as err:
print("Not all elements have been found")
return {}
def parse_pdf_by_form_fields(filename: str) -> dict:
with open(filename, 'rb') as file:
pdf_reader = PyPDF2.PdfFileReader(file)
try:
fields = pdf_reader.getFormTextFields()
except TypeError as err:
# print("No FormFields available")
return {}
try:
# You can also check if onyly missing some values, maybe this can happen, but this is up to your data
return {"invoice_id": fields["Invoice ID"],
"issue_date": fields["Issue Date"],
"subject": fields["Subject"],
"amount": fields["Amount Due"]}
except KeyError as err:
# print(f"Key not found: '{err.args[0]}'")
return {}
def parse_pdf_by_regex(filename: str) -> dict:
with open(filename, 'rb') as file:
pdf_reader = PyPDF2.PdfFileReader(file)
text_data = ""
for page_no in range(pdf_reader.getNumPages()):
text_data += pdf_reader.getPage(page_no).extractText()
regex_invoice_no = re.compile(r"Invoice Number\s*(INV-\d+)")
regex_order_no = re.compile(r"Order Number(\d+)")
regex_due_date = re.compile(r"Due Date(\S+ \d{1,2}, \d{4})")
regex_total_due = re.compile(r"Total Due(\$\d+\.\d{1,2})")
try:
return {"invoice_id": re.search(regex_invoice_no, text_data).group(1),
"issue_date": re.search(regex_due_date, text_data).group(1),
"subject": re.search(regex_order_no, text_data).group(1),
"amount": re.search(regex_total_due, text_data).group(1)}
except AttributeError as err:
# print("Not all elements have been found")
return {}
def parse_pdf(filename: str) -> dict:
# Hint: ':=' is available since pythoon 3.8
if data := parse_pdf_by_form_fields(filename=fname):
return data
elif data := parse_pdf_by_regex(filename=fname):
return data
elif data := parse_pdf_by_regex_2(filename=fname):
return data
else:
print("No data found")
return {}
if __name__ == '__main__':
for fname in os.listdir("."):
if fname.startswith("testfile"):
print(f"check {fname}")
print(parse_pdf(filename=fname))https://stackoverflow.com/questions/64196016
复制相似问题