我正在尝试使用 textract 提取一些 PDF 文件中的文本。但是,当我在代码末尾打印文本时,它只打印出许多空白字符。有谁能告诉我这是怎么回事吗?(顺便说一句,text 并不等于 ""。)
import os
import codecs
import PyPDF2
import textract
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Print the text of every PDF in the 'Harbour PDF' directory. PyPDF2's
# text layer is tried first; when it yields nothing but whitespace the
# file is re-processed with textract's tesseract (OCR) method.
for filename in os.listdir('Harbour PDF'):
    # Skip the macOS Finder metadata file, which is not a PDF.
    if '.DS_Store' == filename:
        continue
    filename = 'Harbour PDF/' + filename
    print(filename)

    # The context manager closes the handle even if PyPDF2 raises.
    with open(filename, 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        num_pages = pdfReader.numPages
        text = "".join(
            pdfReader.getPage(count).extractText()
            for count in range(num_pages)
        )

    # Comparing against "" is not enough: the extracted text can consist
    # solely of whitespace, which would skip the OCR fallback and print
    # only blanks.
    if not text.strip():
        # textract.process expects the path of the file (not an open file
        # object) and returns bytes, so decode before printing.
        text = textract.process(
            filename, method='tesseract', language='eng'
        ).decode('utf-8')
    print(text)
发布于 2019-03-23 01:02:12
下面是我在 Python 中使用的两个函数(第二个需要 tesseract)。实际上,比起 pdfminer,我更喜欢 tesseract 的那个,不过两者做的事情基本相同。我不确定你的代码哪里出了问题,但我相信这些是可行的替代方案。
from PIL import Image
import pytesseract
import cv2
import os
import subprocess
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
#converts pdf, returns its text content as a string
def convert(fname, pages=None):
    """Return the text content of the PDF at *fname* as a string.

    Args:
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers to extract. When it is
            ``None`` or empty, an empty set is handed to
            ``PDFPage.get_pages`` (presumably meaning "all pages" --
            confirm against the pdfminer documentation).

    Returns:
        Everything the ``TextConverter`` wrote while the pages were
        processed.
    """
    # Imported locally because neither name appears among the imports
    # visible in this file; without them the function raises NameError.
    from io import StringIO
    from pdfminer.layout import LAParams

    pagenums = set(pages) if pages else set()

    # The StringIO buffer is closed on exit (the original referenced
    # ``output.close`` without calling it, so it never was).
    with StringIO() as output:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            # Close the input file even if a page fails to parse.
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # Read the buffer after the converter is closed, as the original did.
        return output.getvalue()
def to_txt(pdf_path, output_dir, name=None):
    """Convert a PDF to a text file by running the ``pdftotext`` command.

    Args:
        pdf_path: Path of the PDF to convert.
        output_dir: Directory that receives the text file; it is created
            when missing.
        name: Optional output file name; ``.txt`` is appended unless it
            already ends with it. When ``None`` the name is derived from
            the PDF's file name, with spaces replaced by underscores.

    Returns:
        None. A status line is printed once ``pdftotext`` has finished.
    """
    # exist_ok avoids the race between checking for and creating the dir.
    os.makedirs(output_dir, exist_ok=True)

    if name is None:
        # Accept both Windows and POSIX separators when isolating the
        # file name (the original split on backslashes only).
        base = pdf_path.replace('\\', '/').rsplit('/', 1)[-1]
        # splitext drops the real extension instead of assuming that it
        # is exactly four characters long.
        stem = os.path.splitext(base)[0]
        fname = (stem + '.txt').replace(" ", "_")
    else:
        fname = name if name.endswith('.txt') else name + '.txt'

    # os.path.join works whether or not output_dir ends with a separator;
    # plain concatenation glued the file name onto the directory name.
    end = os.path.join(output_dir, fname)

    cmd = ['pdftotext', pdf_path, end]
    # Report success only when pdftotext exits with status 0.
    if subprocess.call(cmd) == 0:
        print('Converted')
    else:
        print('Conversion failed: ' + pdf_path)
https://stackoverflow.com/questions/54195882
复制相似问题