I downloaded the entire PubMed corpus for text mining. As a preprocessing step, I am parsing it down to JSON to reduce its size and to strip out anything irrelevant or hard to mine, such as the bibliography. However, the full corpus is about 25 GB, and the current ETA is roughly 50 hours. Below is my Python script. I have already tried multiprocessing, which sped things up by about 3x. I also timed the code and found that the bottleneck (about 90% of the runtime) is the line tree = BS(f.read(), features='lxml-xml'), so I don't think the regexes are the problem. Is there any other way to improve the speed?
import glob
import json
import multiprocessing as mp
import os
import re
from bs4 import BeautifulSoup as BS
from tqdm import tqdm
skipped = 0
files = tuple(glob.iglob(r'*\*.nxml'))  # every .nxml one directory down (Windows-style pattern)
pbar = tqdm(total=len(files))
def xml2json(filename, logging=False):
    if logging:
        tqdm.write("Now parsing {}".format(filename))

    with open(filename, 'r', encoding='utf-8') as f:
        # start = time.time()
        tree = BS(f.read(), features='lxml-xml')
        # print("elapsed time " + str(time.time() - start))

    dic = {
        'publisher': {
            'name': "",  # tree.find('publisher-name').text,
            'loc': "",   # tree.find('publisher-loc').text
        },
        "id": tree.find('article-id').text,
        'title': tree.find('article-title').text.strip(),
        'contributors': [],
        "pub": {
            "volume": "",
            "issue": "",
            "day": "",
            "month": "",
            "year": "",
        },
        "abstract": "",
        "body": "",
        "body_headings": ""
    }

    # start = time.time()
    for tag in ("volume", "issue", "day", "month", "year"):
        node = tree.find(tag)
        if node:
            dic["pub"][tag] = node.text

    node = tree.find('publisher-name')
    if node:
        dic["publisher"]["name"] = node.text
    node = tree.find('publisher-loc')
    if node:
        dic["publisher"]["loc"] = node.text

    contributors = []
    branch = tree.find("contrib-group")
    if branch:
        for node in branch.find_all("contrib"):
            contributors.append("{}, {}".format(node.find("surname").text, node.find("given-names").text))
    dic["contributors"] = contributors

    abstract = ""
    branch = tree.find("abstract")
    if not branch:
        return None
    for node in branch.find_all(["p"]):
        if node.text == "Supporting Information":
            break
        text = "\n" + node.text.replace("\n", "").strip()
        text = re.sub(r"[(\[].*?[)\]]", "", text)  # strip bracketed citations
        text = re.sub(r" {2,}", " ", text)         # collapse runs of spaces
        text = re.sub(r" \.", ".", text)           # re-attach orphaned periods
        abstract += text
    dic["abstract"] = abstract

    body = ""
    body_headings = ""
    branch = tree.find("body")
    if not branch:
        return None
    for node in branch.find_all(["title", "p"]):
        if node.text == "Supporting Information":
            break
        if node.name == "title":
            text = "\n"
        else:
            text = ""
        text += "\n" + node.text.replace("\n", "").strip()
        text = re.sub(r"[(\[].*?[)\]]", "", text)
        text = re.sub(r" {2,}", " ", text)
        text = re.sub(r" (\.|,)", r"\g<1>", text)
        body_headings += text
        if node.name == "p":
            body += text
    dic["body"] = body
    dic["body_headings"] = body_headings
    # print(time.time() - start)
    return dic
def parse(file):
    _, name = os.path.split(file)
    name, _ = os.path.splitext(name)
    dic = xml2json(file, logging=False)
    if dic is None:
        # tqdm.write("Skipping!")
        return False  # report the skip to the callback instead of touching a global
    # only create the output file once there is something to write
    with open("json/{}.json".format(name[3:]), "w") as f:  # name[3:] drops the 3-character filename prefix
        json.dump(dic, f)
    return True

def callback(ok):
    # runs in the main process, so the shared counter updates reliably here;
    # workers each have their own copy of module-level globals
    global skipped
    if not ok:
        skipped += 1
    pbar.update(1)

def error_callback(e):
    print(e)
if __name__ == '__main__':
    tqdm.write("Found {} files...".format(len(files)))
    pool = mp.Pool()
    for filepath in files:
        pool.apply_async(parse, (filepath,), callback=callback, error_callback=error_callback)
    pool.close()
    pool.join()
    pbar.close()
    print("Done, skipped {}".format(skipped))

Posted on 2020-04-29 06:45:55
BeautifulSoup builds its parse tree in Python, which is far less efficient than C; even with features='lxml-xml', lxml only does the tokenizing while the tree construction still happens in Python. On top of that, HTML is messier than XML, which adds to the parsing burden. I believe .nxml files are fully XML-compliant, so parsing them directly with the C-based lxml parser should be much faster.
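A minimal sketch of that swap, using lxml.etree directly. The tag names are taken from the script above; treat it as a starting point rather than a drop-in replacement (the regex cleanup and the remaining fields would still need porting):

from lxml import etree

def xml2json_lxml(filename):
    # etree.parse builds the tree entirely in C (libxml2); Python objects
    # are only created for the nodes we actually touch afterwards
    tree = etree.parse(filename)

    def first_text(tag):
        # text of the first matching element, or "" if absent/empty
        node = tree.find('.//{}'.format(tag))
        return node.text if node is not None and node.text else ""

    branch = tree.find('.//abstract')
    if branch is None:
        return None
    # itertext() walks nested inline markup (<italic>, <xref>, ...),
    # which plain .text would cut off at the first child element
    abstract = "\n".join("".join(p.itertext()).strip()
                         for p in branch.iterfind('.//p'))

    return {
        "id": first_text('article-id'),
        "title": first_text('article-title').strip(),
        "abstract": abstract,
    }

The rest of the script maps over almost mechanically (find becomes find('.//tag'), find_all becomes iterfind('.//tag')), and the actual speedup is worth checking on a handful of files before committing to the rewrite.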
https://stackoverflow.com/questions/61486194