
Google Scholar Literature Search and Paper Download

Author: 全栈程序员站长 · Published 2022-11-10

The tool consists of two Python files.

The main script, Search&Download.py, which scrapes the search result pages, saves the paper metadata to an Excel sheet, and drives the downloads:

# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
from Download import Hubber
import xlwt, os
from time import sleep
from tqdm import tqdm

TotalNum = 0


class Article(object):
    title = ""
    article_link = ""
    authors = ""
    authors_link = ""
    abstract = ""

    def __init__(self):
        self.title = "New Paper"


def save_xls(sheet, paper):
    # write one paper per row into the Excel sheet, column by column
    global TotalNum
    sheet.write(TotalNum, 0, TotalNum)
    sheet.write(TotalNum, 1, paper.title)
    sheet.write(TotalNum, 2, paper.article_link)
    sheet.write(TotalNum, 3, paper.journal)
    sheet.write(TotalNum, 4, paper.authors_link)
    sheet.write(TotalNum, 5, paper.abstract)
    TotalNum += 1


head = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'
}  # updated 2021-06-07: send a browser User-Agent to avoid HTTP 403 errors

article_titles = []
article_links = []


def GetInfo(sheet, url):
    # fetch one page of search results and parse every entry on it
    r = requests.get(url, headers=head)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, "html.parser")
    articles = soup.find_all(class_="gs_ri")
    for article in articles:
        paper = Article()
        try:
            title = article.find('h3')
            paper.title = title.text
            article_titles.append(paper.title)
            paper.article_link = title.a.get('href')
            article_links.append(paper.article_link)
            journal = article.find(class_="gs_a")
            paper.journal = journal.text
            authors_addrs = journal.find_all('a')
            for authors_addr in authors_addrs:
                paper.authors_link = paper.authors_link + (authors_addr.get('href')) + "\n"
            abstract = article.find(class_="gs_rs")
            paper.abstract = abstract.text
        except:
            # skip entries that are missing a field
            continue
        save_xls(sheet, paper)
    return


def getArticle(article_titles, article_links):
    dir = ".\\Articles\\" + keyword + "\\"
    if os.path.exists(dir) == False:
        os.makedirs(dir)  # makedirs also creates the parent "Articles" folder if needed
    for k in tqdm(range(len(article_titles))):
        # strip characters that are not allowed in file names
        article_titles[k] = "{0}".format(article_titles[k].replace(':', ' ')).replace('.', '')
        path = dir + article_titles[k] + ".pdf"
        try:
            Hubber.getPDF(article_links[k], path)
            sleep(0.5)
        except:
            continue


if __name__ == '__main__':
    myxls = xlwt.Workbook()
    sheet1 = myxls.add_sheet(u'PaperInfo', True)
    column = ['No.', 'Title', 'Article Link', 'Journal', 'Author Links', 'Abstract']
    for i in range(0, len(column)):
        sheet1.write(TotalNum, i, column[i])
    TotalNum += 1
    keyword = input("keywords is?\n")
    # e.g. keyword = diabetes and conjunctiva and (microcirculation or microvasculature)
    key = keyword.replace(" ", "+")
    info = keyword + "_PaperInfo.xls"
    print("\n" + "Searching...")
    if os.path.exists(info) == True:
        print("\n" + "PaperInfo already exists!")
    else:
        start = 0
        for i in tqdm(range(10)):  # first 10 result pages, 10 entries per page
            url = 'https://xs.dailyheadlines.cc/scholar?start=' + str(start) + '&q=' + key + '&hl=zh-CN&as_sdt=0,5'
            start = start + 10
            GetInfo(sheet1, url)
            myxls.save(keyword + '_PaperInfo.xls')  # save progress after every page
            sleep(0.5)
        print("\n" + "Search finished")
    print("\n" + "Downloading...")
    if len(article_titles) != 0:
        getArticle(article_titles, article_links)
    else:
        # the spreadsheet already existed, so reload titles and links from it
        import xlrd
        data = xlrd.open_workbook(info)
        table = data.sheet_by_index(0)
        article_titles = table.col_values(1)[1:]
        article_links = table.col_values(2)[1:]
        getArticle(article_titles, article_links)
    print("\n" + "Download finished")

The helper script, Download.py, plays a supporting role; more sites can be added to it (see the sketch after the script):

import os.path
import re
import requests
from bs4 import BeautifulSoup


class Hubber:
    head = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'
    }  # updated 2021-06-07: send a browser User-Agent to avoid HTTP 403 errors

    @staticmethod
    def pdf_hub(url, path):
        # the URL already points at a PDF: download it directly
        try:
            pdf = requests.get(url, headers=Hubber.head)
            with open(path, "wb") as f:
                f.write(pdf.content)
            print("\n" + "pdf found directly!")
        except:
            print("\n" + "failed to download pdf directly!\n" + url)
            Hubber.err_log(url)

    @staticmethod
    def sci_hub(path, doi):
        # resolve the DOI through Sci-Hub and download the embedded PDF
        doi = str(doi).split("https://doi.org/")[1]
        url = "https://www.sci-hub.ren/doi:" + doi + "#"
        r = requests.get(url, headers=Hubber.head)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        soup = BeautifulSoup(r.text, "html.parser")
        download_url = soup.iframe.attrs["src"]
        try:
            download_r = requests.get(download_url, headers=Hubber.head)
            download_r.raise_for_status()
            with open(path, "wb+") as temp:
                temp.write(download_r.content)
            print("\n" + "Article downloaded by doi!")
        except:
            print("\n" + "failed to download pdf by doi!\n" + url)
            Hubber.err_log(url)

    @staticmethod
    def err_log(url):
        with open("download_err.txt", "a+", encoding="utf-8") as error:
            error.write("PDF not found, download link may be: \n" + url + "\n")

    @staticmethod
    def getSoup(url):
        r = requests.get(url, headers=Hubber.head)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        soup = BeautifulSoup(r.text, "html.parser")
        return soup

    @staticmethod
    def getPDF(url, path):
        if os.path.exists(path) == True:
            print("\n" + "Article already exists")
        else:
            if (len(re.findall('pdf', url)) != 0):
                print("\n" + 'pdf link already!')
                Hubber.pdf_hub(url, path)
            elif re.match("https://www.sci-hub.ren/", url):
                print("\n" + 'sci_hub link!')
                url = str(url).replace("https://www.sci-hub.ren/", "https://doi.org/")
                Hubber.sci_hub(path, url)
            # sites where the PDF link can be scraped straight from the article page
            elif re.match("https://academic.oup.com/", url):
                soup = Hubber.getSoup(url)
                pdf_link = "https://academic.oup.com" + soup.find(class_="al-link pdf article-pdfLink").get('href')
                Hubber.pdf_hub(pdf_link, path)
                '''
                doi = soup.select('div[class="ww-citation-primary"]')[0].a.get('href')
                Hubber.sci_hub(path, doi)
                '''
            elif re.match("https://content.iospress.com/", url):
                soup = Hubber.getSoup(url)
                pdf_link = soup.find(class_="btn btn-download btn-right get-pdf").get('href')
                Hubber.pdf_hub(pdf_link, path)
            elif re.match("https://wwwnature.53yu.com/", url):
                soup = Hubber.getSoup(url)
                pdf_link = soup.find(class_="c-pdf-download__link").get('href')
                Hubber.pdf_hub(pdf_link, path)
            elif re.match("https://bjo.bmj.com/", url):
                soup = Hubber.getSoup(url)
                pdf_link = "https://bjo.bmj.com" + soup.find(class_="article-pdf-download").get('href')
                Hubber.pdf_hub(pdf_link, path)
            elif re.match("https://jamanetwork.com/", url):
                soup = Hubber.getSoup(url)
                pdf_link = "https://jamanetwork.com" + soup.find(class_="toolbar-tool toolbar-pdf al-link pdfaccess").get('data-article-url')
                Hubber.pdf_hub(pdf_link, path)
            # sites where the PDF cannot be found directly but a DOI can: go through Sci-Hub
            elif re.match("https://sciencedirect.53yu.com/", url):
                soup = Hubber.getSoup(url)
                doi = soup.find(class_="doi").get('href')
                Hubber.sci_hub(path, doi)
            elif re.match("https://diabetes.diabetesjournals.org/", url):
                soup = Hubber.getSoup(url)
                doi = soup.select('.citation-doi')[0].a.get('href')
                Hubber.sci_hub(path, doi)
            elif re.match("https://journals.lww.com/", url):
                soup = Hubber.getSoup(url)
                doi = "https://doi.org/" + str(soup.find(id="ej-journal-doi").text).split("doi: ")[1]
                Hubber.sci_hub(path, doi)
            else:
                '''
                Sites not handled yet, e.g.:
                https://europepmc.org/
                https://iovs.arvojournals.org/
                https://linkspringer.53yu.com/
                '''
                print("\n" + "To be prettified! Download link may be: " + "\n" + url)
                Hubber.err_log(url)


if __name__ == '__main__':
    url = "https://www.nature.com/articles/s41598-021-87315-7.pdf"
    url1 = "https://www.sci-hub.ren/doi:10.1067/mva.2003.139#"
    url2 = "https://www.sci-hub.ren/doi:10.1067/mva.2003.139#"
    Hubber.getPDF(url, "test.pdf")
    Hubber.getPDF(url1, "test1.pdf")
    Hubber.getPDF(url2, "test2.pdf")


Publisher: 全栈程序员栈长. Please credit the source when reposting: https://javaforall.cn/183156.html (original link: https://javaforall.cn).
