Python爬取“一个”(ONE,wufazhuce.com)网站

# coding:utf-8
import requests
from bs4 import BeautifulSoup
import json
import time
import datetime
import pymysql
import sys
# Python 2-only workaround: reload(sys) re-exposes sys.setdefaultencoding
# (removed from the sys module at interpreter start-up) so that implicit
# str <-> unicode conversions use UTF-8 instead of ASCII. Neither the
# `reload` builtin nor `sys.setdefaultencoding` exists on Python 3.
reload(sys)
sys.setdefaultencoding('utf-8')
# Fetch and parse one article page.
def getartinfo( url ):
    """Download the article page at *url* and extract its fields.

    Returns a dict with the keys:
      'curr'     -- stripped text of the first <div class="comilla-cerrar">
      'title'    -- stripped text of the first <h2 class="articulo-titulo">
      'auchor'   -- stripped text of the first <p class="articulo-autor">
                    (the key is misspelled, but callers read it under this
                    exact name, so it is kept)
      'contents' -- the first <div class="articulo-contenido"> as a
                    BeautifulSoup Tag (None when the page has no such div)
      'add_time' -- current Unix time as an int

    Raises requests.RequestException on network failure, timeout or a
    non-2xx response, and AttributeError when one of the three text
    elements is missing from the page.
    """
    # Without a timeout requests can block forever on a stalled server.
    response = requests.get(url, timeout=10)
    # Do not try to parse an error page as if it were the article.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    def text_of(tag_name, css_class):
        # get_text() also works when the tag holds nested markup, where
        # `.string` would be None and `.string.strip()` would crash.
        return soup.find(tag_name, class_=css_class).get_text(strip=True)

    res = {}
    res['curr'] = text_of('div', "comilla-cerrar")
    res['title'] = text_of('h2', "articulo-titulo")
    res['auchor'] = text_of('p', "articulo-autor")
    res['contents'] = soup.find('div', class_="articulo-contenido")
    res['add_time'] = int(time.time())
    return res
# Fetch and parse one question-and-answer page.
def getqueinfo( url ):
    """Download the Q&A page at *url* and extract its fields.

    Returns a dict with the keys:
      'title'    -- stripped text of the first <h4> on the page
      'curr'     -- stripped text of the first <div class="cuestion-contenido">
      'auchor'   -- stripped text of the first <p class="cuestion-editor">
                    (the key is misspelled, but callers read it under this
                    exact name, so it is kept)
      'contents' -- the second <div class="cuestion-contenido"> as a
                    BeautifulSoup Tag
      'add_time' -- current Unix time as an int

    Raises requests.RequestException on network failure, timeout or a
    non-2xx response, AttributeError when <h4> or the editor paragraph is
    missing, and IndexError when the page has fewer than two
    "cuestion-contenido" divs.
    """
    # Without a timeout requests can block forever on a stalled server.
    response = requests.get(url, timeout=10)
    # Do not try to parse an error page as if it were the Q&A page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    # The same class is used for two blocks; look them up once.
    content_divs = soup.find_all('div', class_="cuestion-contenido")

    res = {}
    # get_text() also works when a tag holds nested markup, where
    # `.string` would be None and `.string.strip()` would crash.
    res['title'] = soup.find('h4').get_text(strip=True)
    res['curr'] = content_divs[0].get_text(strip=True)
    res['auchor'] = soup.find('p', class_="cuestion-editor").get_text(strip=True)
    res['contents'] = content_divs[1]
    res['add_time'] = int(time.time())
    return res
# Scrape today's article and today's question from the ONE home page and
# store both in MySQL.
url = "http://wufazhuce.com/"
# Without a timeout requests can block forever on a stalled server.
home_response = requests.get(url, timeout=10)
home_response.raise_for_status()
page = home_response.content
soup = BeautifulSoup(page,'lxml')

# Daily article: follow the link inside the first article-title paragraph.
art_list = soup.find_all("p", class_="one-articulo-titulo")
art_url = art_list[0].a.get('href')
artinfo = getartinfo(art_url)

# Daily question: follow the link inside the first question-title paragraph.
que_list = soup.find_all("p", class_="one-cuestion-titulo")
que_url = que_list[0].a.get('href')
queinfo = getqueinfo(que_url)

# Parameterized statement: the driver escapes every value, so quotes or
# backslashes in scraped text can neither break the INSERT nor inject SQL.
# NOTE(review): the question is written to day_art as well, exactly as in
# the original script -- confirm whether a separate table was intended.
insert_sql = (
    "INSERT INTO day_art(title,curr,author,contents,add_time) "
    "VALUES (%s,%s,%s,%s,%s)"
)
rows = [
    (
        info['title'],
        info['curr'],
        info['auchor'],
        # 'contents' is a BeautifulSoup Tag; store its HTML markup as text.
        str(info['contents']),
        info['add_time'],
    )
    for info in (artinfo, queinfo)
]

# NOTE(review): credentials are hard-coded; move them to configuration.
conn = pymysql.connect(host='localhost',port=3306,user='root',password='root',db='one',charset='utf8')
try:
    cursor = conn.cursor()
    try:
        for row in rows:
            cursor.execute(insert_sql, row)
    finally:
        cursor.close()
    # Commit only after both rows were inserted successfully.
    conn.commit()
finally:
    # Always release the connection, even when an INSERT fails.
    conn.close()
print('ok')

原文发布于微信公众号 - 编程坑太多(idig88)

原文发表时间:2018-03-15

本文参与腾讯云自媒体分享计划,欢迎正在阅读的你也加入,一起分享。

发表于

我来说两句

0 条评论
登录 后参与评论

扫码关注云+社区

领取腾讯云代金券