A while back I built a WeChat bot that pushes job-fair announcements; it now runs in 5 WeChat groups with a bit under two thousand users in total. The modest goal: every Xidian student job-hunting this year should be able to use it. A few friends and I hit it off right away: let's build a WeChat mini-program! The same old story, the three steps of a crawler: fetch the pages, parse the content, store the data.
Today I finished that third step, storing the data that will feed the mini-program.
First, create the database and table.
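The post doesn't show the schema, but the INSERT statements below imply the columns. A minimal sketch of what hireinfo might look like; the column names come from the INSERTs, while the types are my guesses:

import pymysql

connection = pymysql.connect(host='XXX', user='XXX', password='XXX', charset='utf8')
cur = connection.cursor()
cur.execute("CREATE DATABASE IF NOT EXISTS campushire DEFAULT CHARACTER SET utf8")
cur.execute("USE campushire")
# Columns inferred from the INSERT statements later in the post; types are assumptions.
cur.execute("""
    CREATE TABLE IF NOT EXISTS hireinfo (
        id INT AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(255),   -- post title
        links VARCHAR(255),   -- shortened URL of the post
        viewnum INT,          -- view count (always 0 in this script)
        class VARCHAR(64),    -- source tag: 'rs' or the career site
        contents TEXT         -- full post body
    )
""")
connection.commit()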
A few things to watch out for.
In Python, import PyMySQL first; the connection code looks like this (credentials redacted):
connection = pymysql.connect(host='XXXX', user='XXX', password='XXX', db='campushire', charset='utf8',
                             cursorclass=pymysql.cursors.DictCursor)
cur = connection.cursor()
cur.execute("USE campushire")
That connects to the database and selects the schema whose table we'll insert into. (The USE is redundant given db='campushire', but it doesn't hurt.)
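Because the connection uses cursorclass=pymysql.cursors.DictCursor, query results come back as dicts keyed by column name instead of tuples, which is convenient when turning rows into JSON for the mini-program. A quick illustration (the row shown is made up):

cur.execute("SELECT title, class FROM hireinfo LIMIT 1")
row = cur.fetchone()
# with DictCursor: {'title': 'XX公司校园招聘', 'class': 'rs'}; the default cursor returns a tuple
print(row['title'])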
The inserting itself is easy; the tricky part is keeping only the data you want and stripping out what you don't. A couple of simple calls are worth writing down again here; I had used them before and then forgot...
[s.extract() for s in tiao_bsObj.findAll('p', attrs={'class': "windowClose"})]
This statement removes every matching tag, contents and all, from the parse tree. Here it's used to strip the editor info that precedes the body of Ruisi (睿思) posts, and the close-window links and other non-body clutter on the career-information site (就业信息网).
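A minimal, self-contained illustration of what extract() does (the HTML snippet is made up):

from bs4 import BeautifulSoup

html = '<div><p class="windowClose">[关闭窗口]</p><p>真正的正文</p></div>'
soup = BeautifulSoup(html, "lxml")
# extract() detaches each matching tag from the tree (and returns it)
[s.extract() for s in soup.findAll('p', attrs={'class': "windowClose"})]
print(soup.get_text())  # prints only: 真正的正文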
The insert statement:
cur.execute("INSERT INTO hireinfo (title,links,viewnum,class,contents) VALUES (\"%s\",\"%s\",\"%s\",\"%s\",\"%s\")", (P,L,V,C,R))
cur.connection.commit()
time.sleep(3)
Insert, then commit. The delay is still worth keeping; if I remember right, hitting the database too fast is what caused the connection to drop before.
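If those drops were actually MySQL closing a connection that sat idle too long, sleeping only works around it; PyMySQL can also reconnect on demand. A small sketch, assuming that was the cause:

# before each batch of inserts: re-open the connection if the server dropped it
connection.ping(reconnect=True)
cur = connection.cursor()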
What to watch: the column names must match the table exactly, the number of %s placeholders must match the column list, and the tuple of values must line up one-to-one with the columns. Also, don't wrap the placeholders in quotes: PyMySQL escapes and quotes string values itself, so writing "%s" would double-quote everything.
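To check what actually gets sent to the server, PyMySQL's cursor has a mogrify() method that returns the fully rendered SQL string; the values below are made up:

sql = "INSERT INTO hireinfo (title, links, viewnum, class, contents) VALUES (%s, %s, %s, %s, %s)"
print(cur.mogrify(sql, ('某公司校园招聘', 'http://tinyurl.com/xxxx', 0, 'rs', '正文略')))
# every string value comes back quoted and escaped by the driver itself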
And that's basically it.
Off to grab food, I'm starving.
The full script:
# coding: utf-8
import contextlib
import io
import re
import sys
import time
import urllib.error
import urllib.request
from urllib import request
from urllib.parse import urlencode
from urllib.request import urlopen

import itchat
import pymysql
from bs4 import BeautifulSoup

# force UTF-8 stdout so Chinese titles print correctly
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
def getPageContent(url):
    # fetch url and return it as a parsed BeautifulSoup tree (None on failure)
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:29.0) Gecko/20100101 Firefox/29.0'}
    req = urllib.request.Request(url=url, headers=headers)
    try:
        res = urllib.request.urlopen(req)
    except urllib.error.URLError:
        return None
    return BeautifulSoup(res.read(), "lxml")
def make_tiny(url):
    # shorten a URL via the TinyURL API
    request_url = ('http://tinyurl.com/api-create.php?' + urlencode({'url': url}))
    with contextlib.closing(urlopen(request_url)) as response:
        return response.read().decode('utf-8')
def timer(n):
    # re-login to WeChat, then wait; hotReload=True caches the login session
    itchat.auto_login(hotReload=True)
    time.sleep(n)
def rs():
    # scrape the campus-recruiting board of the Ruisi BBS
    pageURL = set()
    for i in range(1, 10):
        pages = 'http://rsbbs.xidian.edu.cn/forum.php?mod=forumdisplay&fid=554&page=' + str(i) + '&mobile=2'
        if pages not in pageURL:
            headers = {
                'User-Agent': r'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                              r'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
                'Referer': r'http://rsbbs.xidian.edu.cn',
                'Connection': 'keep-alive'}
            req = request.Request(pages, headers=headers)
            html = request.urlopen(req)
            bsObj = BeautifulSoup(html.read(), "lxml")
            # drop the "posted at ..." status line
            [s.extract() for s in bsObj.findAll('i', attrs={'class': "pstatus"})]
            tiezi = bsObj.findAll("ul")
            for tiaos in tiezi:
                for tiao in tiaos.findAll('a'):
                    for person in tiao.findAll('span', attrs={'class': "by"}):
                        T = person.get_text().strip()  # poster name, kept for later use
                    [s.extract() for s in tiao.findAll('span', attrs={'class': "by"})]
                    # post title, with the coin-bounty prefixes removed
                    P = re.sub(r'[【(]散?金币[)】]', '', tiao.get_text()).strip()
                    if 'href' in tiao.attrs:
                        try:
                            tiao_links = "http://rsbbs.xidian.edu.cn/" + tiao.attrs['href']
                            tiao_html = urlopen(tiao_links)
                            L = str(make_tiny(tiao_links))
                            tiao_bsObj = BeautifulSoup(tiao_html.read(), "lxml")
                            [s.extract() for s in tiao_bsObj.findAll('i', attrs={'class': "pstatus"})]
                            content = tiao_bsObj.findAll("div", {"class": "message"})[0]
                            R = content.get_text().strip()
                            V = 0     # view count placeholder
                            C = 'rs'  # source tag
                        except (ValueError, IndexError):
                            continue  # skip posts whose body can't be parsed
                        cur.execute("INSERT INTO hireinfo (title, links, contents, viewnum, class) VALUES (%s, %s, %s, %s, %s)",
                                    (P, L, R, V, C))
                        cur.connection.commit()
                        time.sleep(3)  # throttle: 3-second pause between inserts
            pageURL.add(pages)
def xdjobs():
    # scrape the latest postings from the university career-information site
    url = 'http://job.xidian.edu.cn/html/zpxx/jobs/'
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:29.0) Gecko/20100101 Firefox/29.0'}
    req = urllib.request.Request(url=url, headers=headers)
    res = urllib.request.urlopen(req)
    page_content = BeautifulSoup(res.read(), "lxml")
    job_content = page_content.find("div", {"class": "content"})
    rows = job_content.findAll("span")
    for row in rows:
        for cell in row.findAll('a'):
            P = cell.get_text().strip()  # posting title
            tiao_links = "http://job.xidian.edu.cn" + cell.attrs['href']
            L = str(make_tiny(tiao_links))
            tiao_req = urllib.request.Request(url=tiao_links, headers=headers)
            tiao_res = urllib.request.urlopen(tiao_req)
            tiao_bsObj = BeautifulSoup(tiao_res.read(), "lxml")
            # strip everything that isn't the posting body:
            # close-window link, editor info, print/close buttons, sidebar
            [s.extract() for s in tiao_bsObj.findAll('p', attrs={'class': "windowClose"})]
            [s.extract() for s in tiao_bsObj.findAll('p', attrs={'class': "arcInfo"})]
            [s.extract() for s in tiao_bsObj.findAll('a', attrs={'href': "javascript:window.print()"})]
            [s.extract() for s in tiao_bsObj.findAll('a', attrs={'href': "javascript:window.close()"})]
            [s.extract() for s in tiao_bsObj.findAll('div', attrs={'class': "context"})]
            content = tiao_bsObj.findAll("div", {"class": "content"})[0]
            R = content.get_text().strip()
            V = 0            # view count placeholder
            C = "就业信息网"  # source tag
            cur.execute("INSERT INTO hireinfo (title, links, viewnum, class, contents) VALUES (%s, %s, %s, %s, %s)",
                        (P, L, V, C, R))
            cur.connection.commit()
            time.sleep(3)  # throttle: 3-second pause between inserts
connection = pymysql.connect(host='XXX', user='XXX', password='XXX', db='campushire', charset='utf8',
                             cursorclass=pymysql.cursors.DictCursor)
cur = connection.cursor()
cur.execute("USE campushire")

xdjobs()
rs()
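One thing the script doesn't handle yet: every re-run inserts the same postings again. A possible fix, under the assumption that each shortened link should appear only once, is to let MySQL drop the duplicates itself:

# one-time schema change: make the links column unique
cur.execute("ALTER TABLE hireinfo ADD UNIQUE KEY uniq_links (links)")
connection.commit()

Then switch the INSERT statements above to INSERT IGNORE INTO hireinfo (...), and rows whose link is already in the table are skipped silently.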