本次爬取用到的知识点有:
1. selenium
2. pymysql
3. pyquery
1. 打开某宝首页, 输入"男装"后点击"搜索", 则跳转到"男装"的搜索界面.
2. 空白处"右击"再点击"检查"审查网页元素, 点击"Network".
1) 找到对应的URL, URL里的参数正是Query String Parameters的参数, 且请求方式是GET
2) 我们请求该URL得到内容就是"Response"里的内容, 那么点击它来确认信息.
3) 下拉看到"男装"字样, 那么再往下找, 并没有发现有关"男装"的商品信息.
4) 任意复制一个商品信息, 空白处右击再点击"查看网页源代码", 在源码查找该商品, 即可看到该商品的信息.
5) 对比网页源代码和"Response"响应内容, 发现源代码<script>..........</script>中的商品信息被替换, 这便是采用了JS加密
6) 如果去请求上面的URL, 得到的则是加密过的信息, 这时就可以利用Selenium库来模拟浏览器, 进而得到商品信息.
1. 请求网站
# -*- coding: utf-8 -*-
from selenium import webdriver  # browser driver entry point

# Driver object (a Chrome browser) used by get_one_page below.
browser = webdriver.Chrome()
def get_one_page():
    """Open the site's home page in the module-level browser."""
    browser.get("https://www.xxxxx.com")  # request the site
2. 输入"男装". 在输入之前, 需要先判断输入框是否已经加载出来: 如果已存在则直接输入"男装", 如果尚未出现则等待其加载成功后再输入.
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By  # element locator strategies
from selenium.webdriver.support import expected_conditions as EC  # wait conditions
from selenium.webdriver.support.ui import WebDriverWait  # explicit waits

# Driver object (a Chrome browser) used by get_one_page below.
browser = webdriver.Chrome()
def get_one_page():
    """Open the home page and type the keyword "男装" into the search box."""
    browser.get("https://www.xxxxx.com")
    # Wait up to 10 s for the search box (#q) to be present, then fetch it.
    # Named search_box rather than input so the builtin is not shadowed.
    search_box = WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "#q")))
    search_box.send_keys("男装")  # type the product keyword
3. 下一步就是点击"搜索"按钮, 按钮具有属性: 可点击, 那么加入判断条件.
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Driver object (a Chrome browser) used by get_one_page below.
browser = webdriver.Chrome()
def get_one_page():
    """Open the home page, type "男装" into the search box and click search."""
    browser.get("https://www.xxxxx.com")
    # Named search_box rather than input so the builtin is not shadowed.
    search_box = WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "#q")))
    search_box.send_keys("男装")
    # Wait up to 10 s until the search button is clickable, then fetch it.
    search_button = WebDriverWait(browser, 10).until(
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, "#J_TSearchForm > div.search-button > button")))
    search_button.click()
4. 获取总的页数, 同样加入等待判断.
# -*- coding: utf-8 -*-
import re

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Driver object (a Chrome browser) used by get_one_page below.
browser = webdriver.Chrome()
def get_one_page():
    """Search for "男装" from the home page and return the pager's total text.

    Returns:
        str: the text of the pager's ``div.total`` element, as displayed
        (not yet reduced to a number).
    """
    browser.get("https://www.xxxxx.com")
    # Named search_box rather than input so the builtin is not shadowed.
    search_box = WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "#q")))
    search_box.send_keys("男装")
    search_button = WebDriverWait(browser, 10).until(
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, "#J_TSearchForm > div.search-button > button")))
    search_button.click()
    # Wait up to 10 s for the total-pages element of the pager to load.
    total = WebDriverWait(browser, 10).until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.total")))
    return total.text
def main():
    """Run the search and print the raw total-pages text of the pager."""
    pages = get_one_page()
    print(pages)


if __name__ == '__main__':
    main()
5. 打印出来的不是我们想要的结果, 利用正则表达式获取, 最后再利用try...except捕捉异常
# -*- coding: utf-8 -*-
import re

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Driver object (a Chrome browser) used by get_one_page below.
browser = webdriver.Chrome()
def get_one_page(max_retries=3):
    """Search for "男装" from the home page and return the pager's total text.

    The whole sequence (open page, type keyword, click search, read pager)
    is retried when any of the explicit waits times out.

    Args:
        max_retries: how many times to retry after a timeout. The original
            retried by unbounded recursion; the bound stops an unreachable
            page from looping forever.

    Returns:
        str: the text of the pager's ``div.total`` element.

    Raises:
        TimeoutException: if the last allowed attempt also times out.
    """
    for attempt in range(max_retries + 1):
        try:
            browser.get("https://www.xxxxx.com")
            # Named search_box rather than input so the builtin is not shadowed.
            search_box = WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#q")))
            search_box.send_keys("男装")
            search_button = WebDriverWait(browser, 10).until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     "#J_TSearchForm > div.search-button > button")))
            search_button.click()
            total = WebDriverWait(browser, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > div.total")))
            return total.text
        except TimeoutException:
            # Timed out: try again unless the retry budget is used up.
            if attempt == max_retries:
                raise
def main():
    """Run the search, extract the total page count and print it.

    Raises:
        ValueError: if the pager text contains no number.
    """
    total_text = get_one_page()
    # Raw string: "\d" in a normal string literal is an invalid escape.
    match = re.search(r"\d+", total_text)
    if match is None:
        raise ValueError(
            "no page count found in pager text: {0!r}".format(total_text))
    pages = int(match.group())  # first run of digits is the total page count
    print(pages)


if __name__ == '__main__':
    main()
关于Selenium的更多内容,可参看官方文档https://selenium-python.readthedocs.io/waits.html
采用获取"到第 页"输入框方式, 切换到下一页, 同样是等待判断
需要注意的是, 跳转之后还要加入一个判断: 分页栏中高亮的页码是否就是刚才输入的页码, 以确认翻页成功
def get_next_page(page, max_retries=3):
    """Jump to result page ``page`` using the pager's page-number box.

    Args:
        page: page number to jump to.
        max_retries: how many times to retry after a timeout. The original
            retried by unbounded recursion.

    Raises:
        TimeoutException: if the last allowed attempt also times out.
    """
    for attempt in range(max_retries + 1):
        try:
            # Wait for the page-number box of the pager to load.
            page_box = WebDriverWait(browser, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > div.form > input")))
            # Clear first: send_keys appends to whatever the box holds, so
            # a leftover value would produce the wrong page number.
            page_box.clear()
            page_box.send_keys(str(page))
            # Wait until the confirm button is clickable, then click it.
            submit = WebDriverWait(browser, 10).until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit")))
            submit.click()
            # Confirm the jump: the highlighted pager item must show `page`.
            WebDriverWait(browser, 10).until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > ul > li.item.active > span"),
                    str(page)))
            return
        except TimeoutException:
            # Timed out: try again unless the retry budget is used up.
            if attempt == max_retries:
                raise
def main():
    """Run the search, then step through every result page in order.

    Raises:
        ValueError: if the pager text contains no number.
    """
    total_text = get_one_page()
    # Raw string: "\d" in a normal string literal is an invalid escape.
    match = re.search(r"\d+", total_text)
    if match is None:
        raise ValueError(
            "no page count found in pager text: {0!r}".format(total_text))
    pages = int(match.group())
    for page in range(1, pages + 1):
        get_next_page(page)


if __name__ == '__main__':
    main()
首先, 判断信息是否加载成功, 紧接着获取源码并初始化, 进而解析.
需要注意的是, 解析函数"get_info"需要分别在"get_one_page"和"get_next_page"中被调用, 这样每打开一页之后才会执行解析.
def get_info():
    """Parse the product list of the current page and print the rows.

    Each row is ``[shop, location, title, price, deal, image]``.
    """
    # Wait up to 20 s until at least one product node is present.
    WebDriverWait(browser, 20).until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, "#mainsrp-itemlist .items .item")))
    html = pq(browser.page_source)  # load the page source into pyquery
    # Created once before the loop so that rows accumulate; the original
    # reset the list on every iteration.
    data = []
    # .items() yields one pyquery object per matched product node.
    for item in html("#mainsrp-itemlist .items .item").items():
        image = item.find(".pic .img").attr("data-src")
        price = item.find(".price").text().strip().replace("\n", "")
        # Drops the last two characters of the deal-count text
        # (presumably a unit suffix; TODO confirm against the live page).
        deal = item.find(".deal-cnt").text()[:-2]
        title = item.find(".title").text().strip()
        shop = item.find(".shop").text().strip()
        location = item.find(".location").text()
        data.append([shop, location, title, price, deal, image])
    print(data)
def save_to_mysql(data):
    """Insert one product row into the MySQL table "男装".

    The table is created on first use. A failed insert is reported and
    rolled back instead of being raised.

    Args:
        data: sequence of six values in the order
            shop, location, title, price, deal, image.
    """
    table = "男装"
    # Open the database connection.
    db = pymysql.connect(host="localhost", user="root", password="password",
                         port=3306, db="spiders", charset="utf8")
    try:
        with db.cursor() as cursor:
            # Create the table if it does not exist yet. Backticks quote
            # the identifier, which cannot be passed as a query parameter.
            cursor.execute(
                "CREATE TABLE IF NOT EXISTS `{0}`(shop VARCHAR(20),location VARCHAR(10),title VARCHAR(255),price VARCHAR(20),deal VARCHAR(20), image VARCHAR(255))".format(table))
            sql = "INSERT INTO `{0}` values(%s,%s,%s,%s,%s,%s)".format(table)
            try:
                # The row values are passed as parameters, not formatted in.
                if cursor.execute(sql, data):
                    db.commit()
                    print("********已入库**********")
            except Exception as exc:
                # Still best-effort, but no longer a bare except, so
                # KeyboardInterrupt and SystemExit propagate.
                print("#########入库失败#########", exc)
                db.rollback()  # undo the failed insert
    finally:
        # Close the connection even if table creation fails.
        db.close()
# -*- coding: utf-8 -*-
import re

import pymysql
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Driver object (a Chrome browser) shared by the functions of this script.
browser = webdriver.Chrome()
def get_one_page(name, max_retries=3):
    """Search for ``name``, parse the first result page, return the pager text.

    Args:
        name: search keyword; also passed on to get_info.
        max_retries: how many times to retry after a timeout. The original
            retried by unbounded recursion.

    Returns:
        str: the text of the pager's ``div.total`` element.

    Raises:
        TimeoutException: if the last allowed attempt also times out.
    """
    for attempt in range(max_retries + 1):
        print("-----------------------------------------------获取第一页-------------------------------------------------------")
        try:
            browser.get("https://www.xxxxx.com")
            # Named search_box rather than input so the builtin is not shadowed.
            search_box = WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#q")))
            search_box.send_keys(name)
            search_button = WebDriverWait(browser, 10).until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     "#J_TSearchForm > div.search-button > button")))
            search_button.click()
            total = WebDriverWait(browser, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > div.total")))
            # Read the text now, so the result does not depend on the
            # element still being attached after parsing.
            total_text = total.text
            print("----即将解析第一页信息----")
            get_info(name)
            print("----第一页信息解析完成----")
            return total_text
        except TimeoutException:
            # Timed out: try again unless the retry budget is used up.
            if attempt == max_retries:
                raise
def get_next_page(page, name, max_retries=3):
    """Jump to result page ``page`` via the pager's page-number box and parse it.

    Args:
        page: page number to jump to.
        name: search keyword, passed on to get_info.
        max_retries: how many times to retry after a timeout. The original
            retried by unbounded recursion.

    Raises:
        TimeoutException: if the last allowed attempt also times out.
    """
    for attempt in range(max_retries + 1):
        print("---------------------------------------------------正在获取第{0}页----------------------------------------".format(page))
        try:
            page_box = WebDriverWait(browser, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > div.form > input")))
            # Clear first: send_keys appends to whatever the box holds, so
            # a leftover value would produce the wrong page number.
            page_box.clear()
            page_box.send_keys(str(page))
            submit = WebDriverWait(browser, 10).until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit")))
            submit.click()
            # Confirm the jump: the highlighted pager item must show `page`.
            WebDriverWait(browser, 10).until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > ul > li.item.active > span"),
                    str(page)))
            print("-----即将解析第{0}页信息-----".format(page))
            get_info(name)
            print("-----第{0}页信息解析完成-----".format(page))
            return
        except TimeoutException:
            # Timed out: try again unless the retry budget is used up.
            if attempt == max_retries:
                raise
def get_info(name):
    """Parse every product on the current page and store each one in MySQL.

    Args:
        name: table name passed on to save_to_mysql.
    """
    # Wait up to 20 s until at least one product node is present.
    WebDriverWait(browser, 20).until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, "#mainsrp-itemlist .items .item")))
    html = pq(browser.page_source)
    # Created once before the loop so that every product is kept; the
    # original reset the list on every iteration.
    rows = []
    for item in html("#mainsrp-itemlist .items .item").items():
        image = item.find(".pic .img").attr("data-src")
        price = item.find(".price").text().strip().replace("\n", "")
        # Drops the last two characters of the deal-count text
        # (presumably a unit suffix; TODO confirm against the live page).
        deal = item.find(".deal-cnt").text()[:-2]
        title = item.find(".title").text().strip()
        shop = item.find(".shop").text().strip()
        location = item.find(".location").text()
        rows.append([shop, location, title, price, deal, image])
    for row in rows:
        save_to_mysql(row, name)
def save_to_mysql(data, name):
    """Insert one product row into the MySQL table ``name``.

    The table is created on first use. A failed insert is reported and
    rolled back instead of being raised.

    Args:
        data: sequence of six values in the order
            shop, location, title, price, deal, image.
        name: table name.
    """
    # An identifier cannot be passed as a query parameter, so quote it with
    # backticks and double any backtick inside it.
    table = "`{0}`".format(str(name).replace("`", "``"))
    db = pymysql.connect(host="localhost", user="root", password="password",
                         port=3306, db="spiders", charset="utf8")
    try:
        with db.cursor() as cursor:
            cursor.execute(
                "CREATE TABLE IF NOT EXISTS {0}(shop VARCHAR(20),location VARCHAR(10),title VARCHAR(255),price VARCHAR(20),deal VARCHAR(20), image VARCHAR(255))".format(table))
            sql = "INSERT INTO {0} values(%s,%s,%s,%s,%s,%s)".format(table)
            try:
                # The row values are passed as parameters, not formatted in.
                if cursor.execute(sql, data):
                    db.commit()
                    print("********已入库**********")
            except Exception as exc:
                # Still best-effort, but no longer a bare except, so
                # KeyboardInterrupt and SystemExit propagate.
                print("#########入库失败#########", exc)
                db.rollback()
    finally:
        # Close the connection even if table creation fails.
        db.close()
def main(name):
    """Scrape every result page for the keyword ``name`` into MySQL.

    Args:
        name: search keyword; also used as the MySQL table name.

    Raises:
        ValueError: if the pager text contains no number.
    """
    try:
        total_text = get_one_page(name)
        # Raw string: "\d" in a normal string literal is an invalid escape.
        match = re.search(r"\d+", total_text)
        if match is None:
            raise ValueError(
                "no page count found in pager text: {0!r}".format(total_text))
        pages = int(match.group())
        # get_one_page has already parsed page 1, so start from page 2 to
        # avoid storing the first page twice.
        for page in range(2, pages + 1):
            get_next_page(page, name)
    finally:
        # Shut the browser down so no driver process is left running.
        browser.quit()


if __name__ == '__main__':
    name = "男装"
    main(name)
以上是对学习的总结, 若有不对的地方, 还请指正, 谢谢!