python用selenium驱动浏览器爬取天府新区二手房房价--并展示在网页上

一、

1,python version: 3.6.5;

2,Django version: 2.0.5;

3,web 应用测试工具selenium库: pip install selenium

4,浏览器驱动: webdriver,我用的chrome浏览器,需要下载对应浏览器版本的驱动器,参考https://blog.csdn.net/huilan_same/article/details/51896672

5,用bs4解析网页;

6,用mysql存储数据-注意修改settings的配置;

7,房天下成都天府新区二手房信息:http://cd.esf.fang.com/house-a016418/

8,网页html格式用到了bootstrap;

二、

原理:

1,用web自动化测试工具selenium驱动chrome浏览器访问目标网页;

2,用BeautifulSoup解析网页,提取需要的信息,将提取出的信息存储在mysql数据库里,然后关闭数据库连接和浏览器;

3,最后从mysql数据库里读取存储的数据,并展示在网页上。

三、

代码实现:

这里只放置了views.py的代码(省略了Django相关的导入:render、redirect来自django.shortcuts,room应为本应用models.py中定义的模型),其他代码比较简单,容易实现。

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import time
import pymysql


def _parse_listing(item):
    """Extract the fields of one listing from its ``<dl class="clearfix">`` tag.

    Args:
        item: BeautifulSoup tag of a single listing entry.

    Returns:
        A 9-tuple ``(house_title, house_room_number, house_size, house_floor,
        house_diretion, house_location, house_total_price, house_per_price,
        house_link)`` in the column order of the ``user_room`` table, or
        ``None`` when any field is missing or empty (such entries are skipped).
    """
    try:
        type_text = item.find('p', class_='tel_shop').text
        price_tag = item.find('dd', class_='price_right')
        fields = (
            item.find('span').text,
            # NOTE: fixed character offsets into the tag text. They depend on
            # the exact whitespace layout of the page markup and will silently
            # yield wrong/empty values if the site changes its HTML.
            type_text[40:50].strip(),
            type_text[91:95].strip(),
            type_text[100:200].strip(),
            type_text[270:300].strip(),
            item.find('p', class_='add_shop').find('span').text,
            price_tag.find('span', class_='red').text.strip(),
            price_tag.find('span', class_='').text.strip(),
            'http://cd.esf.fang.com' + str(item.find('a')['href']),
        )
    except (AttributeError, TypeError, KeyError):
        # A lookup returned None, or the <a> tag has no href: incomplete entry.
        return None
    return fields if all(fields) else None


def houseinfo(request):
    """Scrape three result pages of listings into MySQL, then redirect.

    Recreates the ``user_room`` table in the ``studyuser`` database, drives a
    Chrome browser through the first listing page and the two following pages,
    stores every complete listing, and finally redirects to the page that
    displays the stored rows.

    Args:
        request: The incoming HTTP request (not inspected).

    Returns:
        The result of ``redirect('/user/soufangwang/')``.

    Raises:
        selenium.common.exceptions.TimeoutException: If an expected page
            element does not appear within the 5 second wait.
        pymysql.MySQLError: If the database cannot be reached or a statement
            fails.
    """
    # Local import keeps this block self-contained; selenium is already a
    # dependency of this module.
    from selenium.common.exceptions import TimeoutException, WebDriverException

    # Placeholders are filled in by the driver, so quotes inside scraped text
    # can neither break the statement nor inject SQL.
    insert_sql = (
        "insert into user_room (house_title,house_room_number,house_size,"
        "house_floor,house_diretion,house_location,house_total_price,"
        "house_per_price,house_link) "
        "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    )
    # The "next page" link sits at a different child index depending on the page.
    next_link_primary = '#list_D10_15 > p:nth-child(10) > a'
    next_link_fallback = '#list_D10_15 > p:nth-child(12) > a'

    # No default database is selected here: selecting 'studyuser' at connect
    # time would fail before "create database if not exists" could run.
    # TODO: read the credentials from settings instead of hard-coding them.
    connect = pymysql.connect(user='root', password='xxxxxx', host='localhost',
                              port=3306, charset='utf8')
    browser = None
    try:
        with connect.cursor() as conn:
            conn.execute("create database if not exists studyuser character set utf8;")
            conn.execute("use studyuser;")
            # Every run starts from an empty table.
            conn.execute('drop table if exists user_room;')
            conn.execute(
                """create table if not exists user_room (id INT PRIMARY KEY AUTO_INCREMENT,house_title VARCHAR(200),house_room_number VARCHAR(200),house_size VARCHAR(200),house_floor VARCHAR(200),house_diretion VARCHAR(200),
         house_location VARCHAR(200),house_total_price VARCHAR(200),house_per_price VARCHAR(200),house_link VARCHAR(200))"""
            )

            browser = webdriver.Chrome()
            wait = WebDriverWait(browser, 5)

            def store_current_page():
                """Parse the page currently shown and insert its complete listings."""
                soup = BeautifulSoup(browser.page_source, 'lxml')
                for item in soup.find_all('dl', class_='clearfix'):
                    row = _parse_listing(item)
                    if row is None:
                        continue
                    for value in row:
                        print(value)
                    conn.execute(insert_sql, row)
                connect.commit()

            def click_next_page():
                """Click the "next page" link and give the new page time to load."""
                try:
                    link = wait.until(EC.element_to_be_clickable(
                        (By.CSS_SELECTOR, next_link_primary)))
                    link.click()
                except WebDriverException:
                    link = wait.until(EC.element_to_be_clickable(
                        (By.CSS_SELECTOR, next_link_fallback)))
                    link.click()
                try:
                    # The clicked link goes stale once the old document is
                    # replaced; without this the old page could be parsed again.
                    wait.until(EC.staleness_of(link))
                except TimeoutException:
                    # Best effort: the page may have updated in place.
                    pass

            browser.get('http://cd.esf.fang.com/house-a016418/')
            try:
                # Dismiss the overlay if the site shows one; it is optional.
                overlay_close = wait.until(EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, '#closemengceng')))
                overlay_close.click()
                print('done')
            except WebDriverException:
                pass
            time.sleep(1)

            # The pagination link being present means the listing has rendered.
            wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, next_link_primary)))
            store_current_page()

            for _ in range(2):
                click_next_page()
                store_current_page()
    finally:
        # Always release the browser and the connection, even on failure.
        if browser is not None:
            # quit() also ends the driver session, not just the window.
            browser.quit()
        connect.close()
    return redirect('/user/soufangwang/')


def soufangwang(request):
    """Render ``user/soufangwang.html`` with all ``room`` records.

    The template context holds ``request`` and ``house`` (the result of
    ``room.objects.all()``).
    """
    context = {
        'request': request,
        'house': room.objects.all(),
    }
    return render(request, 'user/soufangwang.html', context)

四、

Django工作流程原理图:

网页效果图:

有兴趣也可以看看网站其他未完成页面:

用户名:admin

密码:123

http://xiaomokuaipao.com/user/index/1/

原文发布于微信公众号 - 小末快跑(Faster_Future)

原文发表时间:2019-02-11

本文参与腾讯云自媒体分享计划,欢迎正在阅读的你也加入,一起分享。

发表于

我来说两句

0 条评论
登录 后参与评论

扫码关注云+社区

领取腾讯云代金券