当前位置:   article > 正文

Python爬取实战:爬取链家二手房信息,最终将数据存储到MySQL数据库中

用mysql二手房
Python爬取链家二手房信息,将数据存储到MySQL数据库中
  • 1

在这里插入图片描述
在这里插入图片描述

import requests
import re
from op_mysql import PymysqlUtil  # 导入写好的数据库连接方法

class LianJiaSpider():
    """Scrape Lianjia (bj.lianjia.com) second-hand listings into MySQL.

    Every listing on a result page is reduced to an 11-field tuple and
    written to the ``lianjia`` table through the ``PymysqlUtil`` helper.
    """

    # Seconds to wait for the server before giving up on a page; without a
    # timeout a stalled connection would block the crawl indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Set up the paged URL template, the request headers and the DB helper."""
        self.url = 'https://bj.lianjia.com/ershoufang/pg{}'
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36"
        }

        # Database helper used by save_content().
        self.pyu = PymysqlUtil()

    def send_request(self, url):
        """Fetch *url* and return the response, or ``None`` on failure.

        ``None`` is returned both for a non-200 status code and for a
        network-level error (which is printed), so one bad page does not
        abort the whole crawl.
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as e:
            print(e)
            return None
        if response.status_code == 200:
            return response
        return None

    @staticmethod
    def _search(pattern, text, group=1):
        """Return *group* of the first match of *pattern* in *text*, or ``None``."""
        match = re.search(pattern, text)
        return match.group(group) if match else None

    def _parse_item(self, li):
        """Turn the HTML of one listing ``<li>`` into the 11-field row tuple.

        Text fields that are missing become ``""``; numeric fields that are
        missing become ``None`` instead of raising.
        """
        title = "".join(re.findall(r'<div.*?class="title".*?<a.*?>(.*?)</a>', li, re.S))
        img_url = "".join(re.findall(r'<img.*?class="lj-lazy".*?data-original="(.*?)"', li, re.S))

        position = re.findall(r'<div.*?class="positionInfo".*?<a.*?>(.*?)</a>.*?<a.*?>(.*?)</a>', li, re.S)
        # Fall back to "" so the SQL layer never receives an empty list.
        position_info = "-".join(position[0]).replace(" ", "") if position else ""

        houseInfo = "".join(re.findall(r'<div.*?class="houseInfo".*?<span.*?</span>(.*?)</div>', li, re.S))
        # Prefer the number directly in front of the area unit; the looser
        # legacy pattern is kept only as a fallback, because on its own it
        # skips single-digit areas and can pick up the build year instead.
        area = (self._search(r"(\d+(?:\.\d+)?)\s*平米", houseInfo)
                or self._search(r"(\d+\.*\d+)", houseInfo))
        year = self._search(r"(\d+)年建", houseInfo)

        followInfo = "".join(re.findall(r'<div.*?class="followInfo".*?<span.*?</span>(.*?)</div>', li, re.S))
        # Collect whole numbers so that a lone "12" is not split into "1"
        # and "2" by regex backtracking.
        numbers = re.findall(r"\d+", followInfo)
        follow = numbers[0] if numbers else None  # number of followers
        push = numbers[1] if len(numbers) > 1 else None  # time since publication

        tag = "".join(re.findall(r'<div.*?class="tag">(.*?)</div>', li, re.S))
        tag = "-".join(re.findall(r"<span.*?>(.*?)</span>", tag))  # e.g. "VR看装修-房本满两年"

        totalPrice = "".join(re.findall(r'<div.*?class="totalPrice totalPrice2">.*?<span.*?>(.*?)</span>', li, re.S))

        unitPrice = "".join(re.findall(r'<div.*?class="unitPrice".*?<span>(.*?)</span>', li, re.S))
        unitPrice = self._search(r"(\d+,*\d+)", unitPrice)
        if unitPrice is not None:
            # NOTE(review): the comma becomes a dot, so "65,432" is stored as
            # "65.432" - kept as in the original; confirm this is intended.
            unitPrice = unitPrice.replace(",", ".")

        return (title, img_url, position_info, area, year, follow, push, tag, totalPrice, unitPrice, houseInfo)

    def parse_content(self, response):
        """Extract every listing from *response*, print it and save it."""
        li_list = re.findall(r'li class="clear LOGVIEWDATA LOGCLICKDATA"(.*?)</li>', response.text, re.S)
        for li in li_list:
            res = self._parse_item(li)
            print(*res)
            self.save_content(res)

    def save_content(self, res):
        """Insert one listing row into the ``lianjia`` table and commit."""
        print('正在往数据库里面存储内容')
        sql = "insert into lianjia values(0,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        self.pyu.insert(sql, args=res)
        self.pyu.Commit()

    def start(self):
        """Ask how many pages to crawl, then fetch and parse pages 1..N."""
        nums = input('请输入要爬取的页数:')
        for page in range(1, int(nums) + 1):
            full_url = self.url.format(page)
            print('正在爬取第:%s 页' % full_url)
            response = self.send_request(full_url)
            if response:
                self.parse_content(response)

if __name__ == '__main__':
    # Script entry point: build the spider and run the interactive crawl.
    LianJiaSpider().start()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
pymysql连接数据库的方法(保存为 op_mysql.py,即上文 `from op_mysql import PymysqlUtil` 所导入的模块)
  • 1
import pymysql

class PymysqlUtil():
    """Small wrapper around a single pymysql connection and its cursor."""

    def __init__(self, host='localhost', user='root', password='pwd******', database='db_xxx'):
        """Open the connection and create the cursor used by insert().

        The keyword defaults reproduce the previously hard-coded settings,
        so ``PymysqlUtil()`` behaves as before.
        """
        self.connection = pymysql.connect(host=host,
                                          user=user,
                                          password=password,
                                          database=database)
        self.connection.ping(reconnect=True)
        self.cursor = self.connection.cursor()

    def insert(self, sql, args):
        """Execute one parameterised statement.

        Returns ``True`` on success. On any error the message is printed,
        the current transaction is rolled back and ``False`` is returned.
        The connection is deliberately left open: closing it here made the
        caller's following Commit() fail and lost every later row.
        """
        try:
            self.cursor.execute(sql, args=args)
        except Exception as e:
            # Best-effort by design: a bad row must not stop the crawl.
            print(e)
            self.connection.rollback()
            return False
        return True

    def Commit(self):
        """Commit the current transaction."""
        self.connection.commit()

    def close_s(self):
        """Close the database connection."""
        self.connection.close()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29

github:

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/小蓝xlanll/article/detail/196570
推荐阅读
相关标签
  

闽ICP备14008679号