赞
踩
Python爬取链家二手房信息,将数据存储到MySQL数据库中
import requests
import re
from op_mysql import PymysqlUtil  # project-local MySQL helper


class LianJiaSpider():
    """Scrape Lianjia (Beijing) second-hand housing listings and store them in MySQL."""

    def __init__(self):
        # {} is filled with the 1-based page number in start().
        self.url = 'https://bj.lianjia.com/ershoufang/pg{}'
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36"
        }
        # Database helper (opens its connection on construction).
        self.pyu = PymysqlUtil()

    def send_request(self, url):
        """GET *url*; return the Response on HTTP 200, otherwise None.

        A timeout is set so a stalled connection cannot hang the crawl forever.
        """
        response = requests.get(url, headers=self.headers, timeout=10)
        if response.status_code == 200:
            return response

    def parse_content(self, response):
        """Extract one record per listing <li> and persist each via save_content.

        All regexes are raw strings, and every ``re.search`` result is guarded:
        the original called ``.group()`` unconditionally and raised
        AttributeError (killing the whole page) whenever one listing lacked a
        field such as area, followInfo or unitPrice.
        """
        li_list = re.findall(r'li class="clear LOGVIEWDATA LOGCLICKDATA"(.*?)</li>', response.text, re.S)
        for li in li_list:
            title = "".join(re.findall(r'<div.*?class="title".*?<a.*?>(.*?)</a>', li, re.S))
            img_url = "".join(re.findall(r'<img.*?class="lj-lazy".*?data-original="(.*?)"', li, re.S))
            position_info = re.findall(r'<div.*?class="positionInfo".*?<a.*?>(.*?)</a>.*?<a.*?>(.*?)</a>', li, re.S)
            # Always store a string (original left an empty list when unmatched).
            position_info = "-".join(position_info[0]).replace(" ", "") if position_info else ""
            houseInfo = "".join(re.findall(r'<div.*?class="houseInfo".*?<span.*?</span>(.*?)</div>', li, re.S))
            m = re.search(r"(\d+\.*\d+)", houseInfo)
            area = m.group(1) if m else ""
            m = re.search(r"(\d+)年建", houseInfo)
            year = m.group(1) if m else None
            followInfo = "".join(re.findall(r'<div.*?class="followInfo".*?<span.*?</span>(.*?)</div>', li, re.S))
            # Single search instead of the original duplicated one.
            m = re.search(r"(\d+).*?(\d+)", followInfo)
            follow = m.group(1) if m else ""  # number of people following the listing
            push = m.group(2) if m else ""    # days since the listing was published
            tag = "".join(re.findall(r'<div.*?class="tag">(.*?)</div>', li, re.S))
            tag = "-".join(re.findall(r"<span.*?>(.*?)</span>", tag))
            totalPrice = "".join(re.findall(r'<div.*?class="totalPrice totalPrice2">.*?<span.*?>(.*?)</span>', li, re.S))
            unitPrice = "".join(re.findall(r'<div.*?class="unitPrice".*?<span>(.*?)</span>', li, re.S))
            m = re.search(r"(\d+,*\d+)", unitPrice)
            unitPrice = m.group(1).replace(",", ".") if m else ""
            print(title, img_url, position_info, area, year, follow, push, tag, totalPrice, unitPrice, houseInfo)
            res = (title, img_url, position_info, area, year, follow, push, tag,
                   totalPrice, unitPrice, houseInfo)
            self.save_content(res)

    def save_content(self, res):
        """Insert one 11-field listing tuple into the `lianjia` table and commit."""
        print('正在往数据库里面存储内容')
        # First column (id) is auto-increment; 0 lets MySQL assign it.
        sql = "insert into lianjia values(0,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        self.pyu.insert(sql, args=res)
        self.pyu.Commit()

    def start(self):
        """Prompt for a page count, then crawl and parse pages 1..N."""
        nums = input('请输入要爬取的页数:')
        for num in range(int(nums)):
            full_url = self.url.format(num + 1)
            print('正在爬取第:%s 页' % full_url)
            response = self.send_request(full_url)
            if response:
                self.parse_content(response)


if __name__ == '__main__':
    el = LianJiaSpider()
    el.start()
pymysql连接数据库方法
import pymysql class PymysqlUtil(): def __init__(self): # 创建数据库连接 self.connection = pymysql.connect(host='localhost', user='root', password='pwd******', database='db_xxx') self.connection.ping(reconnect=True) # 创建一个游标 self.cursor = self.connection.cursor() # 插入数据方法 def insert(self,sql,args): try: self.cursor.execute(sql,args=args) except Exception as e: self.connection.close() print(e) # 提交方法 def Commit(self): self.connection.commit() # 关闭连接 def close_s(self): self.connection.close()
github:
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。