
Scraping Lianjia Housing Listings with Python Multithreading: Saving to Excel, Inserting into MySQL for Data Visualization and Analysis, and Deploying the Script


Below is the crawler script, resuqsts_page.py. The file paths in the script have already been changed to the Linux directory layout used inside the container.

import datetime
import threading
import time

import openpyxl
import pymysql
import requests
import urllib3
from dbutils.pooled_db import PooledDB
from lxml import etree

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# create_table = """
# CREATE TABLE res_info.res_info (
#     id int(11) auto_increment NOT NULL COMMENT 'ID',
#     title varchar(30) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT 'title',
#     address varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT 'address',
#     content varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT 'details',
#     price varchar(30) NOT NULL COMMENT 'price',
#     image varchar(255) NOT NULL COMMENT 'image URL',
#     created_at timestamp DEFAULT CURRENT_TIMESTAMP NULL COMMENT 'created at',
#     CONSTRAINT `PRIMARY` PRIMARY KEY (id)
# )
# ENGINE=InnoDB
# DEFAULT CHARSET=utf8
# COLLATE=utf8_general_ci
# COMMENT='listings table';
# """

pool = PooledDB(pymysql, maxconnections=5, host='192.168.14.93', user='root',
                password='abc123', database='res_info', charset='utf8')


def res_info(page):
    """Scrape one listings page, save it to an Excel file, and insert the rows into MySQL."""
    time.sleep(2)
    if page == 1:
        url = 'https://cs.lianjia.com/ershoufang/'
    else:
        url = 'https://cs.lianjia.com/ershoufang/pg{}'.format(page)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                             '(KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.188'}
    response = requests.get(url, headers=headers, verify=False)
    etree_html = etree.HTML(response.text)
    # Listing titles
    xpath_title = etree_html.xpath("//div[@class='info clear']/div[@class='title']/a/text()")
    # Listing addresses: the nodes come in (community, area) pairs, so join each pair with '-'
    xpath_address = etree_html.xpath("//div[@class='info clear']/div[@class='flood']/div[@class='positionInfo']/a/text()")
    address_values = [str(xpath_address[i]).strip() + '-' + str(xpath_address[i + 1]).strip()
                      for i in range(0, len(xpath_address), 2)]
    # Listing details
    content = etree_html.xpath("//div[@class='info clear']/div[@class='address']/div[@class='houseInfo']/text()")
    content_values = [i.replace('|', ',').replace(' ', '') for i in content]
    # Total prices
    price = etree_html.xpath("//div[@class='info clear']/div[@class='priceInfo']/div[@class='totalPrice totalPrice2']/span/text()")
    price_values = [str(i) + '万' for i in price]
    # Listing images: they are lazy-loaded, so read data-original instead of src
    image = etree_html.xpath("//img[@class='lj-lazy']/@data-original")
    # Build a list of dicts, ready for insertion
    result = [{'title': t, 'address': a, 'content': c, 'price': p, 'image': m}
              for t, a, c, p, m in zip(xpath_title, address_values, content_values, price_values, image)]
    print('---------------- Scraping page {} ----------------'.format(page))
    # Create a new workbook, write a header row, then the data rows
    workbook = openpyxl.Workbook()
    worksheet = workbook.active
    worksheet.append(['Title', 'Address', 'Details', 'Price', 'Image'])
    for row in result:
        worksheet.append([row['title'], row['address'], row['content'], row['price'], row['image']])
    # Save the workbook as an Excel file
    workbook.save('/image_docker/static/listings_page_{}.xlsx'.format(page))
    # Take a connection from the pool and insert each row, letting the driver handle quoting
    conn = pool.connection()
    cursor = conn.cursor()
    sql = ('INSERT INTO `res_info` (`title`, `address`, `content`, `price`, `image`) '
           'VALUES (%s, %s, %s, %s, %s)')
    for row in result:
        try:
            cursor.execute(sql, (row['title'], row['address'], row['content'], row['price'], row['image']))
            conn.commit()
            print('Row inserted.')
        except Exception as e:
            # Typically a duplicate row that is already in the database; skip it
            # print('Insert failed, row already exists:', str(e))
            pass
    cursor.close()
    conn.close()
    print('---------------- Page {} done ----------------'.format(page))
    return result
def execute_res_info(start_page, end_page):
    for page in range(start_page, end_page + 1):
        res_info(page)


if __name__ == '__main__':
    while True:
        # Two threads split the work: pages 1-10 and pages 11-20
        thread1 = threading.Thread(target=execute_res_info, args=(1, 10))
        thread2 = threading.Thread(target=execute_res_info, args=(11, 20))
        # Start the threads
        thread1.start()
        thread2.start()
        # Wait for both to finish
        thread1.join()
        thread2.join()
        print('All pages done at', datetime.datetime.now())
        time.sleep(60)
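
For a multiprocess variant of the same crawl, res_info can be driven by a process pool instead of threads, since each call handles one page independently. A minimal sketch, assuming resuqsts_page.py keeps its startup loop behind the if __name__ == '__main__' guard shown above so that importing it has no side effects:

from multiprocessing import Pool

from resuqsts_page import res_info

if __name__ == '__main__':
    # Four worker processes, each pulling page numbers from the shared range 1-20
    with Pool(processes=4) as worker_pool:
        worker_pool.map(res_info, range(1, 21))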

Dockerfile used for deployment

# Python version; change as needed
FROM python:3.7
# Set the timezone to China Standard Time
ENV TZ=Asia/Shanghai
# Create the working directory /image_docker and add the program and its dependencies to the image
RUN mkdir /image_docker
COPY resuqsts_page.py /image_docker/
COPY requirements.txt /image_docker/
COPY static /image_docker/static/
# Set the working directory to /image_docker
WORKDIR /image_docker
# Install the Python dependencies
RUN pip install --no-cache-dir -r requirements.txt
# Run the crawler as the container's entry point
ENTRYPOINT ["python", "resuqsts_page.py"]

The packages used are listed in requirements.txt:

PyMySQL~=1.0.3
DBUtils~=3.0.3
requests
openpyxl
lxml

Deployment root directory and file layout
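
Judging from the COPY lines in the Dockerfile, the deployment root needs to contain the Dockerfile itself, the crawler script, the dependency list, and the static directory that receives the generated Excel files:

.
├── Dockerfile
├── resuqsts_page.py
├── requirements.txt
└── static/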

Commands to build and run the Docker container
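
The image has to be built before it can be run. Assuming the command is issued from the deployment root described above, where the Dockerfile lives:

docker build -t res_info .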


docker run --name res_info -d  --dns=8.8.8.8  res_info
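
Once the container is running and rows accumulate in MySQL, the data can be pulled back out for the visualization and analysis mentioned in the title. A minimal sketch, assuming pandas and matplotlib are installed on the analysis machine (they are not in requirements.txt) and reusing the crawler's connection settings:

import pandas as pd
import pymysql
import matplotlib.pyplot as plt

# Reuse the crawler's connection settings
conn = pymysql.connect(host='192.168.14.93', user='root', password='abc123',
                       database='res_info', charset='utf8')
df = pd.read_sql('SELECT title, address, price FROM res_info', conn)
conn.close()

# Prices are stored as strings such as '123.5万'; strip the unit and convert to float
df['price_wan'] = df['price'].str.rstrip('万').astype(float)

# Plot the distribution of total listing prices (unit: 10,000 yuan)
df['price_wan'].plot(kind='hist', bins=30, title='Listing price distribution')
plt.xlabel('Total price (10,000 yuan)')
plt.savefig('price_distribution.png')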

For reference only!
