Below is the crawler script resuqsts_page.py. The container paths used by the script have already been adjusted for a Linux filesystem:
import datetime
import threading
import time

import requests
import openpyxl
import pymysql

from lxml import etree
from dbutils.pooled_db import PooledDB

import urllib3
# requests is called with verify=False below, so silence the TLS warning
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# create_table = """
#
# CREATE TABLE res_info.res_info (
#     id int(11) auto_increment NOT NULL COMMENT 'ID',
#     title varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '标题',
#     address varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '地址',
#     content varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '详情',
#     price varchar(30) NOT NULL COMMENT '价格',
#     image varchar(255) NOT NULL COMMENT '图片',
#     created_at timestamp DEFAULT CURRENT_TIMESTAMP NULL COMMENT '创建时间',
#     CONSTRAINT `PRIMARY` PRIMARY KEY (id)
# )
# ENGINE=InnoDB
# DEFAULT CHARSET=utf8
# COLLATE=utf8_general_ci
# COMMENT='数据表';
#
# """

# connection pool shared by both worker threads
pool = PooledDB(pymysql, maxconnections=5, host='192.168.14.93', user='root', password='abc123', database='res_info',
                charset='utf8')
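
The INSERT statements later in the script assume the res_info table already exists. As a minimal sketch (not part of the original script), the commented-out create_table DDL above could be executed once at startup, assuming it is uncommented and written as CREATE TABLE IF NOT EXISTS so that restarts are harmless:

def ensure_table():
    # run the DDL once at startup; with CREATE TABLE IF NOT EXISTS this is
    # a no-op when the table is already present
    conn = pool.connection()
    try:
        cursor = conn.cursor()
        cursor.execute(create_table)
        conn.commit()
    finally:
        conn.close()  # return the connection to the pool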


def res_info(page):

    # pause briefly between requests to avoid hammering the site
    time.sleep(2)

    if page == 1:
        url = 'https://cs.lianjia.com/ershoufang/'
    else:
        url = 'https://cs.lianjia.com/ershoufang/pg{}'.format(page)

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.188'}

    response = requests.get(url, headers=headers, verify=False)
    res = response.text

    # parse the page once and reuse the tree for every XPath query
    etree_html = etree.HTML(res)

    # listing titles
    xpath_title = etree_html.xpath(r"//div[@class='info clear']/div[@class='title']/a/text()")

    # listing addresses: positionInfo yields community and area names in pairs
    xpath_address = etree_html.xpath(r"//div[@class='info clear']/div[@class='flood']/div[@class='positionInfo']/a/text()")
    address_values = [str(xpath_address[i]).strip() + '-' + str(xpath_address[i + 1]) for i in range(0, len(xpath_address), 2)]

    # listing details
    content = etree_html.xpath("//div[@class='info clear']/div[@class='address']/div[@class='houseInfo']/text()")
    content_values = [i.replace('|', ',').replace(" ", "") for i in content]

    # total listing prices
    price = etree_html.xpath("//div[@class='info clear']/div[@class='priceInfo']/div[@class='totalPrice totalPrice2']/span/text()")
    price_values = [str(i) + '万' for i in price]

    # listing images (lazy-loaded, so read data-original rather than src)
    image = etree_html.xpath("//img[@class='lj-lazy']/@data-original")

    # build a list of dicts ready for insertion into the database
    result = [{'title': i[0], 'address': i[1], 'content': i[2], 'price': i[3], 'image': i[4]} for i in
              zip(xpath_title, address_values, content_values, price_values, image)]

    print('---------------- scraping page {} ----------------'.format(page))
    # create a new workbook
    workbook = openpyxl.Workbook()

    # use the default active worksheet
    worksheet = workbook.active

    # header row
    worksheet.append(['房源标题', '房源地址', '房源详情', '房源价格', '房源图片'])

    # write the scraped rows
    for i in result:
        worksheet.append([i['title'], i['address'], i['content'], i['price'], i['image']])

    # save the workbook as an Excel file
    workbook.save(r'/image_docker/static/房源第{}页数据.xlsx'.format(page))

    # borrow a connection from the pool
    conn = pool.connection()

    try:
        cursor = conn.cursor()
        for i in result:
            # parameterized query: pymysql escapes the values itself, so quotes
            # in titles or details cannot break (or inject into) the statement
            sql = 'INSERT INTO `res_info` (`title`, `address`, `content`, `price`, `image`) VALUES (%s, %s, %s, %s, %s)'
            try:
                cursor.execute(sql, (i['title'], i['address'], i['content'], i['price'], i['image']))
                conn.commit()
                print('row inserted')
            except Exception as e:
                # typically a duplicate-key error once a unique key exists
                print('insert skipped:', str(e))
    finally:
        conn.close()  # return the connection to the pool

    print('---------------- page {} done ----------------'.format(page))
    return result


def execute_res_info(start_page, end_page):
    # scrape an inclusive range of pages sequentially
    for page in range(start_page, end_page + 1):
        res_info(page)


while 1:

    # split the 20 pages between two threads: pages 1-10 and pages 11-20
    thread1 = threading.Thread(target=execute_res_info, args=(1, 10))
    thread2 = threading.Thread(target=execute_res_info, args=(11, 20))

    # start both threads
    thread1.start()
    thread2.start()

    # wait for both threads to finish
    thread1.join()
    thread2.join()

    print('full sweep finished at', datetime.datetime.now())

    # pause before starting the next sweep
    time.sleep(60)
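
The insert loop above treats a failed INSERT as a duplicate row, but the schema shown earlier defines no unique key, so in practice nothing stops repeated sweeps from re-inserting the same listings. A one-off statement to enforce uniqueness on title might look like the sketch below (uk_title is a hypothetical index name; MySQL raises an error if the key already exists):

# one-off: add a unique key so repeated sweeps cannot duplicate listings
conn = pool.connection()
try:
    cursor = conn.cursor()
    cursor.execute('ALTER TABLE `res_info` ADD UNIQUE KEY `uk_title` (`title`)')
    conn.commit()
finally:
    conn.close()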
The Dockerfile used for deployment:
# Python base image; adjust the version as needed
FROM python:3.7

# set the timezone to China Standard Time
ENV TZ=Asia/Shanghai

# create the working directory /image_docker and copy the script and its dependencies into the image
RUN mkdir /image_docker
COPY resuqsts_page.py /image_docker/
COPY requirements.txt /image_docker/
COPY static /image_docker/static/

# set the working directory to /image_docker
WORKDIR /image_docker

# install the Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# ENTRYPOINT makes resuqsts_page.py the container's entry-point program
ENTRYPOINT ["python", "resuqsts_page.py"]
The packages used, listed in requirements.txt:
PyMySQL~=1.0.3
DBUtils~=3.0.3
requests
openpyxl
lxml
Deployment root directory and file layout:
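
Reconstructed from the COPY lines in the Dockerfile above, the build context presumably looks like this:

.
├── Dockerfile
├── resuqsts_page.py
├── requirements.txt
└── static/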
Docker container run command:
docker run --name res_info -d --dns=8.8.8.8 res_info
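
Note that the res_info image referenced above has to be built before the container can be run; assuming the Dockerfile sits in the deployment root shown earlier:

docker build -t res_info .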
For reference only!