1.items.py (item field definitions)
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class LianjiaItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pic = scrapy.Field()
    title = scrapy.Field()
    detail_url = scrapy.Field()
    price = scrapy.Field()
    publish_info = scrapy.Field()
    pic_list = scrapy.Field()
    house_code = scrapy.Field()
    ucid = scrapy.Field()
    agent_name = scrapy.Field()
    agent_phone = scrapy.Field()
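Since scrapy.Item behaves like a dict, the spider fills these fields with plain key assignments. A minimal usage sketch (the values below are made-up placeholders, not scraped data):

# usage sketch: LianjiaItem is populated like a dict (placeholder values)
from LianJia.items import LianjiaItem

item = LianjiaItem()
item["title"] = "example title"      # placeholder
item["price"] = "5000"               # placeholder
print(dict(item))                    # the pipelines below rely on this dict conversion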
2.lianjia.py (spider)
# -*- coding: utf-8 -*-
import scrapy
from LianJia.items import LianjiaItem
import re
import json


class LianjiaSpider(scrapy.Spider):
    name = 'lianjia'
    allowed_domains = ['lianjia.com']
    start_urls = ['https://www.lianjia.com/city/']

    def parse(self, response):
        # the city list page links to each city's new-home site
        city_url_list = response.xpath("//div[@class='city_province']//li/a/@href").extract()
        city_name_list = response.xpath("//div[@class='city_province']//li/a/text()").extract()
        for index in range(len(city_url_list)):
            city_name = city_name_list[index]
            city_url = city_url_list[index]
            # city subdomain prefix, e.g. "bj" from https://bj.lianjia.com/
            city_alp = re.findall(r"https://(\w*)\.", city_url)[0]
            # build the rental-listing url for this city
            city_url = "https://" + city_alp + ".lianjia.com/zufang/"
            # print("-------- start crawling {} --------".format(city_name))
            yield scrapy.Request(url=city_url, callback=self.get_area_url)

    def get_area_url(self, response):
        # district urls (skip the first entry, which is the "all" filter)
        area_url_list = response.xpath("//li[@data-type='district'][position()>1]/a/@href").extract()
        for area_url in area_url_list:
            area_url = re.findall(r"(.*)/zufang/", response.url)[0] + area_url
            yield scrapy.Request(url=area_url, callback=self.get_business_url)

    def get_business_url(self, response):
        # business-circle urls within the district (again skipping the "all" entry)
        business_url_list = response.xpath("//li[@data-type='bizcircle'][position()>1]/a/@href").extract()
        for business_url in business_url_list:
            business_url = re.findall(r"(.*)/zufang/", response.url)[0] + business_url
            yield scrapy.Request(url=business_url, callback=self.get_page_url)

    def get_page_url(self, response):
        # read the max page number; it is 0 when the business circle has no
        # listings, in which case the loop below simply does not run
        max_page = response.xpath("//div[@class='content__pg']/@data-totalpage").extract()
        max_page = int(max_page[0]) if max_page else 0
        for page in range(max_page):
            page_url = response.url + "pg{}/#contentList".format(page + 1)
            yield scrapy.Request(url=page_url, callback=self.get_page_data)

    def get_page_data(self, response):
        # narrow the scope to one listing card at a time
        fang_xml_list = response.xpath("//div[@class='content__list']/div")
        for fang_xml in fang_xml_list:
            # cover picture
            pic = fang_xml.xpath(".//img/@data-src").extract()
            pic = pic[0] if pic else ''
            # title
            title = fang_xml.xpath(".//p[@class='content__list--item--title twoline']/a/text()").extract()[0].strip()
            # detail-page url is relative, so join it against the current city's
            # host instead of hard-coding bj.lianjia.com
            detail_url = fang_xml.xpath(".//p[@class='content__list--item--title twoline']/a/@href").extract()[0]
            detail_url = response.urljoin(detail_url)
            # price
            price = fang_xml.xpath(".//em/text()").extract()[0]
            item = LianjiaItem()
            item["pic"] = pic
            item["title"] = title
            item["detail_url"] = detail_url
            item["price"] = price
            yield scrapy.Request(url=detail_url, callback=self.get_detail_data,
                                 meta={"data": item}, dont_filter=True)

    def get_detail_data(self, response):
        item = response.meta["data"]
        # publishing info (the <li> containing "发布")
        publish_info = response.xpath("//ul/li[contains(text(), '发布')]/text()").extract()
        publish_info = publish_info[0] if publish_info else ''
        # picture gallery
        pic_list = response.xpath("//ul[@class='content__article__slide__wrapper']/div/img/@data-src").extract()
        # house code, e.g. taken from /zufang/<code>.html in the url
        house_code = re.findall(r"/zufang/(.*?)\.html", response.url)[0]
        # broker (agent) id
        ucid = response.xpath("//span[@class='contact__im im__online']/@data-im_id").extract()
        ucid = ucid[0] if ucid else ''
        # build the broker API url on the same city host as the detail page
        agent_api = response.urljoin(
            "/zufang/aj/house/brokers?house_codes={}&position=bottom&ucid={}".format(house_code, ucid))
        item["publish_info"] = publish_info
        item["pic_list"] = pic_list
        item["house_code"] = house_code
        item["ucid"] = ucid
        yield scrapy.Request(url=agent_api, callback=self.get_agent_data,
                             meta={"data": item}, dont_filter=True)

    # broker (agent) info comes back as JSON from the brokers endpoint
    def get_agent_data(self, response):
        # parse the response body into a JSON object
        json_data = json.loads(response.body.decode("utf-8"))
        item = response.meta["data"]
        house_code = item.get("house_code")
        agent_info = json_data.get("data", {}).get(house_code, {}).get(house_code, {})
        # broker name and phone number
        agent_name = agent_info.get("contact_name")
        agent_phone = agent_info.get("tp_number")
        item["agent_name"] = agent_name
        item["agent_phone"] = agent_phone
        yield item
3.pipelines.py (item pipelines)
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
import time

import pymongo
import pymysql
import redis


# class LianjiaPipeline(object):
#     def __init__(self):
#         self.count = 1
#
#     def process_item(self, item, spider):
#         print(self.count, dict(item))
#         self.count += 1
#         return item


# write items into Redis
class RedisPipeline(object):
    def __init__(self):
        self.count = 1
        self.r = redis.Redis(host="localhost", port=6379, db=3)

    def process_item(self, item, spider):
        item_dict = dict(item)
        print(self.count, item_dict)
        item_str = json.dumps(item_dict)
        self.r.lpush("lianjia", item_str)
        self.count += 1
        return item


# write items into MongoDB
class MongodbPipeline(object):
    def __init__(self):
        mongo_client = pymongo.MongoClient("localhost", 27017)
        self.db = mongo_client.lianjia

    def process_item(self, item, spider):
        # insert_one replaces the deprecated Collection.insert
        self.db.lianjia.insert_one(dict(item))
        return item


# write items into MySQL
class MysqlPipeline(object):
    def __init__(self):
        # newer PyMySQL versions require keyword arguments here
        self.conn = pymysql.connect(host="localhost", user="root",
                                    password="123456", database="lianjia")
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        title = item["title"]
        price = item["price"]
        agent_name = item["agent_name"]
        agent_phone = item["agent_phone"]
        # unix timestamp, used later for data monitoring
        refresh_time = int(time.time())
        # parameterised query avoids quoting problems and SQL injection
        sql = ("insert into fang(title, price, agent_name, agent_phone, refresh_time) "
               "values(%s, %s, %s, %s, %s)")
        try:
            self.cursor.execute(sql, (title, price, agent_name, agent_phone, refresh_time))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def __del__(self):
        self.cursor.close()
        self.conn.close()
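The header comment above is a reminder that none of these pipelines run until they are registered in ITEM_PIPELINES. A minimal settings.py sketch, assuming the project package is named LianJia as in the spider's import; the priorities, delay and ROBOTSTXT_OBEY value are illustrative choices, not taken from the original project:

# LianJia/settings.py (sketch; values are illustrative assumptions)
BOT_NAME = "LianJia"
SPIDER_MODULES = ["LianJia.spiders"]
NEWSPIDER_MODULE = "LianJia.spiders"

ROBOTSTXT_OBEY = False   # assumption: crawl pages disallowed by robots.txt
DOWNLOAD_DELAY = 1       # throttle requests; tune as needed

ITEM_PIPELINES = {
    "LianJia.pipelines.RedisPipeline": 300,
    "LianJia.pipelines.MongodbPipeline": 400,
    "LianJia.pipelines.MysqlPipeline": 500,
}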
4.main.py (project launch script)
from scrapy import cmdline
cmdline.execute("scrapy crawl lianjia --nolog".split())
# cmdline.execute("scrapy crawl lianjia".split())
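cmdline.execute simply runs the command through Scrapy's CLI. An equivalent programmatic launcher, assuming the script is run from the project root so the project settings can be found, is CrawlerProcess:

# alternative launcher: run the spider in-process
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl("lianjia")   # spider name, as defined in LianjiaSpider.name
process.start()            # blocks until the crawl finishes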