This crawl targets real-estate listings on Anjuke. Without further ado, straight to the code!
Part 1: The crawler
1. Runner script: run.py
from scrapy import cmdline
cmdline.execute('scrapy crawl anjuke_shanghai'.split())
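This script simply shells out to the Scrapy CLI, so running scrapy crawl anjuke_shanghai from the project root gives the same result. If you prefer to start the crawl programmatically, a minimal sketch using Scrapy's CrawlerProcess (which loads the same project settings) looks like this:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load settings.py from the current Scrapy project
process = CrawlerProcess(get_project_settings())
process.crawl('anjuke_shanghai')  # spider name as declared in anjuke_shanghai.py
process.start()  # blocks until the crawl finishes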
2. Page parsing: anjuke_shanghai.py
import scrapy
import time
from anjuke.items import AnjukeItem
class AnjukeShanghaiSpider(scrapy.Spider):
    name = 'anjuke_shanghai'
    allowed_domains = ['anjuke.com']
    start_urls = ['https://shanghai.anjuke.com/sale/p11/#filtersort']
    next_page_id = 12

    def parse(self, response):
        for ajk in response.xpath("//ul[@id='houselist-mod-new']/li"):
            time.sleep(5)
            item = AnjukeItem()
            # Title
            title = ajk.xpath(".//div[@class='house-title']/a/text()")[0].extract()
            time.sleep(1)
            item['title'] = title.strip()
            # Total price
            price = ajk.xpath(".//span[@class='price-det']/strong/text()")[0].extract()
            time.sleep(1)
            item['price'] = price
            # Unit price, stripped of its "元/m²" suffix
            unit_price = ajk.xpath(".//span[@class='unit-price']/text()")[0].extract()
            time.sleep(1)
            if len(unit_price) > 0:
                item['unit_price'] = unit_price.replace("元/m²", "")
            else:
                item['unit_price'] = ""
            # Address: collapse runs of whitespace into single spaces
            site = ajk.xpath(".//span[@class='comm-address']/text()").extract()
            time.sleep(1)
            if len(site) > 0:
                item['site'] = " ".join(site[0].split())
            else:
                item['site'] = ""
            # Layout (house type)
            house_type = ajk.xpath(".//div[@class='details-item']/span[1]/text()").extract()
            time.sleep(1)
            if len(house_type) > 0:
                item['house_type'] = house_type[0]
            else:
                item['house_type'] = ""
            # Floor area, stripped of its "m²" suffix
            area = ajk.xpath(".//div[@class='details-item']/span[2]/text()")[0].extract()
            time.sleep(1)
            if len(area) > 0:
                item['area'] = area.replace("m²", "")
            else:
                item['area'] = ""
            # Detail-page link
            item['house_url'] = ajk.xpath(".//div[@class='house-title']/a/@href")[0].extract()
            time.sleep(1)
            yield item
        # Pagination: request pages p12 through p49 of the same listing
        url = "https://shanghai.anjuke.com/sale/p{}/#filtersort".format(self.next_page_id)
        if self.next_page_id < 50:
            time.sleep(5)
            yield scrapy.Request(url=url, dont_filter=True, callback=self.parse)
            self.next_page_id += 1

3. items.py
import scrapy
class AnjukeItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Title
    title = scrapy.Field()
    # Total price
    price = scrapy.Field()
    # Unit price
    unit_price = scrapy.Field()
    # Location
    site = scrapy.Field()
    # House type
    house_type = scrapy.Field()
    # Floor area
    area = scrapy.Field()
    # Detail-page link
    house_url = scrapy.Field()

4. middlewares.py
from scrapy import signals
from itemadapter import is_item, ItemAdapter
class AnjukeSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
class AnjukeDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

5. pipelines.py
from itemadapter import ItemAdapter
import pymysql
class AnjukePipeline:
    def __init__(self):
        self.connect = pymysql.connect(host="localhost", user="root", passwd="1234", db="anjuke")
        self.cursor = self.connect.cursor()
        print("Database connection established")

    def process_item(self, item, spider):
        print("Saving item")
        insql = ("insert into anjuke_shanghai"
                 "(title,price,unit_price,site,house_type,area,house_url) "
                 "values (%s,%s,%s,%s,%s,%s,%s)")
        self.cursor.execute(insql, (
            item['title'], item['price'], item['unit_price'], item['site'],
            item['house_type'], item['area'], item['house_url']))
        self.connect.commit()
        print("Item saved")
        return item

    def close_spider(self, spider):
        # Scrapy calls close_spider() when the spider finishes;
        # release the cursor before the connection.
        self.cursor.close()
        self.connect.close()

6. settings.py
BOT_NAME = 'anjuke'
SPIDER_MODULES = ['anjuke.spiders']
NEWSPIDER_MODULE = 'anjuke.spiders'
ROBOTSTXT_OBEY = False
COOKIES_ENABLED = True
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    'Cookie': 'aQQ_ajkguid=8E3DD02F-E811-A2DA-DA53-C1B88CD60608; id58=e87rkF/lNzIYHcjBD+SdAg==; _ga=GA1.2.93190540.1608857396; _gid=GA1.2.334371282.1608857396; 58tj_uuid=6fc5ade0-bfd0-4187-bd4e-9686d7082817; new_uv=1; als=0; sessid=B70FA124-E42F-8DAD-3813-6C91C72B7A20; ctid=11; twe=2; obtain_by=2; ajk_member_verify=QUbPDLTnm9FWHSOd33buoCZE2z1wm%2FVudTO6LdSsWYs%3D; ajk_member_verify2=MTYwMDA4MTUwfFUxNTU3Mjk4NzEwNDM3NXwx; xxzl_cid=7380c6b8f44840bea607d5323fb011f4; xzuid=a8fd56b1-e885-46cd-b255-5dcd8fa79dc4; ajkAuthTicket=TT=f841c95d589fd9118d083c3ba68b97a3&TS=1608895520230&PBODY=VcG9Y6AtpZbA4ERSDzm8x-gaGSpJliB6sqdOLZ5r43ZgbMtoUuIQ3_UEzjH93WSEcM1W26Q_96d7T9tcmKpasHOQN42asUK9WLXeGZ4ssbi9u2MLY5aKXbsVALuXFkdG1gu6vlvjxUMNOn_EEGoo7fk8RHanQCv-vKtjgHmzDBk&VER=2'
}
ITEM_PIPELINES = {
    'anjuke.pipelines.AnjukePipeline': 300,
}
# AutoThrottle only takes effect when the extension is enabled
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 5
AUTOTHROTTLE_MAX_DELAY = 60
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
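One caveat on the spider above: the time.sleep calls block Scrapy's event loop. If you drop them, the idiomatic replacement (a sketch, assuming the same settings.py) is to let the downloader pace the requests:

# settings.py - let Scrapy pace requests instead of blocking in parse()
DOWNLOAD_DELAY = 5               # seconds between requests to the same site
RANDOMIZE_DOWNLOAD_DELAY = True  # jitter the delay (0.5x-1.5x) between requests

Also note that with COOKIES_ENABLED = True, Scrapy's cookie middleware manages cookies itself and may override the hardcoded Cookie header above; setting COOKIES_ENABLED = False forces that header to be sent as-is.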
Part 2: The data
1. Database contents:
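For reference, a minimal schema sketch that matches the INSERT statement in pipelines.py (the column names come from the pipeline; the types and sizes are assumptions):

import pymysql

conn = pymysql.connect(host="localhost", user="root", passwd="1234", db="anjuke")
with conn.cursor() as cursor:
    # Create the target table if it does not exist yet
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS anjuke_shanghai (
            id INT AUTO_INCREMENT PRIMARY KEY,
            title VARCHAR(255),
            price VARCHAR(32),
            unit_price VARCHAR(32),
            site VARCHAR(255),
            house_type VARCHAR(64),
            area VARCHAR(32),
            house_url VARCHAR(512)
        ) CHARACTER SET utf8mb4
    """)
conn.commit()
conn.close()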
2. Analysis screenshots made with pyecharts:
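As an illustration of the kind of pyecharts code behind such charts, a small hypothetical sketch that plots average unit price per district (the district names and prices are placeholders, not scraped results):

from pyecharts import options as opts
from pyecharts.charts import Bar

# Placeholder data - in practice these would be aggregated from the anjuke_shanghai table
districts = ["Pudong", "Xuhui", "Jing'an", "Minhang"]
avg_unit_price = [65000, 78000, 90000, 52000]

bar = (
    Bar()
    .add_xaxis(districts)
    .add_yaxis("Average unit price (yuan/m²)", avg_unit_price)
    .set_global_opts(title_opts=opts.TitleOpts(title="Anjuke Shanghai: unit price by district"))
)
bar.render("unit_price_by_district.html")  # writes an interactive HTML chart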