
Scrapy Crawler + ECharts Data Analysis (Anjuke)
This post crawls property listings from Anjuke. Without further ado, straight to the code!

Part 1: The crawler

1. Runner script: run.py

from scrapy import cmdline

cmdline.execute('scrapy crawl anjuke_shanghai'.split())
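
If you'd rather launch the spider without shelling out to the CLI, Scrapy's CrawlerProcess API is an equivalent entry point. A minimal sketch; it assumes the script sits in the project root so that get_project_settings() can find scrapy.cfg:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Build a crawler process from the project's settings.py, then run by spider name
process = CrawlerProcess(get_project_settings())
process.crawl('anjuke_shanghai')
process.start()  # blocks until the crawl finishes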

2. Page parsing: anjuke_shanghai.py

import scrapy
import time
from anjuke.items import AnjukeItem


class AnjukeShanghaiSpider(scrapy.Spider):
    name = 'anjuke_shanghai'
    allowed_domains = ['anjuke.com']
    start_urls = ['https://shanghai.anjuke.com/sale/p11/#filtersort']

    next_page_id = 12  # the next listing page to request

    def parse(self, response):
        for ajk in response.xpath("//ul[@id='houselist-mod-new']/li"):
            time.sleep(5)  # crude throttling to avoid the site's anti-crawler
            item = AnjukeItem()

            # Listing title
            title = ajk.xpath(".//div[@class='house-title']/a/text()")[0].extract()
            time.sleep(1)
            item['title'] = title.strip()

            # Total price
            price = ajk.xpath(".//span[@class='price-det']/strong/text()")[0].extract()
            time.sleep(1)
            item['price'] = price

            # Unit price, with the 元/m² suffix stripped
            unit_price = ajk.xpath(".//span[@class='unit-price']/text()")[0].extract()
            time.sleep(1)
            if len(unit_price) > 0:
                item['unit_price'] = unit_price.replace("元/m²", "")
            else:
                item['unit_price'] = ""

            # Location: collapse the whitespace-padded address into one line
            site = ajk.xpath(".//span[@class='comm-address']/text()").extract()
            time.sleep(1)
            if len(site) > 0:
                item['site'] = " ".join(site[0].split())
            else:
                item['site'] = ""

            # Layout (e.g. "2室1厅")
            house_type = ajk.xpath(".//div[@class='details-item']/span[1]/text()").extract()
            time.sleep(1)
            item['house_type'] = house_type[0] if len(house_type) > 0 else ""

            # Floor area, with the m² suffix stripped
            area = ajk.xpath(".//div[@class='details-item']/span[2]/text()")[0].extract()
            time.sleep(1)
            item['area'] = area.replace("m²", "") if len(area) > 0 else ""

            # Detail-page link
            item['house_url'] = ajk.xpath(".//div[@class='house-title']/a/@href")[0].extract()
            time.sleep(1)
            yield item

        # Paginate: request the next listing page, up to p49
        url = "https://shanghai.anjuke.com/sale/p{}/#filtersort".format(self.next_page_id)
        if self.next_page_id < 50:
            time.sleep(5)
            yield scrapy.Request(url=url, dont_filter=True, callback=self.parse)
            self.next_page_id += 1



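The pagination above relies on mutable spider state (next_page_id) plus dont_filter=True. For comparison, here is a hypothetical variant, not the author's code, that carries the page number on the request itself via cb_kwargs (available in Scrapy 1.7+):

def parse(self, response, page=11):
    ...  # extract items exactly as above
    if page < 50:
        next_url = "https://shanghai.anjuke.com/sale/p{}/#filtersort".format(page + 1)
        yield scrapy.Request(next_url, callback=self.parse,
                             dont_filter=True, cb_kwargs={'page': page + 1})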
3. items.py

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy

class AnjukeItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()

    # Listing title
    title = scrapy.Field()
    # Total price
    price = scrapy.Field()
    # Unit price
    unit_price = scrapy.Field()
    # Location
    site = scrapy.Field()
    # Layout type
    house_type = scrapy.Field()
    # Floor area
    area = scrapy.Field()
    # Detail-page URL
    house_url = scrapy.Field()






4. middlewares.py

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter

class AnjukeSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

class AnjukeDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

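Both middleware classes above are untouched Scrapy boilerplate and change nothing. As an illustration of what process_request() is typically used for, here is a hypothetical user-agent-rotating downloader middleware, not part of the original project (it would also need an entry in DOWNLOADER_MIDDLEWARES to take effect):

import random

class RandomUserAgentMiddleware:
    # Hypothetical example: pick a User-Agent at random for each outgoing request
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
    ]

    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(self.USER_AGENTS)
        return None  # continue processing the request normally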

5. pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface

from itemadapter import ItemAdapter
import pymysql

class AnjukePipeline:
    def __init__(self):
        # Connect to the local MySQL database that stores the listings
        self.connect = pymysql.connect(host="localhost", user="root", passwd="1234", db="anjuke")
        self.cursor = self.connect.cursor()
        print("Database connected")

    def process_item(self, item, spider):
        print("Saving item")
        insql = ("insert into anjuke_shanghai"
                 "(title,price,unit_price,site,house_type,area,house_url) "
                 "values (%s,%s,%s,%s,%s,%s,%s)")
        self.cursor.execute(insql, (
            item['title'], item['price'], item['unit_price'], item['site'],
            item['house_type'], item['area'], item['house_url']))
        self.connect.commit()
        print("Item saved")
        return item

    def close_spider(self, spider):
        # Scrapy calls this hook automatically when the spider closes
        # (the original defined parse_close(), which Scrapy never invokes)
        self.cursor.close()
        self.connect.close()

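The pipeline assumes an anjuke database with an anjuke_shanghai table already in place; the original post never shows the table definition, so the column types below are assumptions. A one-off setup sketch:

import pymysql

# Hypothetical DDL matching the pipeline's INSERT; adjust types/lengths as needed
conn = pymysql.connect(host="localhost", user="root", passwd="1234")
with conn.cursor() as cur:
    cur.execute("CREATE DATABASE IF NOT EXISTS anjuke DEFAULT CHARSET utf8mb4")
    cur.execute("""
        CREATE TABLE IF NOT EXISTS anjuke.anjuke_shanghai (
            id INT AUTO_INCREMENT PRIMARY KEY,
            title VARCHAR(255),
            price VARCHAR(32),
            unit_price VARCHAR(32),
            site VARCHAR(255),
            house_type VARCHAR(64),
            area VARCHAR(32),
            house_url VARCHAR(512)
        ) DEFAULT CHARSET utf8mb4
    """)
conn.commit()
conn.close()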

6. settings.py

# Scrapy settings for anjuke project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'anjuke'

SPIDER_MODULES = ['anjuke.spiders']
NEWSPIDER_MODULE = 'anjuke.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'anjuke (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = True

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    'Cookie': 'aQQ_ajkguid=8E3DD02F-E811-A2DA-DA53-C1B88CD60608; id58=e87rkF/lNzIYHcjBD+SdAg==; _ga=GA1.2.93190540.1608857396; _gid=GA1.2.334371282.1608857396; 58tj_uuid=6fc5ade0-bfd0-4187-bd4e-9686d7082817; new_uv=1; als=0; sessid=B70FA124-E42F-8DAD-3813-6C91C72B7A20; ctid=11; twe=2; obtain_by=2; ajk_member_verify=QUbPDLTnm9FWHSOd33buoCZE2z1wm%2FVudTO6LdSsWYs%3D; ajk_member_verify2=MTYwMDA4MTUwfFUxNTU3Mjk4NzEwNDM3NXwx; xxzl_cid=7380c6b8f44840bea607d5323fb011f4; xzuid=a8fd56b1-e885-46cd-b255-5dcd8fa79dc4; ajkAuthTicket=TT=f841c95d589fd9118d083c3ba68b97a3&TS=1608895520230&PBODY=VcG9Y6AtpZbA4ERSDzm8x-gaGSpJliB6sqdOLZ5r43ZgbMtoUuIQ3_UEzjH93WSEcM1W26Q_96d7T9tcmKpasHOQN42asUK9WLXeGZ4ssbi9u2MLY5aKXbsVALuXFkdG1gu6vlvjxUMNOn_EEGoo7fk8RHanQCv-vKtjgHmzDBk&VER=2'
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'anjuke.middlewares.AnjukeSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'anjuke.middlewares.AnjukeDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'anjuke.pipelines.AnjukePipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

Part 2: The data

1. Database contents:

(The original post shows a screenshot of the populated anjuke_shanghai table here.)

2. Analysis screenshots made with pyecharts:

(The original post includes a short pyecharts snippet and chart screenshots here; both were lost in extraction.)
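
As a stand-in for the lost snippet, here is a minimal pyecharts sketch of the kind of chart the screenshots show; the query, the fields plotted, and the bar-chart choice are all assumptions:

import pymysql
from pyecharts import options as opts
from pyecharts.charts import Bar

# Pull a few rows from the table the pipeline filled
connect = pymysql.connect(host="localhost", user="root", passwd="1234", db="anjuke")
cursor = connect.cursor()
cursor.execute("select site, unit_price from anjuke_shanghai limit 10")
rows = cursor.fetchall()
connect.close()

# Render an interactive ECharts bar chart to a standalone HTML file
bar = Bar()
bar.add_xaxis([row[0] for row in rows])
bar.add_yaxis("unit price (yuan/m²)", [row[1] for row in rows])
bar.set_global_opts(title_opts=opts.TitleOpts(title="Anjuke Shanghai unit prices"))
bar.render("anjuke_analysis.html")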
