This project implements a distributed crawler for fang.com (房天下) on top of the Scrapy-Redis framework. My own machine (Windows 10) acts as the Redis server, while a WSL virtual machine and a Mac act as the crawler nodes.
Dependencies are listed in requirements.txt (exported with pip freeze > requirements.txt). Install them on every node:

pip install -r requirements.txt
There are plenty of fang.com crawlers online, but in practice they tend to break. For example, fang.com's city listing pages now mix ad entries into the results; these are not data we want, and if they are not handled, the spider raises an exception and stops. This project therefore patches the code for the problems encountered at runtime.
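As a minimal sketch of the fix (the HTML fragment below is hand-written for illustration, not a real fang.com page): an ad row carries no nlcd_name div, so extracting its name yields None and the row can be skipped instead of crashing the spider.

from scrapy.selector import Selector

# Hand-written fragment: the second <li> mimics an inserted ad row (no nlcd_name div)
html = '''
<ul>
  <li><div class="nlcd_name"><a> 某楼盘 </a></div></li>
  <li><div class="ad_banner">advertisement</div></li>
</ul>
'''

for li in Selector(text=html).xpath('//ul/li'):
    name_text = li.xpath('.//div[@class="nlcd_name"]/a/text()').get()
    if name_text:                 # ad rows yield None here and are skipped
        print(name_text.strip())  # -> 某楼盘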
On each crawler node, point the project at the Redis server in settings.py:

REDIS_URL = 'redis://:password@ip:port'
Then start the spider on each node with

scrapy runspider <spider_file>

rather than scrapy crawl <spider_name>. The spider idles until a start URL is pushed into Redis:

redis-cli> lpush [redis_key] start_url

(in this project, redis_key is fang:start_urls, as defined in the spider), and the crawl begins.
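Equivalently, the queue can be seeded from Python. A small convenience sketch, assuming the redis-py package and the same connection details as settings.py:

import redis

# Connection details mirror the REDIS_URL above (assumed values)
r = redis.Redis(host='127.0.0.1', port=6379, password='password')

# The key must match the spider's redis_key; the URL is the crawl entry point
r.lpush('fang:start_urls', 'https://www.fang.com/SoufunFamily.htm')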
NOTE: when the console shows only

Unhandled error in Deferred

open the .log file (scrapy.log, per the LOG_FILE setting) to find the actual error.
The full project is available on my GitHub; corrections are welcome. The main pieces of code are shown below.
# -*- coding: utf-8 -*-
import re

import scrapy
from scrapy_redis.spiders import RedisSpider

from fangtianxia_scrapy.items import NewHouseItem, EsfHouseItem


class FangSpider(RedisSpider):
    name = 'fang'
    allowed_domains = ['fang.com']
    # start_urls = ['https://www.fang.com/SoufunFamily.htm']
    redis_key = 'fang:start_urls'

    def parse(self, response):
        trs = response.xpath('//div[@class="outCont"]//tr')
        province = None
        for tr in trs:
            tds = tr.xpath('.//td[not(@class)]')
            province_td = tds[0]
            province_text = province_td.xpath('.//text()').get()
            province_text = re.sub(r'\s', '', province_text)
            if province_text:
                province = province_text
            # Skip overseas listings
            if province == '其它':
                continue
            city_td = tds[1]
            city_links = city_td.xpath('.//a')
            for city_link in city_links:
                city = city_link.xpath('.//text()').get()
                # Taiwan URLs differ from the rest, so skip them
                if '台湾' in city:
                    continue
                # The Beijing second-hand page redirects to a local page; unresolved
                if '北京' in city:
                    continue
                city_url = city_link.xpath('.//@href').get()
                city_text = re.findall('.*//(.*).*.fang', city_url)[0]
                # Build the new-house URL
                newhouse_url = 'https://' + city_text + '.newhouse.fang.com/house/s/'
                # Build the second-hand-house URL
                esf_url = 'https://' + city_text + '.esf.fang.com'
                yield scrapy.Request(url=newhouse_url,
                                     callback=self.parse_newhouse,
                                     meta={'info': (province, city)})
                yield scrapy.Request(url=esf_url,
                                     callback=self.parse_esf,
                                     meta={'info': (province, city)})

    def parse_newhouse(self, response):
        # New houses
        province, city = response.meta.get('info')
        lis = response.xpath('//div[contains(@class,"nl_con clearfix")]/ul/li')
        for li in lis:
            name_text = li.xpath('.//div[@class="nlcd_name"]/a/text()').get()
            # Ad rows are inserted into the list and have no name (None here),
            # so guard before calling .strip() and skip them
            if name_text:
                name = name_text.strip()
                house_type_list = li.xpath(
                    './/div[contains(@class, "house_type")]/a/text()').getall()
                house_type_list = list(map(lambda x: re.sub(r'\s', '', x), house_type_list))
                house_type = ','.join(filter(lambda x: x.endswith('居'), house_type_list))
                area_text = ''.join(
                    li.xpath('.//div[contains(@class, "house_type")]/text()').getall())
                area = re.sub(r'\s|-|/', '', area_text)
                address = li.xpath('.//div[@class="address"]/a/@title').get()
                district_text = ''.join(
                    li.xpath('.//div[@class="address"]/a//text()').getall())
                try:
                    district = re.search(r'.*\[(.+)\].*', district_text).group(1)
                except AttributeError:
                    district = 'None'
                sale = li.xpath('.//div[contains(@class, "fangyuan")]/span/text()').get()
                price = ''.join(li.xpath(".//div[@class='nhouse_price']//text()").getall())
                price = re.sub(r'\s|广告', '', price)
                detail_url_text = li.xpath('.//div[@class="nlc_img"]/a/@href').get()
                detail_url = response.urljoin(detail_url_text)
                item = NewHouseItem(province=province, city=city, name=name,
                                    house_type=house_type, area=area, address=address,
                                    district=district, sale=sale, price=price,
                                    detail_url=detail_url)
                yield item
        next_url = response.xpath('//div[@class="page"]//a[@class="next"]/@href').get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_newhouse,
                                 meta={'info': (province, city)})

    def parse_esf(self, response):
        # Second-hand houses
        province, city = response.meta.get('info')
        dls = response.xpath('//div[@class="shop_list shop_list_4"]/dl')
        for dl in dls:
            item = EsfHouseItem(province=province, city=city)
            name = dl.xpath('.//span[@class="tit_shop"]/text()').get()
            # Ad rows are inserted into the list and have no name; skip them
            if name:
                infos = dl.xpath('.//p[@class="tel_shop"]/text()').getall()
                infos = list(map(lambda x: re.sub(r'\s', '', x), infos))
                for info in infos:
                    if '厅' in info:
                        item['house_type'] = info
                    elif '㎡' in info:
                        item['area'] = info
                    elif '层' in info:
                        item['floor'] = info
                    elif '向' in info:
                        item['orientation'] = info
                    elif '年建' in info:
                        item['year'] = re.sub('年建', '', info)
                item['address'] = dl.xpath('.//p[@class="add_shop"]/span/text()').get()
                item['total_price'] = ''.join(dl.xpath(".//span[@class='red']//text()").getall())
                item['unit_price'] = dl.xpath(".//dd[@class='price_right']/span[2]/text()").get()
                item['detail_url'] = response.urljoin(
                    dl.xpath(".//h4[@class='clearfix']/a/@href").get())
                item['name'] = name
                # These five fields are often missing, which would break the MySQL
                # insert, so fill each absent one with a placeholder (independent
                # ifs, not elif: several fields can be missing at once)
                if 'house_type' not in item:
                    item['house_type'] = '/'
                if 'area' not in item:
                    item['area'] = '/'
                if 'floor' not in item:
                    item['floor'] = '/'
                if 'orientation' not in item:
                    item['orientation'] = '/'
                if 'year' not in item:
                    item['year'] = '/'
                yield item
        next_url = response.xpath('//div[@class="page_al"]/p/a/@href').get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_esf,
                                 meta={'info': (province, city)})
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class NewHouseItem(scrapy.Item):
    province = scrapy.Field()
    city = scrapy.Field()
    name = scrapy.Field()
    house_type = scrapy.Field()
    area = scrapy.Field()
    address = scrapy.Field()
    district = scrapy.Field()
    sale = scrapy.Field()
    price = scrapy.Field()
    detail_url = scrapy.Field()


class EsfHouseItem(scrapy.Item):
    province = scrapy.Field()
    city = scrapy.Field()
    name = scrapy.Field()
    house_type = scrapy.Field()
    area = scrapy.Field()
    floor = scrapy.Field()
    orientation = scrapy.Field()
    year = scrapy.Field()
    address = scrapy.Field()
    total_price = scrapy.Field()
    unit_price = scrapy.Field()
    detail_url = scrapy.Field()
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
import random


class RotateUserAgentMiddleware(UserAgentMiddleware):
    # For more user agent strings, see http://www.useragentstring.com/pages/useragentstring.php
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]

    def process_request(self, request, spider):
        ua = random.choice(self.user_agent_list)
        if ua:
            # Log the user agent chosen for this request
            spider.logger.info('Current UserAgent: ' + ua)
            request.headers['User-Agent'] = ua
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy.exporters import JsonLinesItemExporter
from fangtianxia_scrapy.items import NewHouseItem, EsfHouseItem
from twisted.enterprise import adbapi
import pymysql


class FangTianXiaScrapyPipeline(object):
    def __init__(self):
        self.newhouse_fp = open('newhouse.json', 'ab')
        self.esfhouse_fp = open('esfhouse.json', 'ab')
        self.newhouse_exporter = JsonLinesItemExporter(self.newhouse_fp, ensure_ascii=False)
        self.esfhouse_exporter = JsonLinesItemExporter(self.esfhouse_fp, ensure_ascii=False)

    def process_item(self, item, spider):
        if isinstance(item, NewHouseItem):
            self.newhouse_exporter.export_item(item)
        elif isinstance(item, EsfHouseItem):
            self.esfhouse_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.newhouse_fp.close()
        self.esfhouse_fp.close()


class MysqlTwistedPipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        db_params = dict(
            host=settings['MYSQL_HOST'],
            database=settings['MYSQL_DATABASE'],  # settings keys are case-sensitive
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWORD'],
            port=settings['MYSQL_PORT'],
            charset='utf8mb4',
            use_unicode=True,
            cursorclass=pymysql.cursors.DictCursor
        )
        dbpool = adbapi.ConnectionPool('pymysql', **db_params)
        return cls(dbpool)

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)
        return item

    def handle_error(self, failure, item, spider):
        print(failure)

    def do_insert(self, cursor, item):
        if isinstance(item, NewHouseItem):
            insert_sql = """insert into fangtianxia.newhouse(province, city, name,
                house_type, area, address, district, sale, price, detail_url)
                values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);"""
            cursor.execute(insert_sql, (
                item['province'], item['city'], item['name'], item['house_type'],
                item['area'], item['address'], item['district'], item['sale'],
                item['price'], item['detail_url']))
        elif isinstance(item, EsfHouseItem):
            insert_sql = """insert into fangtianxia.esfhouse(province, city, name,
                house_type, area, floor, orientation, year, address, total_price,
                unit_price, detail_url)
                values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);"""
            cursor.execute(insert_sql, (
                item['province'], item['city'], item['name'], item['house_type'],
                item['area'], item['floor'], item['orientation'], item['year'],
                item['address'], item['total_price'], item['unit_price'],
                item['detail_url']))
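The MySQL pipeline assumes a fangtianxia database with newhouse and esfhouse tables already created. The project's actual DDL is not shown here, so the sketch below is an assumption whose column list simply mirrors the INSERT statements above (VARCHAR sizes are guesses):

import pymysql

# Assumed schema: columns match MysqlTwistedPipeline's INSERT statements;
# types and sizes are guesses, not the project's real DDL
conn = pymysql.connect(host='localhost', user='root', password='password',
                       port=3306, charset='utf8mb4')
with conn.cursor() as cur:
    cur.execute("CREATE DATABASE IF NOT EXISTS fangtianxia "
                "DEFAULT CHARACTER SET utf8mb4")
    cur.execute("""
        CREATE TABLE IF NOT EXISTS fangtianxia.newhouse (
            id INT AUTO_INCREMENT PRIMARY KEY,
            province VARCHAR(64), city VARCHAR(64), name VARCHAR(255),
            house_type VARCHAR(255), area VARCHAR(255), address VARCHAR(255),
            district VARCHAR(64), sale VARCHAR(64), price VARCHAR(64),
            detail_url VARCHAR(512))""")
    cur.execute("""
        CREATE TABLE IF NOT EXISTS fangtianxia.esfhouse (
            id INT AUTO_INCREMENT PRIMARY KEY,
            province VARCHAR(64), city VARCHAR(64), name VARCHAR(255),
            house_type VARCHAR(64), area VARCHAR(64), floor VARCHAR(64),
            orientation VARCHAR(64), year VARCHAR(64), address VARCHAR(255),
            total_price VARCHAR(64), unit_price VARCHAR(64),
            detail_url VARCHAR(512))""")
conn.commit()
conn.close()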
# -*- coding: utf-8 -*-

# Scrapy settings for fangtianxia_scrapy project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'fangtianxia_scrapy'

SPIDER_MODULES = ['fangtianxia_scrapy.spiders']
NEWSPIDER_MODULE = 'fangtianxia_scrapy.spiders'

MYSQL_USER = 'root'
MYSQL_PASSWORD = 'password'
MYSQL_DATABASE = 'fangtianxia'
MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'fangtianxia_scrapy (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

LOG_FILE = 'scrapy.log'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'fangtianxia_scrapy.middlewares.FangtianxiaScrapySpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # 'fangtianxia_scrapy.middlewares.FangtianxiaScrapyDownloaderMiddleware': 543,
    'fangtianxia_scrapy.middlewares.RotateUserAgentMiddleware': 543,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
#     'fangtianxia_scrapy.pipelines.FangTianXiaScrapyPipeline': 300,
#     'fangtianxia_scrapy.pipelines.MysqlTwistedPipeline': 300,
# }

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Scrapy-Redis settings
# Store requests in Redis via the scrapy-redis scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Make all spider instances share the same dedup fingerprints
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Also push scraped items into Redis
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300,
    'fangtianxia_scrapy.pipelines.MysqlTwistedPipeline': 300,
}
# Keep the scrapy-redis queues in Redis instead of clearing them on exit,
# which makes it possible to pause and resume a crawl
SCHEDULER_PERSIST = True
# Redis connection info
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
REDIS_PASSWORD = 'password'
A distributed crawl is much faster, and even if one slave dies (for example, because of anti-scraping measures), the rest keep running. I plan to try further optimizations, such as swapping the dupefilter's fingerprint set for a BloomFilter.
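For reference, the point of that optimization is to trade a small false-positive rate for far less memory than a full fingerprint set. A toy, self-contained Bloom filter sketch (in-memory only, not the scrapy-redis integration):

import hashlib

class BloomFilter:
    # Toy in-memory Bloom filter: k salted hashes over an m-bit bitmap (a Python int)
    def __init__(self, m=2 ** 20, k=5):
        self.m, self.k, self.bits = m, k, 0

    def _positions(self, value):
        # Derive k bit positions from salted md5 digests of the value
        for i in range(self.k):
            digest = hashlib.md5(('%d:%s' % (i, value)).encode()).hexdigest()
            yield int(digest, 16) % self.m

    def add(self, value):
        for pos in self._positions(value):
            self.bits |= 1 << pos

    def __contains__(self, value):
        return all(self.bits >> pos & 1 for pos in self._positions(value))

bf = BloomFilter()
bf.add('https://sh.newhouse.fang.com/house/s/')
print('https://sh.newhouse.fang.com/house/s/' in bf)  # True
print('https://bj.esf.fang.com' in bf)                # False (with high probability)

In production the bitmap would live in Redis (e.g. via SETBIT) so that all slaves share it.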