
How to implement a highly flexible crawler in Python

This crawler has the following features:

It can crawl a site with hundreds of millions of pages, run as multiple parallel processes, and be paused and resumed.

The crawl paths and the crawl delay are configurable.

It automatically discovers new links and adds them to the queue.

The code follows.

The crawler configuration file, cnblogs_com.py, defines the start pages, the URL patterns to follow, the URL patterns to save, the request headers, the crawl delay, and the request timeout.

# coding=utf-8

start_urls = [
    'http://www.cnblogs.com/',
    'http://news.cnblogs.com/',
    'http://q.cnblogs.com/',
    'http://home.cnblogs.com/blog/all/',
]

find_urls = [
    r'^http://news\.cnblogs\.com/n/\d+/$',
    r'^http://q\.cnblogs\.com/q/\d+/$',
    r'^http://www\.cnblogs\.com/[a-zA-Z0-9\-_]+/p/\d+\.html$',
    r'^http://www\.cnblogs\.com/[a-zA-Z0-9\-_]+/archive/\d+/\d+/\d+/\d+\.html$',
    r'^http://www\.cnblogs\.com/[a-zA-Z0-9\-_]+/$',
    r'^http://www\.cnblogs\.com/[a-zA-Z0-9\-_]+/default\.html\?page=\d+$',
    r'^http://q\.cnblogs\.com/tag/',
]

save_urls = [
    r'^http://news\.cnblogs\.com/n/\d+/$',
    r'^http://q\.cnblogs\.com/q/\d+/$',
    r'^http://www\.cnblogs\.com/[a-zA-Z0-9\-_]+/p/\d+\.html$',
    r'^http://www\.cnblogs\.com/[a-zA-Z0-9\-_]+/archive/\d+/\d+/\d+/\d+\.html$',
]

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36",
    "Referer": "http://www.hisearch.cn/",
}

delay = 2

timeout = 5
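
To make the two pattern lists concrete: find_urls decides whether a discovered link is queued for crawling at all, while save_urls decides whether the downloaded HTML is persisted. The short sketch below is not part of the original project; classify_url is purely an illustrative helper built on the configuration file above.

# Illustrative only: how the find_urls / save_urls patterns are applied to a link.
import re

import conf.sites.cnblogs_com as config  # the configuration file shown above

def classify_url(url):
    """Return (should_crawl, should_save) for a candidate URL."""
    should_crawl = any(re.match(p, url) for p in config.find_urls)
    should_save = any(re.match(p, url) for p in config.save_urls)
    return should_crawl, should_save

# A blog post is both followed and saved; a tag listing page is only followed.
print(classify_url('http://www.cnblogs.com/someone/p/1234567.html'))  # (True, True)
print(classify_url('http://q.cnblogs.com/tag/python/'))               # (True, False)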

The command line entry point: run python spider.py -s cnblogs_com start to begin crawling a site. Several such processes can run in parallel, and if a process dies part way through, the crawl simply continues the next time it is started.

#!/usr/bin/python
# coding=utf-8
import argparse

from lib.Spider import Spider

allow_commands = ['start', 'clean', 'restart']

if __name__ == '__main__':
    # parse the command line arguments
    parser = argparse.ArgumentParser(description='General crawler')
    parser.add_argument('-s', '--site', help='site config file name', required=True)
    parser.add_argument('command', help='|'.join(allow_commands), type=str)
    args = parser.parse_args()
    command = args.command

    # dispatch the requested command
    s = Spider(args.site)
    if command == 'start':
        s.start()
    elif command == 'clean':
        s.clean()
    elif command == 'restart':
        s.restart()
    else:
        print('%s is not a valid command, allowed: %s' % (command, '|'.join(allow_commands)))
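
Note that the Spider class imported here reads a few global values from a conf/settings.py module (REDIS_URI, DATA_DIR, LOG_DIR) that the article does not include. The following is only an assumed example of what such a module could contain; the directory paths are placeholders and are assumed to exist before the crawler starts.

# conf/settings.py -- assumed example values, not from the original article
REDIS_URI = 'redis://127.0.0.1:6379/0'  # connection string passed to redis.from_url()
DATA_DIR = './data/'                    # one LevelDB database per site is created in here
LOG_DIR = './log/'                      # one rotating log file per site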

The crawler class, Spider.py. It uses LevelDB to store the crawled pages (LevelDB compresses the data it stores) and Redis as the queue; Redis's HyperLogLog data structure deduplicates URLs with very little memory (see the short sketch after the class).

# coding=utf-8
"""
Crawler class
"""
import time
import sys
import traceback
import logging
from logging.handlers import TimedRotatingFileHandler
import re
import redis
import uuid
import requests
from conf import settings
import leveldb
from lxml.html import fromstring


class Spider(object):
    site = None
    config = None
    que = None
    log = None
    db = None
    request = None

    def __init__(self, site):
        self.site = site
        self.load_config()
        self.que = redis.from_url(settings.REDIS_URI)
        self.log = self.get_logger()
        self.db = leveldb.LevelDB(settings.DATA_DIR + self.site, max_open_files=30)
        self.request = requests.session()

    def start(self):
        # Seed the queue with the start URLs only if no other process has
        # already started this site; otherwise just join the existing crawl.
        if not self.is_started():
            self.que.sadd('running_sites', self.site)
            for url in self.config.start_urls:
                self.que.pfadd(self.site + '_all', url)
                self.que.lpush(self.site + '_in', url)
        self.run()

    def run(self):
        while not self.que_is_empty():
            url = self.que.rpop(self.site + '_in').decode()
            html = self.get_page(url)
            if html is not None:
                data = self.get_data(html, url)
                if data:
                    self.store_data(url, data)
                self.find_more_links(html, url)
            time.sleep(self.config.delay)
        self.finish()

    def que_is_empty(self):
        return self.que.llen(self.site + '_in') == 0

    def load_config(self):
        self.config = __import__('conf.sites.' + self.site, fromlist=['conf.sites.' + self.site])

    def is_started(self):
        if self.que.sismember('running_sites', self.site):
            self.log.info("%s is already running." % self.site)
            return True
        else:
            self.log.info("%s is not running yet." % self.site)
            return False

    def get_page(self, url):
        html = None
        try:
            r = self.request.get(url, headers=self.config.headers, timeout=self.config.timeout)
            if r.ok:
                html = r.text
            r.close()
            self.log.debug("page_download: " + url)
        except:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            self.log.exception("download_error: " + url + ", " + str(exc_value),
                               exc_info=traceback.format_tb(exc_traceback))
        return html

    def get_data(self, html, url):
        # Only pages whose URL matches one of the save_urls patterns are stored.
        for regxp in self.config.save_urls:
            if re.compile(regxp).match(url):
                return html
        return False

    def store_data(self, url, data):
        self.db.Put(url.encode(), data.encode())
        self.log.debug("page_saved: %s" % url)

    def find_more_links(self, html, url):
        try:
            page = fromstring(html, url)
            page.make_links_absolute(url)
            for element, attribute, link, pos in page.iterlinks():
                for regxp in self.config.find_urls:
                    if re.compile(regxp).match(link):
                        self.add_url(link)
                        break
        except:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            self.log.exception("find_more_links_error: " + url + ", " + str(exc_value),
                               exc_info=traceback.format_tb(exc_traceback))

    def add_url(self, url):
        # pfadd returns 1 when the URL is new to the HyperLogLog of seen URLs.
        if self.que.pfadd(self.site + '_all', url) == 1:
            key = url.encode()
            # Skip pages that were already stored in LevelDB by a previous run.
            if key not in self.db.RangeIter(include_value=False, key_from=key, key_to=key):
                self.que.lpush(self.site + '_in', url)
                self.log.debug("page_found: " + url)
            else:
                self.log.debug("page_exist: " + url)

    def finish(self):
        self.que.srem('running_sites', self.site)
        self.que.delete(self.site + '_all')
        self.log.info('finished')

    def clean(self):
        self.que.srem('running_sites', self.site)
        self.que.delete(self.site + '_all')
        self.que.delete(self.site + '_in')
        self.log.info('cleaned')

    def restart(self):
        self.clean()
        self.start()

    def get_logger(self):
        logger = logging.getLogger('spider.' + self.site)
        hd = TimedRotatingFileHandler(settings.LOG_DIR + self.site + '.log', when='D', backupCount=30)
        formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
        hd.setFormatter(formatter)
        logger.addHandler(hd)
        logger.setLevel(logging.DEBUG)
        return logger

    def get_doc_id_by_url(self, url):
        return str(uuid.uuid5(uuid.NAMESPACE_URL, url))
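
The URL deduplication described above hinges on the return value of Redis's PFADD command: it returns 1 when adding the element changed the HyperLogLog's internal registers (the URL is new as far as the structure can tell) and 0 otherwise, so the set of seen URLs costs at most about 12 KB per site no matter how many URLs are added, at the price of a tiny fraction of new URLs being misjudged as already seen. A quick standalone illustration, assuming a local Redis at the default port:

# Illustrative only: the PFADD-based deduplication used by add_url() above.
import redis

r = redis.from_url('redis://127.0.0.1:6379/0')  # assumed local Redis instance
r.delete('demo_all')                            # demo key, reset for a clean run

print(r.pfadd('demo_all', 'http://www.cnblogs.com/'))  # 1 -> first time seen, would be queued
print(r.pfadd('demo_all', 'http://www.cnblogs.com/'))  # 0 -> already counted, would be skipped
print(r.pfcount('demo_all'))                           # approximate number of distinct URLs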
