一、Scraping popular content from Zhihu
# -*- coding: utf-8 -*-
import urllib2
import re
from BeautifulSoup import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding('utf8')

f = open('howtoTucao2.txt', 'w')  # open the output file

for pagenum in range(1, 21):

    strpagenum = str(pagenum)
    print "Getting data for Page " + strpagenum  # show progress in the shell
    url = "http://www.zhihu.com/collection/27109279?page=" + strpagenum
    page = urllib2.urlopen(url)  # fetch the web page
    soup = BeautifulSoup(page)  # parse the page with BeautifulSoup

    # question titles (zm-item-title) and answer bodies (content hidden)
    ALL = soup.findAll(attrs={'class': ['zm-item-title', 'content hidden']})

    for each in ALL:
        if each.name == 'h2':  # question title
            nowstring = re.sub('<s.+>\n<a.+>\n<.+>\n', '', each.a.string)
            nowstring = re.sub('<br>', '\n', nowstring)
            nowstring = re.sub('<\w+>', '', nowstring)
            nowstring = re.sub('</\w+>', '', nowstring)
            nowstring = re.sub('<.+>', '\n图片\n', nowstring)  # remaining tags (usually <img>) become an "图片" (image) placeholder
            nowstring = re.sub('&quot;', '"', nowstring)  # decode the HTML entity for double quotes
            print nowstring
            if nowstring:
                f.write(nowstring)
            else:
                f.write("\n No Answer \n")
        else:  # answer body
            nowstring = re.sub('<s.+>\n<a.+>\n<.+>\n', '', each.string)
            nowstring = re.sub('<br>', '\n', nowstring)
            nowstring = re.sub('<\w+>', '', nowstring)
            nowstring = re.sub('</\w+>', '', nowstring)
            nowstring = re.sub('<.+>', '\n图片\n', nowstring)  # remaining tags (usually <img>) become an "图片" (image) placeholder
            nowstring = re.sub('&quot;', '"', nowstring)  # decode the HTML entity for double quotes
            print nowstring
            if nowstring:
                f.write(nowstring)
            else:
                f.write("\n No Answer \n")

f.close()  # close the output file
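The script above targets Python 2 (urllib2, the old BeautifulSoup 3 API, print statements). For Python 3, a minimal sketch of the same approach using requests and bs4 might look like the code below; it assumes the collection page and its CSS classes (zm-item-title, content hidden) are unchanged, which may no longer hold since Zhihu now typically requires login and stricter request headers.

# Python 3 sketch of the same scraper; page layout and CSS classes are assumed unchanged.
import requests
from bs4 import BeautifulSoup

with open('howtoTucao2.txt', 'w', encoding='utf8') as f:
    for pagenum in range(1, 21):
        print("Getting data for Page", pagenum)  # show progress in the shell
        url = "http://www.zhihu.com/collection/27109279?page=%d" % pagenum
        resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
        soup = BeautifulSoup(resp.text, "html.parser")
        # select question titles and answer bodies by the same CSS classes as above
        for node in soup.select(".zm-item-title, .content.hidden"):
            text = node.get_text(separator="\n", strip=True)
            f.write(text + "\n" if text else "\n No Answer \n")

Using get_text() sidesteps the chain of regex substitutions in the original, since bs4 already strips tags for us.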

二、Scraping Jianshu content (based on the Scrapy framework)
(1)item.py
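A minimal sketch of what the item definition for Jianshu articles could look like; the class and field names (JianshuItem, title, author, content, url) are assumptions for illustration, not a confirmed copy of the project's file.

# items.py — sketch only; field names are assumed for illustration.
import scrapy

class JianshuItem(scrapy.Item):
    title = scrapy.Field()    # article title
    author = scrapy.Field()   # author nickname
    content = scrapy.Field()  # article body text
    url = scrapy.Field()      # article URL

In a Scrapy project, these fields are filled in the spider's parse callback and then handed to the item pipeline for cleaning or storage.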