赞
踩
# encoding=utf8

import json
import re
import urllib

import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

from config import *
def get_index(offset, keyword):
    """Fetch one page of Toutiao search results as raw JSON text.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Value sent as the ``keyword`` query parameter.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status, or any ``RequestException``
        raised while performing the request).
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
    }
    try:
        # The call itself has to be inside the ``try``: connection errors and
        # timeouts are raised by ``requests.get``, not by reading the status.
        # ``params=`` lets requests build and encode the query string, so the
        # function no longer depends on the Python-2-only ``urllib.urlencode``.
        # A timeout keeps a stalled server from blocking the crawl forever.
        response = requests.get(
            'http://www.toutiao.com/search_content/',
            params=params,
            timeout=10,
        )
    except RequestException:
        print(u'请求索引页出错')
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_page_index(html):
    """Yield the ``article_url`` of every entry in an index-page JSON payload.

    Args:
        html: JSON text of an index page; may be ``None`` or empty, which is
            what ``get_index`` returns when the request fails.

    Yields:
        Each non-empty ``article_url`` found in the top-level ``data`` list.
        Nothing is yielded when the payload is missing, is not valid JSON, is
        not a JSON object, or has no usable ``data`` list.
    """
    if not html:
        # json.loads(None) would raise TypeError; a failed fetch simply
        # produces no URLs.
        return
    try:
        # json.loads turns the JSON text into Python objects.
        payload = json.loads(html)
    except ValueError:
        # Message means "error parsing index page"; it follows the wording of
        # the other error messages in this file.
        print(u'解析索引页出错')
        return
    if not isinstance(payload, dict):
        return
    # ``or []`` covers both a missing key and an explicit ``"data": null``.
    for item in payload.get('data') or []:
        article_url = item.get('article_url') if isinstance(item, dict) else None
        # Entries without a URL are skipped so callers never receive None.
        if article_url:
            yield article_url
def get_page_detail(url):
    """Download an article page and return its markup.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status, or any ``RequestException``
        raised while performing the request).
    """
    try:
        # The call itself has to be inside the ``try``; otherwise connection
        # errors, timeouts and malformed URLs escape the handler below.
        # A timeout keeps a stalled server from blocking the crawl forever.
        response = requests.get(url, timeout=10)
    except RequestException:
        print('请求详情页出错')
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_page_detail(html, url):
    """Extract the title and gallery image URLs from an article page.

    Args:
        html: Page markup as text.
        url: Address the page was fetched from; echoed back in the result.

    Returns:
        A dict with keys ``title``, ``url`` and ``images`` when the page
        contains a ``var gallery = ...;`` assignment whose value parses as a
        JSON object; otherwise ``None``. ``title`` is an empty string when the
        page has no ``<title>`` element, and ``images`` is an empty list when
        the gallery has no ``sub_images``.
    """
    images_pattern = re.compile('var gallery = (.*?);', re.S)
    gallery_match = re.search(images_pattern, html)
    if not gallery_match:
        return None
    try:
        gallery = json.loads(gallery_match.group(1))
    except ValueError:
        # The non-greedy pattern stops at the first ';', so the captured text
        # is not guaranteed to be complete JSON. Treat that like "no gallery".
        return None
    if not isinstance(gallery, dict):
        return None

    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    # select() returns an empty list when the page has no <title>.
    title = title_tags[0].get_text() if title_tags else ''

    # ``or []`` covers both a missing key and an explicit null value.
    sub_images = gallery.get('sub_images') or []
    images = [item.get('url') for item in sub_images if isinstance(item, dict)]
    return {
        'title': title,
        'url': url,
        'images': images,
    }
def main():
    """Fetch the first index page for the hard-coded keyword and print titles.

    For every article URL found on the index page, the article is downloaded
    and parsed, and the title of each page that yields a result is printed.
    """
    index_html = get_index(0, '街拍')
    if not index_html:
        # get_index returns None on failure; there is nothing to parse.
        return
    for url in parse_page_index(index_html):
        if not url:
            # Index entries may lack an article URL.
            continue
        detail_html = get_page_detail(url)
        if not detail_html:
            continue
        result = parse_page_detail(detail_html, url)
        # parse_page_detail returns None for pages without a gallery, so the
        # result must be checked before it is indexed.
        if result:
            print(result['title'])
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。