当前位置:   article > 正文

Python今日头条网爬虫_python爬虫开发之使用python爬虫库requests,urllib与今日头条搜索功能爬取搜索

python爬虫开发之使用python爬虫库requests、urllib与今日头条搜索功能爬取搜索内容
  1. # encoding=utf8
  2. import requests
  3. from requests.exceptions import RequestException
  4. import urllib
  5. import json
  6. import re
  7. from bs4 import BeautifulSoup
  8. from config import *
  9. def get_index(offset,keyword):
  10. data={
  11. 'offset':offset,
  12. 'format':'json',
  13. 'keyword':keyword,
  14. 'autoload':'true',
  15. 'count':20,
  16. 'cur_tab':3
  17. }
  18. url='http://www.toutiao.com/search_content/?'+urllib.urlencode(data)
  19. response=requests.get(url)
  20. try:
  21. if response.status_code == 200:
  22. return response.text
  23. return None
  24. except RequestException:
  25. print u'请求索引页出错'
  26. return None
  27. def parse_page_index(html):
  28. #将json格式的字符串转化成python对象,对象转换成json用 json.dumps()
  29. data=json.loads(html)
  30. if data and 'data' in data.keys():
  31. for item in data.get('data'):
  32. #yield 是url生成器 即取出article_url并生成url
  33. yield item.get('article_url')
  34. def get_page_detail(url):
  35. response = requests.get(url)
  36. try:
  37. if response.status_code == 200:
  38. return response.text
  39. return None
  40. except RequestException:
  41. print '请求详情页出错'
  42. return None
  43. def parse_page_detail(html,url):
  44. soup=BeautifulSoup(html,'lxml')
  45. title = soup.select('title')[0].get_text()
  46. images_pattern= re.compile('var gallery = (.*?);',re.S)
  47. result = re.search(images_pattern,html)
  48. if result:
  49. data =json.loads(result.group(1))
  50. sub_images = data.get('sub_images')
  51. images = [item.get('url') for item in sub_images]
  52. return {
  53. 'title' :title,
  54. 'url':url,
  55. 'images':images
  56. }
  57. def main():
  58. html=get_index(0,'街拍')
  59. for url in parse_page_index(html):
  60. html=get_page_detail(url)
  61. if html:
  62. result=parse_page_detail(html,url)
  63. print result['title']
  64. if name == 'main':
  65. main()

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/我家自动化/article/detail/958733
推荐阅读
相关标签
  

闽ICP备14008679号