当前位置:   article > 正文

python_爬虫今日头条

python_爬虫今日头条
  1. from urllib.parse import urlencode
  2. from requests.exceptions import RequestException
  3. import requests
  4. import json
  5. import re
  6. from bs4 import BeautifulSoup
  7. from config import *
  8. import pymongo
  # Module-level MongoDB handles shared by the functions below.
  # MONGO_URL / MONGO_DB are presumably supplied by `from config import *` -- confirm.
  client = pymongo.MongoClient(MONGO_URL)
  # A database is selected by subscription; calling the client object
  # (`client(MONGO_DB)`) raises TypeError because MongoClient is not callable.
  db = client[MONGO_DB]
  def save_to_mogo(result):
      """Insert one scraped record into the ``MONGO_TABLE`` collection.

      Args:
          result: The document (a dict) to store.

      Returns:
          True when the server acknowledged the insert, otherwise False.
      """
      # Collection.insert() was deprecated in PyMongo 3 and removed in
      # PyMongo 4; insert_one() is its replacement for a single document.
      if db[MONGO_TABLE].insert_one(result).acknowledged:
          print("储存成功", result)
          return True
      return False
  def get_page_index(offset, keyword):
      """Fetch one page of search results and return the raw response text.

      Args:
          offset: Value sent as the ``offset`` query parameter.
          keyword: Search keyword sent as the ``keyword`` query parameter.

      Returns:
          The response body as text when the server answers with HTTP 200,
          otherwise None (non-200 status or any request failure).
      """
      params = {
          'offset': offset,
          'format': 'json',
          'keyword': keyword,
          'autoload': 'true',
          'count': '20',
          'cur_tab': 1,
      }
      url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
      try:
          # The request must be issued inside the try block; otherwise
          # connection errors are raised before the handler is in effect.
          # The timeout keeps a stalled server from blocking forever.
          r = requests.get(url, timeout=10)
      except RequestException:
          print("请求页面错误")
          return None
      if r.status_code != 200:
          return None
      # Decode with the encoding detected from the body, not the header.
      r.encoding = r.apparent_encoding
      return r.text
  def parse_page_index(html):
      """Yield the article URLs contained in a search-index JSON response.

      Args:
          html: JSON text expected to hold an object with a ``"data"`` list
              whose items may carry an ``"article_url"`` key. May be None
              or empty.

      Yields:
          Each non-empty ``article_url`` value, in document order. Nothing
          is yielded when the input is missing, is not valid JSON, or has
          no usable ``"data"`` list.
      """
      if not html:
          return
      try:
          payload = json.loads(html)
      except ValueError:  # json.JSONDecodeError is a ValueError subclass
          return
      if not isinstance(payload, dict):
          return
      # `"data": null` is tolerated by falling back to an empty list.
      for item in payload.get("data") or []:
          if not isinstance(item, dict):
              continue
          article_url = item.get("article_url")
          # Skip entries without a URL instead of yielding None to callers.
          if article_url:
              yield article_url
  def get_pic_url(url):
      """Download an article page and return its body text.

      Args:
          url: Address of the page to fetch.

      Returns:
          The response text on HTTP 200, otherwise None (non-200 status or
          any request failure, including an invalid URL).
      """
      try:
          # Without a timeout a stalled server would hang the whole crawl.
          r = requests.get(url, timeout=10)
      except RequestException:
          return None
      return r.text if r.status_code == 200 else None
  def get_pic(html, url):
      """Extract the page title and gallery image URLs from an article page.

      Args:
          html: HTML text of the article page.
          url: Address the page was fetched from; copied into the result.

      Returns:
          A dict with keys ``'title'``, ``'images'`` and ``'url'`` when the
          page embeds a ``gallery = {...};`` JSON object that has a
          ``'sub_images'`` entry, otherwise None.
      """
      soup = BeautifulSoup(html, 'html.parser')
      title_tags = soup.select('title')
      # Pages without a <title> get an empty title instead of an IndexError.
      title = title_tags[0].get_text() if title_tags else ''

      match = re.search(re.compile('gallery = (.*?);', re.S), html)
      if not match:
          return None
      try:
          # The non-greedy pattern stops at the first ';', so the captured
          # text can be truncated JSON; treat that as "no gallery found".
          gallery = json.loads(match.group(1))
      except ValueError:
          return None
      if not isinstance(gallery, dict) or 'sub_images' not in gallery:
          return None

      # A null 'sub_images' value yields an empty image list.
      sub_images = gallery.get('sub_images') or []
      images = [item.get('url') for item in sub_images if isinstance(item, dict)]
      return {
          'title': title,
          'images': images,
          'url': url,
      }
  def main():
      """Crawl the first search page for "街拍" and store every gallery found.

      Fetches the index page, walks the article URLs it lists, downloads
      each article and saves the parsed gallery record to MongoDB.
      """
      index_html = get_page_index(0, "街拍")
      if not index_html:
          # The index request failed; there is nothing to parse.
          return
      for url in parse_page_index(index_html):
          page_html = get_pic_url(url)
          if not page_html:
              continue
          # Parse each page once and reuse the result.
          result = get_pic(page_html, url)
          if result is not None:
              save_to_mogo(result)
  # Run the crawler only when this file is executed as a script.
  if __name__ == '__main__':
      main()

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/正经夜光杯/article/detail/958730
推荐阅读
相关标签
  

闽ICP备14008679号