赞
踩
- from urllib.parse import urlencode
- from requests.exceptions import RequestException
- import requests
- import json
- import re
- from bs4 import BeautifulSoup
- from config import *
- import pymongo
# Module-level MongoDB handles used by save_to_mogo.
# MONGO_URL and MONGO_DB are presumably supplied by `from config import *`.
client = pymongo.MongoClient(MONGO_URL)
# A MongoClient instance is not callable; a database is selected by
# subscripting, so the former `client(MONGO_DB)` raised TypeError on import.
db = client[MONGO_DB]

def save_to_mogo(result):
    """Insert one scraped record into the configured MongoDB collection.

    Args:
        result: Document (dict) to store in ``db[MONGO_TABLE]``.

    Returns:
        True when the document was inserted, False otherwise.
    """
    # Collection.insert() was deprecated in PyMongo 3.0 and removed in 4.0;
    # insert_one() is the supported call for a single document.
    outcome = db[MONGO_TABLE].insert_one(result)
    if outcome.inserted_id is not None:
        print("储存成功", result)
        return True
    return False
def get_page_index(offset, keyword, timeout=10):
    """Fetch one page of search results from the Toutiao search endpoint.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Search term sent as the ``keyword`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise None (non-200 status or any requests-level failure).
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 1,
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)

    try:
        # The request itself must be inside the try block: connection
        # errors and timeouts are raised by requests.get(), not by reading
        # the response afterwards.
        r = requests.get(url, timeout=timeout)
    except RequestException:
        print("请求页面错误")
        return None

    if r.status_code != 200:
        return None
    # Let requests guess the charset from the body before decoding it.
    r.encoding = r.apparent_encoding
    return r.text
def parse_page_index(html):
    """Yield the article URLs listed in a search-result JSON payload.

    Args:
        html: JSON text of a search-result page. May be None or empty
            when the request that produced it failed.

    Yields:
        The ``article_url`` value of each entry in the payload's ``data``
        list. Entries without a usable URL are skipped.
    """
    if not html:
        return
    try:
        payload = json.loads(html)
    except ValueError:
        # json.JSONDecodeError subclasses ValueError; a body that is not
        # JSON simply contains no article links.
        return
    if not isinstance(payload, dict):
        return

    entries = payload.get("data")
    if not isinstance(entries, list):
        # Covers both a missing key and an explicit null.
        return

    for item in entries:
        if not isinstance(item, dict):
            continue
        article_url = item.get("article_url")
        # Some entries carry no article_url; yielding None would only make
        # the caller issue a request that is bound to fail.
        if article_url:
            yield article_url

def get_pic_url(url, timeout=10):
    """Download an article page and return its body text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; without
            a timeout a stalled server would block the crawl forever.

    Returns:
        The response text on HTTP 200, otherwise None (non-200 status or
        any requests-level failure, including a timeout).
    """
    try:
        r = requests.get(url, timeout=timeout)
    except RequestException:
        return None
    return r.text if r.status_code == 200 else None
def get_pic(html, url):
    """Extract the page title and gallery image URLs from an article page.

    The page is expected to embed its gallery as a ``gallery = <JSON>``
    assignment whose JSON object has a ``sub_images`` list.

    Args:
        html: Text of the article page.
        url: Address the page was fetched from; copied into the result.

    Returns:
        A dict with the keys ``title``, ``images`` and ``url``, or None
        when the page has no parseable gallery with ``sub_images``.
    """
    marker = re.search(r'gallery = \s*', html)
    if marker is None:
        return None

    try:
        # raw_decode reads exactly one JSON value starting at the marker,
        # so a ';' inside a JSON string no longer truncates the payload
        # the way the non-greedy "(.*?);" capture did.
        data, _ = json.JSONDecoder().raw_decode(html, marker.end())
    except ValueError:
        # json.JSONDecodeError subclasses ValueError.
        return None

    if not isinstance(data, dict) or 'sub_images' not in data:
        return None
    sub_images = data.get('sub_images')
    if not isinstance(sub_images, list):
        return None

    soup = BeautifulSoup(html, 'html.parser')
    title_tag = soup.find('title')
    # A page without a <title> element gets an empty title instead of
    # raising IndexError.
    title = title_tag.get_text() if title_tag is not None else ''

    images = [item.get('url') for item in sub_images if isinstance(item, dict)]
    return {
        'title': title,
        'images': images,
        'url': url,
    }

def main():
    """Crawl the first search-result page and store every gallery found.

    Fetches the index page for the hard-coded keyword, downloads each
    listed article, extracts its gallery and saves it to MongoDB.
    """
    index_html = get_page_index(0, "街拍")
    if not index_html:
        # get_page_index returns None on failure; there is nothing to parse.
        return

    for url in parse_page_index(index_html):
        if not url:
            continue
        page_html = get_pic_url(url)
        if not page_html:
            continue
        # Parse each page once and reuse the record; calling get_pic() a
        # second time just to obtain the value repeated the whole parse.
        result = get_pic(page_html, url)
        if result is not None:
            save_to_mogo(result)

# Start the crawl only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。