赞
踩
# -*- coding: utf-8 -*-
"""Scrape the movie board at https://maoyan.com/board/4 into a JSON-lines file.

Each board page is fetched with a randomly chosen User-Agent, parsed with a
regular expression, and every movie found is appended to ``OUTPUT_PATH`` as
one JSON object per line.
"""
import json
import os
import random
import re
import urllib
import urllib.request
from multiprocessing import Pool

# Third-party imports carried over from the original script. Nothing below
# uses them, so they are guarded: the scraper itself only needs the stdlib.
try:
    import requests
    from requests.exceptions import RequestException
    from bs4 import BeautifulSoup
except ImportError:
    pass

# File that receives one JSON object per movie.
OUTPUT_PATH = r'd:\python\movie.txt'

# User-Agent strings; one is picked at random for every request.
_USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
]

# One <dd>...</dd> entry of the board. Groups, in order: rank, image URL,
# title, star text, release-time text, integer part and fraction part of the
# score. The rank is captured with (\d+) so multi-digit ranks match as well.
_MOVIE_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a.*?>(.*?)</a>.*?star">(.*?)</p>'
    r'.*?releasetime">(.*?)</p>.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
    re.S,
)


def get_context_from_url(url):
    """Return the body of *url* decoded as UTF-8.

    The request carries a User-Agent header chosen at random from
    ``_USER_AGENTS``. Errors raised by ``urllib.request.urlopen`` or by the
    decoding step propagate to the caller.
    """
    request = urllib.request.Request(url)
    request.add_header("User-Agent", random.choice(_USER_AGENTS))
    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')


def parse_html_by_re(content):
    """Yield one dict per movie entry found in the HTML text *content*.

    Each dict has the keys ``index``, ``image``, ``title``, ``actor``,
    ``time`` and ``score``; all values are strings. Nothing is yielded when
    *content* contains no matching entry.
    """
    for rank, image, title, star, release, integer, fraction in _MOVIE_PATTERN.findall(content):
        yield {
            'index': rank,
            'image': image,
            'title': title,
            # Drops the first 3 characters of the stripped text -- presumably
            # the label in front of the actor names; confirm against the page.
            'actor': star.strip()[3:],
            # Drops the first 5 characters -- presumably the release-time
            # label; confirm against the page.
            'time': release.strip()[5:],
            'score': integer + fraction,
        }


def write_html_context_to_file(content):
    """Append *content* to ``OUTPUT_PATH`` as a single JSON line.

    ``ensure_ascii=False`` keeps non-ASCII text readable in the file instead
    of writing ``\\uXXXX`` escapes.
    """
    with open(OUTPUT_PATH, 'a', encoding='utf-8') as fp:
        fp.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(offset):
    """Fetch the board page starting at *offset* and store every movie on it."""
    url = 'https://maoyan.com/board/4?offset=' + str(offset)
    content = get_context_from_url(url)
    for item in parse_html_by_re(content):
        write_html_context_to_file(item)


if __name__ == '__main__':
    # Start from an empty file, because every page appends to it.
    try:
        os.remove(OUTPUT_PATH)
    except FileNotFoundError:
        pass

    # One task per page (offsets 0, 10, ..., 90). The worker processes append
    # to the same file, so pages may end up in the file in any order.
    with Pool() as pool:
        pool.map(main, [i * 10 for i in range(10)])
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。