def foo(x):
    return x * x

# Multiprocessing
from multiprocessing import Pool
p = Pool(processes=5)
pp = p.map(foo, range(10))
print(pp)

# Multithreading (thread-backed Pool with the same API)
from multiprocessing.dummy import Pool as ThreadPool
p1 = ThreadPool(processes=5)
ppp = p1.map(foo, range(10))
print(ppp)
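Both snippets use the same Pool API: multiprocessing.Pool runs foo in separate worker processes, while multiprocessing.dummy.Pool is a thread-backed drop-in replacement. On Windows a process Pool should only be created under an if __name__ == '__main__' guard, as in the tqdm example below.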
**Multithreading / multiprocessing with a tqdm progress bar**
# Multiprocessing with a progress bar
from multiprocessing import Pool
from tqdm import tqdm

def f(x):
    return x * x

if __name__ == '__main__':
    with Pool(5) as p:
        print(list(tqdm(p.imap(f, range(10)), total=10, desc='progress')))

# Multithreading with a progress bar
import time
from concurrent.futures import ThreadPoolExecutor   # unused here; see the sketch below
from multiprocessing.dummy import Pool as ThreadPool
from tqdm import tqdm

def func(d):
    time.sleep(1)

p11 = ThreadPool(processes=40)
r = list(tqdm(p11.imap(func, range(1, 1000)), total=999))
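The snippet above imports ThreadPoolExecutor but never uses it. For reference, a minimal sketch of how it could be combined with tqdm (my own example, not part of the original; func and the range are just placeholders):

import time
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

def func(d):
    time.sleep(1)          # stand-in for real work such as an HTTP request
    return d

with ThreadPoolExecutor(max_workers=40) as ex:
    # Executor.map yields results lazily, so tqdm can track completion
    results = list(tqdm(ex.map(func, range(1, 1000)), total=999))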
Multithreading and coroutines both speed up scraping considerably. With multithreading, the concurrency can lead to problems such as duplicated work and write conflicts; coroutines switch automatically, which handles this part a bit better.
The two modules involved are threading for multithreading and gevent for coroutines.
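On the write-conflict problem mentioned above: when several threads append to a shared list or write to the same CSV file, wrapping the write in a threading.Lock avoids interleaved output. A minimal sketch (my own illustration; save_row and results.csv are hypothetical names, not part of the scripts below):

import csv
import threading

write_lock = threading.Lock()   # shared by all worker threads

def save_row(row, path="results.csv"):
    # Only one thread at a time may append to the CSV file
    with write_lock:
        with open(path, "a", encoding="utf-8", newline="") as f:
            csv.writer(f).writerow(row)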
1. Multithreading version (threading)
***The main issue in the multithreaded version is how to distribute the work among the threads. There is a small trick here, mainly in the last few lines: the items are assigned with a modulo operation (a simplified sketch of the idea follows the full script below).
import re
from lxml import etree
import requests
import json
import pandas as pd
import time
import csv
import threading

s = requests.Session()
cookie = "++++++++++++++++++++++++++++++"   # cookie string omitted
headers2 = {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Connection": "keep-alive",
    "Content-Type": "application/x-www-form-urlencoded",
    "Cookie": cookie,
    "Host": "weibo.com",
    "Referer": "https://weibo.com/u/1549364094?profile_ftype=1&is_all=1",
    "User-Agent": "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "X-Requested-With": "XMLHttpRequest",
}
headers1 = {
    "Connection": "keep-alive",
    "Cookie": cookie,
    "Host": "weibo.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36",
    "Upgrade-Insecure-Requests": "1",
}


def request1(url):
    # JSON endpoint: the 'data' field contains an HTML fragment
    html = s.get(url, headers=headers2)
    # print(html.text)
    json1 = json.loads(html.text)['data']
    return etree.HTML(json1)


def request2(url):
    # Plain page request, returns the raw HTML text
    html = s.get(url, headers=headers1)
    # print(html.text)
    return html.text


lists_all = []
# url_id = "210926262"
# engine='python' avoids "OSError: Initializing from file failed"
content_all = pd.read_csv(r"C:\Users\Lavector\Desktop\百事小红书\lists2.csv", engine='python', header=None).values.tolist()
print(content_all)


def download(content):
    try:
        url_id = content[1]
        name = content[0]
        home_url = "https://weibo.com/{}?profile_ftype=1&is_all=1#_0".format(url_id)
        print(home_url)
        time.sleep(10)
        a = request2(home_url)
        # print(a)
        content_id = re.findall("page_id']='(.*?)';", a)[0]
        domain_id = re.findall("domain']='(.*?)';", a)[0]
        MyProfileFeed_id = re.findall(r"Pl_Official_MyProfileFeed__(\d+)", a)[0]
        print(content_id)
        print(domain_id)
        print(MyProfileFeed_id)
        # Profile summary
        username = re.findall('<title>(.*?)的微博', a)[0]  # re.findall('<h1.*?>(.*?)<', a)
        # username = aa.xpath('//h1/h')
        print(username)
        info = re.findall(',(.*?)的微博主页.*?description', a)[0]
        print(info)
        person_url = "https://weibo.com/p/{}/info?mod=pedit_more".format(content_id)
        print(person_url)
        time.sleep(10)
        try:
            if request2(person_url):
                b = request2(person_url)
                info_html = re.findall('domid":"Pl_Official_PersonalInfo__.*?"html":"(.*?)"}', b)[0].strip().replace("\\r", "").replace("\\n", "").replace("\\", "")
                print(info_html)
                info_html = etree.HTML(info_html)
                information = {}
                for i in range(len(info_html.xpath('//span[contains(@class,"pt_title")]'))):
                    bb = info_html.xpath('//span[contains(@class,"pt_title")]/text()')[i].strip()
                    try:
                        if bb == "博客:":        # "Blog:"
                            cc = info_html.xpath('//li[contains(@class,"li_1")][{}]/a/text()'.format(i + 1))[0].strip()
                        elif bb == "个性域名:":  # "Custom domain:"
                            cc = info_html.xpath('//li[contains(@class,"li_1")][{}]/span[2]/a/text()'.format(i + 1))[0].strip()
                        elif bb == "标签:":      # "Tags:"
                            print("++++++++++++++++111")
                            cc = info_html.xpath('//a[@node-type="tag"]/text()')
                            print(cc)
                            # cc = dd.xpath('string(.)').strip()
                        else:
                            cc = info_html.xpath('//li[contains(@class,"li_1")][{}]/span[2]/text()'.format(i + 1))[0].strip()
                    except:
                        pass
                    information["{}".format(bb)] = cc
                print(information)
                lists_all.append([name, username, info, information])
                with open("lists24.csv", "w", encoding="utf-8", newline="") as f:
                    k = csv.writer(f, dialect="excel")
                    k.writerow(["名字", "昵称", "info", "简介"])
                    for list1 in lists_all:
                        k.writerow(list1)
        except:
            pass
    except:
        lists_all_set = list(set(lists_all))
        with open("lists25.csv", "w", encoding="utf-8", newline="") as f:
            k = csv.writer(f, dialect="excel")
            k.writerow(["名字", "昵称", "info", "简介"])
            for list1 in lists_all_set:
                k.writerow(list1)


# *** Multithreading
if __name__ == "__main__":
    length = len(content_all)
    xclist = [[], [], [], [], [], [], [], [], [], []]
    # Allocate the requests to the threads: item i goes to bucket i % N
    N = len(xclist)
    for i in range(length):
        xclist[i % N].append(content_all[i])
    for i in range(10):   # number of threads (as written, one thread is started per item)
        for m in range(len(xclist[i])):
            t = threading.Thread(target=download, args=(xclist[i][m],))
            t.start()
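Those last lines are the modulo allocation mentioned above. Note that, as written, the nested loops start one thread per item rather than one thread per bucket. A simplified sketch of the bucket-per-thread idea, with a join at the end (my own variant; worker is a hypothetical helper, while download and content_all are the names from the script above):

import threading

def worker(bucket):
    # Each thread processes its own bucket of items sequentially
    for item in bucket:
        download(item)              # the download() function defined above

N = 10                              # number of threads
buckets = [[] for _ in range(N)]
for i, item in enumerate(content_all):
    buckets[i % N].append(item)     # modulo allocation: item i goes to thread i % N

threads = [threading.Thread(target=worker, args=(b,)) for b in buckets]
for t in threads:
    t.start()
for t in threads:
    t.join()                        # wait for all threads to finish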
2. Coroutine version (gevent)
***When using coroutines, put these three import lines at the very top of the file, before everything else, so the monkey-patching does not cause errors:
import gevent
import gevent.monkey
gevent.monkey.patch_all()
With coroutines, the main point is how the links (jobs) are allocated.
import gevent
import gevent.monkey
gevent.monkey.patch_all()

import re
from lxml import etree
import requests
import json
import pandas as pd
import time
import csv
import threading

s = requests.Session()
cookie = "++++++++"   # cookie string omitted
headers2 = {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Connection": "keep-alive",
    "Content-Type": "application/x-www-form-urlencoded",
    "Cookie": cookie,
    "Host": "weibo.com",
    "Referer": "https://weibo.com/u/1549364094?profile_ftype=1&is_all=1",
    "User-Agent": "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "X-Requested-With": "XMLHttpRequest",
}
headers1 = {
    "Connection": "keep-alive",
    "Cookie": cookie,
    "Host": "weibo.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36",
    "Upgrade-Insecure-Requests": "1",
}


def request1(url):
    # JSON endpoint: the 'data' field contains an HTML fragment
    html = s.get(url, headers=headers2)
    # print(html.text)
    json1 = json.loads(html.text)['data']
    return etree.HTML(json1)


def request2(url):
    # Plain page request, returns the raw HTML text
    html = s.get(url, headers=headers1)
    # print(html.text)
    return html.text


lists_all = []
# url_id = "210926262"
# engine='python' avoids "OSError: Initializing from file failed"
content_all = pd.read_csv(r"C:\Users\Lavector\Desktop\百事小红书\lists2.csv", engine='python', header=None).values.tolist()
print(content_all)


def download(content):
    try:
        url_id = content[1]
        name = content[0]
        home_url = "https://weibo.com/{}?profile_ftype=1&is_all=1#_0".format(url_id)
        print(home_url)
        time.sleep(10)
        a = request2(home_url)
        # print(a)
        content_id = re.findall("page_id']='(.*?)';", a)[0]
        domain_id = re.findall("domain']='(.*?)';", a)[0]
        MyProfileFeed_id = re.findall(r"Pl_Official_MyProfileFeed__(\d+)", a)[0]
        print(content_id)
        print(domain_id)
        print(MyProfileFeed_id)
        # Profile summary
        username = re.findall('<title>(.*?)的微博', a)[0]  # re.findall('<h1.*?>(.*?)<', a)
        # username = aa.xpath('//h1/h')
        print(username)
        info = re.findall(',(.*?)的微博主页.*?description', a)[0]
        print(info)
        person_url = "https://weibo.com/p/{}/info?mod=pedit_more".format(content_id)
        print(person_url)
        time.sleep(10)
        try:
            if request2(person_url):
                b = request2(person_url)
                info_html = re.findall('domid":"Pl_Official_PersonalInfo__.*?"html":"(.*?)"}', b)[0].strip().replace("\\r", "").replace("\\n", "").replace("\\", "")
                print(info_html)
                info_html = etree.HTML(info_html)
                information = {}
                for i in range(len(info_html.xpath('//span[contains(@class,"pt_title")]'))):
                    bb = info_html.xpath('//span[contains(@class,"pt_title")]/text()')[i].strip()
                    try:
                        if bb == "博客:":        # "Blog:"
                            cc = info_html.xpath('//li[contains(@class,"li_1")][{}]/a/text()'.format(i + 1))[0].strip()
                        elif bb == "个性域名:":  # "Custom domain:"
                            cc = info_html.xpath('//li[contains(@class,"li_1")][{}]/span[2]/a/text()'.format(i + 1))[0].strip()
                        elif bb == "标签:":      # "Tags:"
                            print("++++++++++++++++111")
                            cc = info_html.xpath('//a[@node-type="tag"]/text()')
                            print(cc)
                            # cc = dd.xpath('string(.)').strip()
                        else:
                            cc = info_html.xpath('//li[contains(@class,"li_1")][{}]/span[2]/text()'.format(i + 1))[0].strip()
                    except:
                        pass
                    information["{}".format(bb)] = cc
                print(information)
                lists_all.append([name, username, info, information])
                with open("lists24.csv", "w", encoding="utf-8", newline="") as f:
                    k = csv.writer(f, dialect="excel")
                    k.writerow(["名字", "昵称", "info", "简介"])
                    for list1 in lists_all:
                        k.writerow(list1)
        except:
            pass
    except:
        lists_all_set = list(set(lists_all))
        with open("lists25.csv", "w", encoding="utf-8", newline="") as f:
            k = csv.writer(f, dialect="excel")
            k.writerow(["名字", "昵称", "info", "简介"])
            for list1 in lists_all_set:
                k.writerow(list1)


# Coroutine version
if __name__ == "__main__":
    length = len(content_all)
    xclist = []   # build the list of coroutine jobs
    for i in range(length):
        xclist.append(gevent.spawn(download, content_all[i]))
    print(xclist)
    gevent.joinall(xclist)