
How to crawl with multithreading and coroutines: threading, gevent, multiprocessing, and multiprocessing.dummy


Recommended tools: multiprocessing for processes, multiprocessing.dummy for threads, and gevent for coroutines.

multiprocessing (processes) and multiprocessing.dummy (threads)
def foo(x):
    return x * x

# Multi-process pool
from multiprocessing import Pool

# Multi-thread pool (same interface, backed by threads)
from multiprocessing.dummy import Pool as ThreadPool

if __name__ == '__main__':       # the guard is required for multiprocessing on Windows
    p = Pool(processes=5)
    pp = p.map(foo, range(10))   # map blocks until every result is ready
    print(pp)

    p1 = ThreadPool(processes=5)
    ppp = p1.map(foo, range(10))
    print(ppp)
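Both pools share the same API, so besides map you can also submit tasks one at a time with apply_async and collect the results later. A minimal sketch (foo as above):

from multiprocessing.dummy import Pool as ThreadPool

def foo(x):
    return x * x

if __name__ == '__main__':
    p = ThreadPool(processes=5)
    results = [p.apply_async(foo, (i,)) for i in range(10)]  # submit tasks one by one
    p.close()
    p.join()                                                 # wait for all workers to finish
    print([r.get() for r in results])                        # fetch results in submission order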

Multithreading and multiprocessing with a tqdm progress bar

# Multi-process version
from multiprocessing import Pool
from tqdm import tqdm


def f(x):
    return x * x


if __name__ == '__main__':
    with Pool(5) as p:
        # imap yields results lazily, so tqdm can update the bar as each task finishes
        print(list(tqdm(p.imap(f, range(10)), total=10, desc='progress')))

# Multi-thread version
import time
from multiprocessing.dummy import Pool as ThreadPool

from tqdm import tqdm


def func(d):
    time.sleep(1)


p11 = ThreadPool(processes=40)
r = list(tqdm(p11.imap(func, range(1000)), total=1000))


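The same progress-bar pattern also works with the standard-library ThreadPoolExecutor, if you prefer concurrent.futures over multiprocessing.dummy. A minimal sketch with the same func:

import time
from concurrent.futures import ThreadPoolExecutor

from tqdm import tqdm


def func(d):
    time.sleep(1)


with ThreadPoolExecutor(max_workers=40) as ex:
    # executor.map is also lazy, so tqdm advances as each task finishes
    r = list(tqdm(ex.map(func, range(1000)), total=1000))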

Multithreading and coroutines both speed up crawling considerably. Because threads run concurrently, they can produce duplicate work and write conflicts; coroutines switch automatically, which handles that part a bit better.
The two modules involved are threading for threads and gevent for coroutines.
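A common fix for the write-conflict problem with threads is to guard the shared output with a threading.Lock. A minimal sketch, where save_row is a hypothetical helper (not part of the scripts below):

import csv
import threading

write_lock = threading.Lock()   # one lock shared by every worker thread

def save_row(row, path="results.csv"):
    # Only one thread at a time may append to the file, so rows never interleave
    with write_lock:
        with open(path, "a", encoding="utf-8", newline="") as f:
            csv.writer(f).writerow(row)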

1. Multithreading (threading) version
*** The main issue with threads is handing each thread its share of the work. There is a small trick for this, in the last few lines of the script: items are distributed across threads with a modulo operation.

import re
from lxml import etree
import requests
import json
import pandas as pd
import time
import csv

import threading



s = requests.Session()
cookie = "++++++++++++++++++++++++++++++"   # placeholder: paste your own weibo.com cookie here
headers2 = {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Connection": "keep-alive",
    "Content-Type": "application/x-www-form-urlencoded",
    "Cookie": cookie,
    "Host": "weibo.com",
    "Referer": "https://weibo.com/u/1549364094?profile_ftype=1&is_all=1",
    "User-Agent": "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "X-Requested-With": "XMLHttpRequest",
}
headers1 = {
    "Connection": "keep-alive",
    "Cookie": cookie,
    "Host": "weibo.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36",
    "Upgrade-Insecure-Requests": "1",
}

def request1(url):

    html=s.get(url,headers=headers2)
    # print(html.text)
    json1=json.loads(html.text)['data']

    return etree.HTML(json1)

def request2(url):

    html = s.get(url,headers=headers1)
    # print(html.text)
    return html.text


lists_all=[]
# url_id="210926262"
content_all = pd.read_csv(r"C:\Users\Lavector\Desktop\百事小红书\lists2.csv", engine='python', header=None).values.tolist()   # engine='python' works around "OSError: Initializing from file failed" for this path
print(content_all)

def download(content):
    try:

        url_id=content[1]
        name=content[0]
        home_url="https://weibo.com/{}?profile_ftype=1&is_all=1#_0".format(url_id)
        print(home_url)
        time.sleep(10)
        a=request2(home_url)
        # print(a)
        content_id = re.findall(r"page_id']='(.*?)';", a)[0]
        domain_id = re.findall(r"domain']='(.*?)';", a)[0]
        MyProfileFeed_id = re.findall(r"Pl_Official_MyProfileFeed__(\d+)", a)[0]
        print(content_id)
        print(domain_id)
        print(MyProfileFeed_id)


        # Personal profile
        username = re.findall('<title>(.*?)的微博', a)[0]   # alternative: re.findall('<h1.*?>(.*?)<', a)
        print(username)
        info = re.findall(',(.*?)的微博主页.*?description',a)[0]
        print(info)
        person_url="https://weibo.com/p/{}/info?mod=pedit_more".format(content_id)
        print(person_url)
        time.sleep(10)
        try:
            if request2(person_url):
                b=request2(person_url)
                info_html=re.findall('domid":"Pl_Official_PersonalInfo__.*?"html":"(.*?)"}',b)[0].strip().replace("\\r","").replace("\\n","").replace("\\","")
                print(info_html)
                info_html=etree.HTML(info_html)

                information={}
                for i in range(len(info_html.xpath('//span[contains(@class,"pt_title")]'))):
                    bb=info_html.xpath('//span[contains(@class,"pt_title")]/text()')[i].strip()
                    cc = ""   # default so the assignment below still works if parsing fails
                    try:
                        if bb == "博客:":
                            cc = info_html.xpath('//li[contains(@class,"li_1")][{}]/a/text()'.format(i + 1))[0].strip()

                        elif bb == "个性域名:":
                            cc = info_html.xpath('//li[contains(@class,"li_1")][{}]/span[2]/a/text()'.format(i + 1))[0].strip()

                        elif bb == "标签:":
                            print("++++++++++++++++111")
                            cc = info_html.xpath('//a[@node-type="tag"]/text()')
                            print(cc)
                            # cc=dd.xpath('string(.)').strip()
                        else:
                            cc = info_html.xpath('//li[contains(@class,"li_1")][{}]/span[2]/text()'.format(i + 1))[0].strip()
                    except:
                        pass
                    information["{}".format(bb)]=cc
                print(information)
                lists_all.append([name,username,info,information])


            # Rewrite the whole result file after each profile; this write is not lock-protected,
            # so concurrent threads can conflict here (see the Lock sketch above)
            with open("lists24.csv", "w", encoding="utf-8", newline="") as f:
                k = csv.writer(f, dialect="excel")
                k.writerow(["名字", "昵称", "info", "简介"])
                for list1 in lists_all:
                    k.writerow(list1)
        except:
            pass

    except:
        lists_all_set=list(set(lists_all))
        with open("lists25.csv", "w", encoding="utf-8", newline="") as f:
            k = csv.writer(f, dialect="excel")
            k.writerow(["名字", "昵称", "info", "简介"])
            for list1 in lists_all_set:
                k.writerow(list1)


# *** Multithreading: one thread per bucket, work distributed by modulo
def worker(bucket):
    for content in bucket:
        download(content)

if __name__ == "__main__":
    length = len(content_all)
    xclist = [[], [], [], [], [], [], [], [], [], []]  # one bucket of requests per thread
    N = len(xclist)
    for i in range(length):
        xclist[i % N].append(content_all[i])           # modulo distribution

    threads = []
    for i in range(N):                                 # number of threads
        t = threading.Thread(target=worker, args=(xclist[i],))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()                                       # wait so the CSVs written in download() are complete



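As an alternative to fixed modulo buckets, worker threads can pull links from a shared queue.Queue, which balances the load automatically when some profiles take longer than others. A minimal sketch, reusing download and content_all from the script above:

import queue
import threading

task_q = queue.Queue()
for content in content_all:
    task_q.put(content)

def worker():
    while True:
        try:
            content = task_q.get_nowait()   # take the next pending link
        except queue.Empty:
            return                          # nothing left, let the thread exit
        try:
            download(content)
        finally:
            task_q.task_done()

threads = [threading.Thread(target=worker) for _ in range(10)]
for t in threads:
    t.start()
task_q.join()                               # block until every link has been processed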

2. Coroutine (gevent) version
*** When using coroutines, put these three lines at the very top of the file, before any other import, and nothing will go wrong:

import gevent
import gevent.monkey

gevent.monkey.patch_all()
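patch_all() swaps the blocking parts of the standard library (sockets, time.sleep and so on) for cooperative versions, so libraries imported afterwards, requests included, yield to other greenlets while they wait on the network. That is why these lines have to come first.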

With coroutines, the main question is again how to distribute the links:

import gevent
import gevent.monkey

gevent.monkey.patch_all()
import re
from lxml import etree
import requests
import json
import pandas as pd
import time
import csv

import threading



s = requests.Session()
cookie = "++++++++"   # placeholder: paste your own weibo.com cookie here
headers2 = {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Connection": "keep-alive",
    "Content-Type": "application/x-www-form-urlencoded",
    "Cookie": cookie,
    "Host": "weibo.com",
    "Referer": "https://weibo.com/u/1549364094?profile_ftype=1&is_all=1",
    "User-Agent": "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "X-Requested-With": "XMLHttpRequest",
}
headers1 = {
    "Connection": "keep-alive",
    "Cookie": cookie,
    "Host": "weibo.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36",
    "Upgrade-Insecure-Requests": "1",
}

def request1(url):

    html=s.get(url,headers=headers2)
    # print(html.text)
    json1=json.loads(html.text)['data']

    return etree.HTML(json1)

def request2(url):

    html = s.get(url,headers=headers1)
    # print(html.text)
    return html.text


lists_all=[]
# url_id="210926262"
content_all = pd.read_csv(r"C:\Users\Lavector\Desktop\百事小红书\lists2.csv", engine='python', header=None).values.tolist()   # engine='python' works around "OSError: Initializing from file failed" for this path
print(content_all)

def download(content):
    try:

        url_id=content[1]
        name=content[0]
        home_url="https://weibo.com/{}?profile_ftype=1&is_all=1#_0".format(url_id)
        print(home_url)
        time.sleep(10)
        a=request2(home_url)
        # print(a)
        content_id = re.findall(r"page_id']='(.*?)';", a)[0]
        domain_id = re.findall(r"domain']='(.*?)';", a)[0]
        MyProfileFeed_id = re.findall(r"Pl_Official_MyProfileFeed__(\d+)", a)[0]
        print(content_id)
        print(domain_id)
        print(MyProfileFeed_id)


        # Personal profile
        username = re.findall('<title>(.*?)的微博', a)[0]   # alternative: re.findall('<h1.*?>(.*?)<', a)
        print(username)
        info = re.findall(',(.*?)的微博主页.*?description',a)[0]
        print(info)
        person_url="https://weibo.com/p/{}/info?mod=pedit_more".format(content_id)
        print(person_url)
        time.sleep(10)
        try:
            if request2(person_url):
                b=request2(person_url)
                info_html=re.findall('domid":"Pl_Official_PersonalInfo__.*?"html":"(.*?)"}',b)[0].strip().replace("\\r","").replace("\\n","").replace("\\","")
                print(info_html)
                info_html=etree.HTML(info_html)

                information={}
                for i in range(len(info_html.xpath('//span[contains(@class,"pt_title")]'))):
                    bb=info_html.xpath('//span[contains(@class,"pt_title")]/text()')[i].strip()
                    cc = ""   # default so the assignment below still works if parsing fails
                    try:
                        if bb == "博客:":
                            cc = info_html.xpath('//li[contains(@class,"li_1")][{}]/a/text()'.format(i + 1))[0].strip()

                        elif bb == "个性域名:":
                            cc = info_html.xpath('//li[contains(@class,"li_1")][{}]/span[2]/a/text()'.format(i + 1))[0].strip()

                        elif bb == "标签:":
                            print("++++++++++++++++111")
                            cc = info_html.xpath('//a[@node-type="tag"]/text()')
                            print(cc)
                            # cc=dd.xpath('string(.)').strip()
                        else:
                            cc = info_html.xpath('//li[contains(@class,"li_1")][{}]/span[2]/text()'.format(i + 1))[0].strip()
                    except:
                        pass
                    information["{}".format(bb)]=cc
                print(information)
                lists_all.append([name,username,info,information])


            with open("lists24.csv", "w", encoding="utf-8", newline="") as f:
                k = csv.writer(f, dialect="excel")
                k.writerow(["名字", "昵称", "info", "简介"])
                for list1 in lists_all:
                    k.writerow(list1)
        except:
            pass

    except:
        lists_all_set=list(set(lists_all))
        with open("lists25.csv", "w", encoding="utf-8", newline="") as f:
            k = csv.writer(f, dialect="excel")
            k.writerow(["名字", "昵称", "info", "简介"])
            for list1 in lists_all_set:
                k.writerow(list1)

# Coroutine (gevent) version
if __name__ == "__main__":
    length = len(content_all)
    xclist = []  # one greenlet per link
    for i in range(length):
        xclist.append(gevent.spawn(download, content_all[i]))
    print(xclist)

    gevent.joinall(xclist)   # wait for every greenlet to finish
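Spawning one greenlet per link fires every request at almost the same time. If that is too aggressive for the target site, gevent.pool.Pool caps how many greenlets run at once. A minimal sketch, assuming the same download and content_all:

import gevent
from gevent.pool import Pool as GeventPool

if __name__ == "__main__":
    pool = GeventPool(20)                                    # at most 20 greenlets in flight
    jobs = [pool.spawn(download, content) for content in content_all]
    gevent.joinall(jobs)                                     # wait for all crawls to finish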