Publishing the crawler examples I wrote earlier for easy reference. The code has been consolidated into functions; the contents of the functions are not explained further.
import time
from concurrent.futures import ThreadPoolExecutor

import requests
import re
import csv
from bs4 import BeautifulSoup
import os
from lxml import etree

def top250():
    # re example 1
    for a in range(0, 250, 25):
        url = "https://movie.douban.com/top250?start={0}".format(a)
        # print(url)
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 SLBrowser/8.0.0.3161 SLBChan/25"
        }
        resp = requests.get(url, headers=headers)  # GET with a UA header to dodge basic anti-bot checks
        page_content = resp.text
        obj = re.compile(r'<li>.*?<em class="">(?P<ranking>.*?)</em>'
                         r'.*?<span class="title">(?P<name>.*?)</span>.*?'
                         r'.*?<br>(?P<year>.*?)&nbsp;.*?'
                         r'.*?<span class="rating_num" property="v:average">(?P<score>.*?)</span>'
                         r'.*?<span>(?P<number>.*?)</span>'
                         , re.S)
        result = obj.finditer(page_content)

        f = open("DoubanTop250.csv", mode="a", newline="", encoding="utf-8")  # newline="" keeps csv from writing blank rows on Windows
        csvwriter = csv.writer(f)

        for i in result:
            # print("ranking: {0}; title: {1}; year: {2}; score: {3}".format(
            #     i.group("ranking"),
            #     i.group("name"),
            #     i.group("year").strip(),
            #     i.group("score"))
            # )
            dic = i.groupdict()
            dic['year'] = dic['year'].strip()
            csvwriter.writerow(dic.values())
        f.close()
        time.sleep(2)  # delay between pages to dodge the anti-scraping checks, otherwise the ip gets banned
        print("Collected {0} entries".format(a + 25))

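# A minimal sketch (mine, not part of the original crawler) of the technique top250
# relies on: re.S lets .*? cross newlines, and groupdict() turns every (?P<name>...)
# capture into a dict entry, which is what gets written to the csv above.
def _groupdict_demo():
    snippet = '<li>\n<em class="">1</em>\n<span class="title">肖申克的救赎</span>\n</li>'
    pattern = re.compile(r'<em class="">(?P<ranking>.*?)</em>.*?<span class="title">(?P<name>.*?)</span>', re.S)
    print(pattern.search(snippet).groupdict())  # {'ranking': '1', 'name': '肖申克的救赎'}
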
def MovieDownload():
    # re example 2
    domain = "https://dytt89.com/"
    resp = requests.get(domain, verify=False)  # the special verify=False flag skips the site's broken certificate check
    resp.encoding = 'gbk'  # the site is GBK-encoded

    # print(resp.text)

    f = open("Dytt2022新片精品电影下载地址.csv", mode="a", newline="", encoding="utf-8")
    csvwriter = csv.writer(f)

    obj1 = re.compile(r'2022新片精品.*?'
                      r'<ul>(?P<ul>.*?)</ul>', re.S)

    obj2 = re.compile(r'''<li><a href='(?P<href>.*?)' title="''', re.S)

    obj3 = re.compile(r'<div class="title_all"><h1>(?P<movie>.*?)</h1></div>.*?'
                      r'<td style="WORD-WRAP: break-word" bgcolor="#fdfddf">'
                      r'<a href="(?P<download>.*?)">', re.S)
    child_href_list = []
    result1 = obj1.finditer(resp.text)
    for i in result1:  # first layer: locate the block in the main page's html
        ul = i.group('ul')
        # print(ul)
        # time.sleep(1)
        result2 = obj2.finditer(ul)

        for j in result2:  # second layer: collect the links into each detail page
            # time.sleep(1)
            child_href = domain + j.group('href').strip("/")  # join the domain with the child page's relative path
            child_href_list.append(child_href)
    k = 0
    for href in child_href_list:
        child_resp = requests.get(href, verify=False)
        child_resp.encoding = 'gbk'
        result3 = obj3.search(child_resp.text)  # pull out the download info we need
        if result3 is None:  # some detail pages lack the download table; skip them instead of crashing
            continue
        # print(result3.group('movie'))
        # print(result3.group('download'))
        dic = result3.groupdict()
        csvwriter.writerow(dic.values())
        k = k + 1  # counter
        print("Collected {0} movies so far".format(k))
    f.close()

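# verify=False works, but urllib3 then prints an InsecureRequestWarning for every
# request. A minimal sketch of silencing it with the standard urllib3 API (call it
# once before MovieDownload); this is an addition of mine, not something the
# original script did.
def _silence_certificate_warnings():
    import urllib3
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
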
def VegetableValue():
    # POST example
    url = "http://www.xinfadi.com.cn/getPriceData.html"
    # the page source carries no data, so use F12 / the network panel to find the URL the data actually comes from
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 SLBrowser/8.0.0.3161 SLBChan/25"
    }

    f = open("菜价.csv", mode="a", newline="", encoding="utf-8")
    csvwriter = csv.writer(f)

    data = {'limit': 3}  # how many records to fetch
    # difference between fetching this endpoint with GET and POST:
    # a plain GET just returns the default payload the page itself loads
    # POST lets the form body (data) control how many records come back
    # resp = requests.get(url, headers=head).json()
    resp = requests.post(url, headers=head, data=data).json()
    lis = resp.get('list')

    for i in lis:
        name = i.get("prodName")
        low_price = i.get("lowPrice")
        high_price = i.get("highPrice")
        average_price = i.get("avgPrice")
        producing_area = i.get("place")
        unit = i.get("unitInfo")
        date = i.get("pubDate")

        csvwriter.writerow([name, low_price, high_price, average_price, producing_area, unit, date])
    f.close()

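# A minimal sketch (toy httpbin.org URLs, not the xinfadi endpoint) of the GET/POST
# difference described in the comments above: with requests, GET sends key/value
# pairs in the query string via params=, POST sends them in the request body via data=.
def _get_vs_post_demo():
    payload = {"limit": 3}
    r_get = requests.get("https://httpbin.org/get", params=payload)    # ...?limit=3 in the URL
    r_post = requests.post("https://httpbin.org/post", data=payload)   # limit=3 in the form body
    print(r_get.url)            # https://httpbin.org/get?limit=3
    print(r_post.request.body)  # limit=3
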
def CatchPicture(url):
    # bs4 example
    url_download = "https://pic.netbian.com/"
    os.makedirs("img2", exist_ok=True)  # make sure the output folder exists before writing into it
    resp = requests.get(url)
    resp.encoding = "gbk"
    # print(resp.text)
    main_page = BeautifulSoup(resp.text, "html.parser")
    alist = main_page.find("div", class_="slist").find_all("a")
    # print(alist)
    for a in alist:
        # print(a.get('href'))
        href = url_download + a.get('href').strip("/")  # join the domain with the child page's relative path
        # print(href)
        child_page_resp = requests.get(href)
        child_page_resp.encoding = 'gbk'
        child_page_text = child_page_resp.text
        child_page = BeautifulSoup(child_page_text, "html.parser")
        img = child_page.find("div", class_="photo").find("img")
        img_name = img.get("title")
        # print(img.get("src"))
        src = url_download + img.get("src").strip("/")
        # print(src)
        # print(img_name)
        img_resp = requests.get(src)
        img_bytes = img_resp.content  # the raw bytes of the image
        with open("img2/" + img_name + ".jpeg", mode="wb") as f:
            f.write(img_bytes)

        print(img_name + " downloaded!!")
        # break
        time.sleep(0.5)  # necessary delay to dodge the anti-scraping checks

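# CatchPicture holds each whole image in memory via resp.content, which is fine for
# wallpapers. A minimal sketch of the streaming alternative for bigger files, using
# the standard requests stream=True / iter_content API (my addition, not from the
# original script):
def _stream_download(src, path):
    with requests.get(src, stream=True) as resp:
        with open(path, mode="wb") as f:
            for chunk in resp.iter_content(chunk_size=8192):  # write 8 KB at a time
                f.write(chunk)
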
def Xpath():
    # for live sites this somehow always returned empty lists and found no data, yet the same
    # expressions reach the target nodes easily on a hand-written html file (see the sketch below)

    tree = etree.parse('file:///C:/Users/86183/Desktop/1.html')
    r1 = tree.xpath('/html/body/div[2]/p')  # walk straight down the node path from the root
    # /html/body/div[2]/p[1]
    for div in r1:
        # /html/body/div[2]/p[1]
        a = div.xpath('./text()')
        print(a)
    # in the browser console the basic form for testing xpath is $x("xpath expression");
    # a correct expression returns its matches, a wrong one returns nothing

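# A likely explanation for the empty lists noted above: etree.parse() expects
# well-formed XML, while live pages are usually loose HTML. A minimal sketch of the
# usual workaround: fetch the page yourself and build the tree with etree.HTML(),
# whose forgiving HTML parser recovers the structure xpath needs.
def _xpath_from_live_page(url):
    resp = requests.get(url)
    tree = etree.HTML(resp.text)          # tolerant HTML parser instead of strict XML
    return tree.xpath('//title/text()')   # e.g. grab the page title
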
def Vidio():
    # working around pearvideo's referer hotlink protection
    url = "https://www.pearvideo.com/video_1733893"  # the page whose video we want
    contId = url.split("_")[1]  # grabs 1733893

    resp = requests.get(url)
    resp.encoding = "utf-8"
    # print(resp.text)
    main_page = BeautifulSoup(resp.text, "html.parser")
    title = main_page.find("div", class_="box-left clear-mar").find("h1").text
    # print(title)

    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 SLBrowser/8.0.0.3161 SLBChan/25"
        # Referer anti-hotlinking: the server traces where a request came from, and the visit
        # order must be 1->2->3, so the Referer header fakes that state; going straight from 1 to 3 fails
        , "Referer": url
    }
    vidio_status = f"https://www.pearvideo.com/videoStatus.jsp?contId={contId}&mrd=0.5623242368176109"
    resp = requests.get(vidio_status, headers=header)
    # print(resp.text)
    dic = resp.json()
    # print(dic)

    srcUrl = dic["videoInfo"]["videos"]['srcUrl']
    systemTime = dic['systemTime']  # systemTime: 1660186591481
    # fake: https://video.pearvideo.com/mp4/adshort/20210701/1660186531481-15708981_adpkg-ad_hd.mp4
    # real: https://video.pearvideo.com/mp4/adshort/20210701/cont-1733893-15708981_adpkg-ad_hd.mp4
    srcUrl_true = srcUrl.replace(systemTime, f"cont-{contId}")  # compare the two links and swap out the useless part of the stale one
    # print(srcUrl_true)
    os.makedirs("videos", exist_ok=True)  # make sure the output folder exists
    with open("videos/" + title + ".mp4", mode="wb") as f:
        f.write(requests.get(srcUrl_true).content)
    print(title + " download finished!")

def aiodownload(cid, title, book):
    # downloads one chapter (synchronous despite the name), following the in-chapter "next page" links
    url = f"https://www.23qb.com/book/{cid}.html"
    page = 1
    os.makedirs("novels", exist_ok=True)  # make sure the output folder exists
    with open(f"novels/{book}.txt", mode="a+", encoding="utf-8") as f:
        f.write("\n")
        f.write("\n" + title + "\n")
        f.write("\n")
        while True:
            resp = requests.get(url).text
            page_thing = BeautifulSoup(resp.replace('\ufffd', ''), "html.parser")  # drop stray U+FFFD replacement characters first
            lists = page_thing.find_all("div", class_="read-content")

            for texts in lists:  # normally a single read-content div
                text = texts.find_all('p')
                del text[-1]  # the last <p> is navigation, not chapter text
            if text[-1].string == "(继续下一页)":  # chapter continues on another page
                del text[-1]
                page = page + 1

                url = f"https://www.23qb.com/book/{cid}_{page}.html"

                for line in text:
                    txt = line.string

                    try:
                        f.write(txt + "\n")
                    except Exception:  # line.string is None for markup-only paragraphs
                        f.write("!!!!!!!!" + "\n")
                        continue

                continue  # fetch the chapter's next page
            else:
                for line in text:
                    txt = line.string

                    try:
                        f.write(txt + "\n")
                    except Exception:  # line.string is None for markup-only paragraphs
                        f.write("!!!!!!!!" + "\n")
                        continue

                break  # chapter complete
    print(title + " downloaded")

def getCatalog(url):
    resp = requests.get(url)

    # print(resp.text)
    obj1 = re.compile(r'<meta property="og:novel:book_name" content="(?P<book>.*?)"/>.*?'
                      r'<ul class="chaw_c" id="chapterList">(?P<url>.*?)</ul>', re.S)
    obj2 = re.compile(r'<li><a href="/book/(?P<c_id>.*?).html">(?P<name>.*?).</a></li>', re.S)
    main_page = resp.text
    result = obj1.finditer(main_page)
    for i in result:
        ul = i.group('url')
        book = i.group("book")
        # print(ul)
        result2 = obj2.finditer(ul)
        for ii in result2:
            cid = ii.group("c_id")
            title = ii.group("name")

            aiodownload(cid, title, book)

    print(book + " finished downloading!")

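# getCatalog drives aiodownload strictly one chapter at a time. A minimal sketch of
# parallel fetching with the same ThreadPoolExecutor pattern used for pictures below,
# while keeping chapter order: executor.map returns results in input order, so the
# writes stay sequential. fetch_chapter is a hypothetical helper, not defined in this
# script (it would have to return a chapter's text instead of writing it directly).
def _parallel_chapters_demo(cids, fetch_chapter):
    with ThreadPoolExecutor(10) as t:
        for chapter_text in t.map(fetch_chapter, cids):  # results come back in cid order
            print(chapter_text[:40])  # e.g. hand each chapter to a single writer here
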
if __name__ == '__main__':
    # top250()
    # MovieDownload()
    # VegetableValue()

    # main block for CatchPicture
    '''
    start_time = time.time()
    with ThreadPoolExecutor(10) as t:  # only 10 threads opened here; more are possible
        for i in range(2, 119):
            t.submit(CatchPicture, f"https://pic.netbian.com/4kdongman/index_{i}.html")
            time.sleep(1)  # with multiple threads, keep the delay so the anti-scraping checks don't ban your ip (this site has banned four of my ips already)
            print(f"page {i} downloaded")  # if the console only ever prints this line, the site has already blocked the ip
    print("all pages downloaded")
    end_time = time.time()
    print('total time:', round(end_time - start_time, 2), 'seconds')
    '''

    # Xpath()

    # Vidio()

    # novel download

    start_time = time.time()
    b_id = "116418"  # input("enter the id of the book to download:")  # "60218", "27309", "4286", "719", "189697"
    url = f"https://www.23qb.com/book/{b_id}/"
    getCatalog(url)

    end_time = time.time()
    print("Download took:", round(end_time - start_time, 2), 'seconds')