A Collection of Python Web-Scraping Examples

1. Overview

This post collects the scraper examples I wrote earlier in one place for easy reference. The examples are consolidated as functions in a single script, and the code itself is not explained in detail.

2. Code

import time
from concurrent.futures import ThreadPoolExecutor
import requests
import re
import csv
from bs4 import BeautifulSoup
import os
from lxml import etree

def top250():
    # re example 1
    for a in range(0, 250, 25):
        url = "https://movie.douban.com/top250?start={0}".format(a)
        # print(url)
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 SLBrowser/8.0.0.3161 SLBChan/25"
        }
        resp = requests.get(url, headers=headers)  # send a browser UA with the GET request to get past the anti-bot check
        page_content = resp.text
        obj = re.compile(r'<li>.*?<em class="">(?P<ranking>.*?)</em>'
                         r'.*?<span class="title">(?P<name>.*?)</span>.*?'
                         r'.*?<br>(?P<year>.*?)&nbsp.*?'
                         r'.*?<span class="rating_num" property="v:average">(?P<score>.*?)</span>'
                         r'.*?<span>(?P<number>.*?)</span>', re.S)
        result = obj.finditer(page_content)
        f = open("DoubanTop250.csv", mode="a", newline="")  # newline="" keeps csv from inserting blank rows
        csvwriter = csv.writer(f)
        for i in result:
            # print("排名:{0}; 电影名:{1}; 年份:{2}; 评分:{3} ".format(
            #     i.group("ranking"),
            #     i.group("name"),
            #     i.group("year").strip(),
            #     i.group("score"))
            # )
            dic = i.groupdict()
            dic['year'] = dic['year'].strip()
            csvwriter.writerow(dic.values())
        f.close()
        time.sleep(2)  # delay between pages to dodge the anti-crawl check, otherwise the IP gets banned
        print("收集到{0}个信息".format(a + 25))

def MovieDownload():
    # re example 2
    domain = "https://dytt89.com/"
    resp = requests.get(domain, verify=False)  # verify=False skips certificate verification for this site
    resp.encoding = 'gbk'  # the page is GBK-encoded
    # print(resp.text)
    f = open("Dytt2022新片精品电影下载地址.csv", mode="a", newline="")
    csvwriter = csv.writer(f)
    obj1 = re.compile(r'2022新片精品.*?'
                      r'<ul>(?P<ul>.*?)</ul>', re.S)
    obj2 = re.compile(r'''<li><a href='(?P<href>.*?)' title="''', re.S)
    obj3 = re.compile(r'<div class="title_all"><h1>(?P<movie>.*?)</h1></div>.*?'
                      r'<td style="WORD-WRAP: break-word" bgcolor="#fdfddf">'
                      r'<a href="(?P<download>.*?)">', re.S)
    child_href_list = []
    result1 = obj1.finditer(resp.text)
    for i in result1:  # first pass: pull the target <ul> block out of the home page
        ul = i.group('ul')
        # print(ul)
        # time.sleep(1)
        result2 = obj2.finditer(ul)
        for j in result2:  # second pass: collect the link to each detail page
            # time.sleep(1)
            child_href = domain + j.group('href').strip("/")  # join the domain with the relative child-page path
            child_href_list.append(child_href)
    k = 0
    for href in child_href_list:
        child_resp = requests.get(href, verify=False)
        child_resp.encoding = 'gbk'
        result3 = obj3.search(child_resp.text)  # extract the movie title and its download link
        # print(result3.group('movie'))
        # print(result3.group('download'))
        dic = result3.groupdict()
        csvwriter.writerow(dic.values())
        k = k + 1  # counter
        print("已收集到{0}个电影".format(k))
    f.close()

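# Note: verify=False makes urllib3 emit an InsecureRequestWarning for every request.
# If the warning noise is unwanted it can be silenced (optional, not part of the
# original script):
#   import urllib3
#   urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
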
def VegetableValue():
    # POST example
    url = "http://www.xinfadi.com.cn/getPriceData.html"
    # the page source itself carries no data, so use the data URL captured with the F12 network panel
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 SLBrowser/8.0.0.3161 SLBChan/25"
    }
    f = open("菜价.csv", mode="a", newline="")
    csvwriter = csv.writer(f)
    data = {'limit': 3}  # number of records to request
    # difference between GET and POST here:
    # GET simply returns whatever the current page offers
    # POST lets the form data control how many records come back
    # resp = requests.get(url, headers=head).json()
    resp = requests.post(url, headers=head, data=data).json()
    lis = resp.get('list')
    for i in lis:
        name = i.get("prodName")
        low_price = i.get("lowPrice")
        high_price = i.get("highPrice")
        average_price = i.get("avgPrice")
        producing_area = i.get("place")
        unit = i.get("unitInfo")
        date = i.get("pubDate")
        csvwriter.writerow([name, low_price, high_price, average_price, producing_area, unit, date])
    f.close()

def CatchPicture(url):
    # bs4 example
    url_download = "https://pic.netbian.com/"
    resp = requests.get(url)
    resp.encoding = "gbk"
    # print(resp.text)
    main_page = BeautifulSoup(resp.text, "html.parser")
    alist = main_page.find("div", class_="slist").find_all("a")
    # print(alist)
    for a in alist:
        # print(a.get('href'))
        href = url_download + a.get('href').strip("/")  # join the domain with the relative child-page path
        # print(href)
        child_page_resp = requests.get(href)
        child_page_resp.encoding = 'gbk'
        child_page_text = child_page_resp.text
        child_page = BeautifulSoup(child_page_text, "html.parser")
        img = child_page.find("div", class_="photo").find("img")
        img_name = child_page.find("div", class_="photo").find("img").get("title")
        # print(img.get("src"))
        src = url_download + img.get("src").strip("/")
        # print(src)
        # print(img_name)
        img_resp = requests.get(src)
        img = img_resp.content  # raw bytes of the image
        with open("img2/" + img_name + ".jpeg", mode="wb") as f:
            f.write(img)
        print(img_name + "下载好了!!")
        # break
        time.sleep(0.5)  # necessary delay to avoid the anti-crawl ban

def Xpath():
    # for some reason the live site always returns an empty list, yet a hand-made local HTML file resolves the target nodes without trouble
    tree = etree.parse('file:///C:/Users/86183/Desktop/1.html')
    r1 = tree.xpath('/html/body/div[2]/p')  # walk the nodes from the root, level by level
    # /html/body/div[2]/p[1]
    for div in r1:
        # /html/body/div[2]/p[1]
        a = div.xpath('./text()')
        print(a)
    # in the browser console an XPath can be tested with $x("xpath expression"); a correct expression returns nodes, a wrong one returns nothing

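def XpathSelfCheck():
    # A quick way to sanity-check an XPath expression without the live site: parse an
    # inline HTML string with lxml and query it. This helper (and its markup) is a
    # hypothetical addition, not part of the original examples.
    sample = etree.HTML("<html><body><div>a</div><div><p>hello</p></div></body></html>")
    print(sample.xpath('/html/body/div[2]/p/text()'))  # expected output: ['hello']
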
def Vidio():
    # work around pearvideo's Referer-based hotlink protection
    url = "https://www.pearvideo.com/video_1733893"  # the video page to fetch
    contId = url.split("_")[1]  # take the 1733893 part
    resp = requests.get(url)
    resp.encoding = "utf-8"
    # print(resp.text)
    main_page = BeautifulSoup(resp.text, "html.parser")
    title = main_page.find("div", class_="box-left clear-mar").find("h1").text
    # print(title)
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 SLBrowser/8.0.0.3161 SLBChan/25",
        # Referer hotlink protection: the server checks where the request came from; the visit order has to be 1->2->3,
        # so a Referer header simulates that state; going straight from 1 to 3 fails
        "Referer": url
    }
    vidio_status = f"https://www.pearvideo.com/videoStatus.jsp?contId={contId}&mrd=0.5623242368176109"
    resp = requests.get(vidio_status, headers=header)
    # print(resp.text)
    dic = resp.json()
    # print(dic)
    srcUrl = dic["videoInfo"]["videos"]['srcUrl']
    systemTime = dic['systemTime']  # systemTime: 1660186591481
    # fake: https://video.pearvideo.com/mp4/adshort/20210701/1660186531481-15708981_adpkg-ad_hd.mp4
    # real: https://video.pearvideo.com/mp4/adshort/20210701/cont-1733893-15708981_adpkg-ad_hd.mp4
    srcUrl_true = srcUrl.replace(systemTime, f"cont-{contId}")  # compare the two URLs and replace the useless timestamp part with the real cont-id
    # print(srcUrl_true)
    with open("videos/" + title + ".mp4", mode="wb") as f:
        f.write(requests.get(srcUrl_true).content)
    print(title + "下载完成!")

def aiodownload(cid, title, book):
    url = f"https://www.23qb.com/book/{cid}.html"
    page = 1
    with open(f"novels/{book}.txt", mode="a+") as f:
        f.write("\n")
        f.write("\n" + title + "\n")
        f.write("\n")
        last_page = False
        while not last_page:
            resp = requests.get(url).text
            page_thing = BeautifulSoup(resp.replace('\ufffd', ''), "html.parser")
            lists = page_thing.find_all("div", class_="read-content")
            for texts in lists:
                text = texts.find_all('p')
                del text[-1]  # drop the trailing navigation paragraph
                if text[-1].string == "(继续下一页)":
                    # the chapter continues on another page: drop the marker and queue the next page
                    del text[-1]
                    page = page + 1
                    url = f"https://www.23qb.com/book/{cid}_{page}.html"
                else:
                    last_page = True  # no continuation marker, so this is the chapter's last page
                for line in text:
                    txt = line.string
                    try:
                        f.write(txt + "\n")
                    except Exception:
                        f.write("!!!!!!!!" + "\n")  # placeholder for a paragraph that cannot be written
                        continue
    print(title + "下载完成")

def getCatalog(url):
    resp = requests.get(url)
    # print(resp.text)
    obj1 = re.compile(r'<meta property="og:novel:book_name" content="(?P<book>.*?)"/>.*?'
                      r'<ul class="chaw_c" id="chapterList">(?P<url>.*?)</ul>', re.S)
    obj2 = re.compile(r'<li><a href="/book/(?P<c_id>.*?).html">(?P<name>.*?).</a></li>', re.S)
    main_page = resp.text
    result = obj1.finditer(main_page)
    for i in result:
        ul = i.group('url')
        book = i.group("book")
        # print(ul)
        result2 = obj2.finditer(ul)
        for ii in result2:
            cid = ii.group("c_id")
            title = ii.group("name")
            aiodownload(cid, title, book)
        print(book + "下载完成!")

if __name__ == '__main__':
    # top250()
    # MovieDownload()
    # VegetableValue()
    # driver for CatchPicture
    '''
    start_time = time.time()
    with ThreadPoolExecutor(10) as t:  # only 10 threads are opened here, more are possible
        for i in range(2, 119):
            t.submit(CatchPicture, f"https://pic.netbian.com/4kdongman/index_{i}.html")
            time.sleep(1)  # keep a delay even with threads so the site does not ban the IP (it has already banned four of mine)
            print(f"第{i}页内容下载完毕")  # if the console only ever prints this line, the IP has been blocked by the site
    print("全部下载完毕")
    end_time = time.time()
    print('总共的时间为:', round(end_time - start_time, 2), '')
    '''
    # Xpath()
    # Vidio()
    # novel download
    start_time = time.time()
    b_id = "116418"  # input("输入你想下载的书的id:")  # other ids: "60218", "27309", "4286", "719", "189697"
    url = f"https://www.23qb.com/book/{b_id}/"
    getCatalog(url)
    end_time = time.time()
    print("下载时间为:", round(end_time - start_time, 2), '秒')
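
The commented-out CatchPicture driver above boils down to one pattern: a thread pool that submits one download task per page, with a short sleep between submissions so the target site does not ban the IP. Below is a minimal, self-contained sketch of that pattern; the fetch_page worker and the example.com URLs are placeholders, not part of the script above.

import time
from concurrent.futures import ThreadPoolExecutor

def fetch_page(url):
    # placeholder worker; the real driver submits CatchPicture here
    print("fetching", url)

if __name__ == "__main__":
    with ThreadPoolExecutor(10) as pool:  # 10 worker threads
        for i in range(2, 6):  # a few example pages
            pool.submit(fetch_page, f"https://example.com/index_{i}.html")
            time.sleep(1)  # throttle submissions (anti-crawl delay)
    print("all pages submitted")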
