
Python Crawler Code: No Tricks, Ready to Use

Because of posting rules, if you want to crawl other major websites, please see my other published articles for details.

This code is for educational use only; any illegal use has nothing to do with this post.

 

Clean, well-structured code, free for everyone to use and learn from (Baidu Image Search)

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import argparse
import os
import re
import sys
import json
import socket
import time
import urllib.request
import urllib.parse
import urllib.error

# Set a global socket timeout (seconds)
timeout = 5
socket.setdefaulttimeout(timeout)


class Crawler:
    __amount = 0
    __start_amount = 0
    __counter = 0
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0', 'Cookie': ''}
    __per_page = 30

    # t: sleep interval between image downloads (seconds)
    def __init__(self, t=0.1):
        self.time_sleep = t

    # Extract the file extension; fall back to .jpg
    @staticmethod
    def get_suffix(name):
        m = re.search(r'\.[^\.]*$', name)
        if m and len(m.group(0)) <= 5:
            return m.group(0)
        else:
            return '.jpg'

    @staticmethod
    def handle_baidu_cookie(original_cookie, cookies):
        """
        :param string original_cookie:
        :param list cookies:
        :return string:
        """
        if not cookies:
            return original_cookie
        result = original_cookie
        # make sure a separator exists before appending new cookies
        if result and not result.endswith(';'):
            result += ';'
        for cookie in cookies:
            result += cookie.split(';')[0] + ';'
        # rstrip returns a new string, so the result must be used, not discarded
        return result.rstrip(';')

    # Save images
    def save_image(self, rsp_data, word):
        if not os.path.exists("./" + word):
            os.mkdir("./" + word)
        # Avoid filename collisions: continue numbering from the existing file count
        self.__counter = len(os.listdir('./' + word)) + 1
        for image_info in rsp_data['data']:
            try:
                if 'replaceUrl' not in image_info or len(image_info['replaceUrl']) < 1:
                    continue
                obj_url = image_info['replaceUrl'][0]['ObjUrl']
                thumb_url = image_info['thumbURL']
                url = 'https://image.baidu.com/search/down?tn=download&ipn=dwnl&word=download&ie=utf8&fr=result&url=%s&thumburl=%s' % (urllib.parse.quote(obj_url), urllib.parse.quote(thumb_url))
                time.sleep(self.time_sleep)
                suffix = self.get_suffix(obj_url)
                # Set a User-Agent to reduce 403 responses
                opener = urllib.request.build_opener()
                opener.addheaders = [
                    ('User-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'),
                ]
                urllib.request.install_opener(opener)
                # Save the image
                filepath = './%s/%s' % (word, str(self.__counter) + str(suffix))
                urllib.request.urlretrieve(url, filepath)
                if os.path.getsize(filepath) < 5:
                    print("Downloaded an empty file, skipping!")
                    os.unlink(filepath)
                    continue
            except urllib.error.HTTPError as urllib_err:
                print(urllib_err)
                continue
            except Exception as err:
                time.sleep(1)
                print(err)
                print("Unknown error, skipping this image")
                continue
            else:
                print("Image +1, " + str(self.__counter) + " images saved so far")
                self.__counter += 1
        return

    # Fetch image metadata page by page
    def get_images(self, word):
        search = urllib.parse.quote(word)
        # pn: offset of the first result on the current page
        pn = self.__start_amount
        while pn < self.__amount:
            url = 'https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%s&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=&hd=&latest=&copyright=&word=%s&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&pn=%s&rn=%d&gsm=1e&1594447993172=' % (search, search, str(pn), self.__per_page)
            # Send the headers to avoid a 403
            try:
                time.sleep(self.time_sleep)
                req = urllib.request.Request(url=url, headers=self.headers)
                page = urllib.request.urlopen(req)
                self.headers['Cookie'] = self.handle_baidu_cookie(self.headers['Cookie'], page.info().get_all('Set-Cookie'))
                rsp = page.read()
                page.close()
            except UnicodeDecodeError as e:
                print(e)
                print('-----UnicodeDecodeError url:', url)
            except urllib.error.URLError as e:
                print(e)
                print("-----URLError url:", url)
            except socket.timeout as e:
                print(e)
                print("-----socket timeout:", url)
            else:
                # Parse the JSON response
                rsp_data = json.loads(rsp, strict=False)
                if 'data' not in rsp_data:
                    print("Anti-crawling triggered, retrying automatically!")
                else:
                    self.save_image(rsp_data, word)
                    # Move on to the next page
                    print("Downloading next page")
                    pn += self.__per_page
        return

    def start(self, word, total_page=1, start_page=1, per_page=30):
        """
        Crawler entry point
        :param word: keyword to crawl
        :param total_page: number of pages to crawl; total images = total_page x per_page
        :param start_page: first page number
        :param per_page: images per page
        :return:
        """
        self.__per_page = per_page
        self.__start_amount = (start_page - 1) * self.__per_page
        # index of the first image to fetch
        self.__amount = total_page * self.__per_page + self.__start_amount
        # total number of images to fetch
        self.get_images(word)


if __name__ == '__main__':
    if len(sys.argv) > 1:
        parser = argparse.ArgumentParser()
        parser.add_argument("-w", "--word", type=str, help="keyword to crawl", required=True)
        parser.add_argument("-tp", "--total_page", type=int, help="total number of pages to crawl", required=True)
        parser.add_argument("-sp", "--start_page", type=int, help="start page number", required=True)
        parser.add_argument("-pp", "--per_page", type=int, help="images per page", choices=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100], default=30, nargs='?')
        parser.add_argument("-d", "--delay", type=float, help="delay between requests (seconds)", default=0.05)
        args = parser.parse_args()
        crawler = Crawler(args.delay)
        crawler.start(args.word, args.total_page, args.start_page, args.per_page)
    else:
        # Without command-line arguments, the defaults below are used
        crawler = Crawler(0.05)  # request delay of 0.05 s
        crawler.start('迫击炮手', 40, 2, 30)  # keyword "迫击炮手": 40 pages of 30 starting at page 2 (1200 images total)
        # crawler.start('手机', 10, 1)  # keyword "手机": 10 pages starting at page 1
        # crawler.start('', 5)  # 5 pages with an empty keyword
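If you save the listing above as, say, baidu_crawler.py (the filename is my placeholder), the argparse flags defined in the __main__ block let you drive it from the command line. A typical invocation, with an illustrative keyword and page counts:

    python baidu_crawler.py -w "猫" -tp 2 -sp 1 -pp 30 -d 0.1

This fetches 2 pages of 30 results starting from page 1, about 60 images, and saves them into a ./猫/ folder named after the keyword.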

The first argument of the last call is the keyword you want to search for; see the code for further details.
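For example, to download a different category of images, only that last call needs to change. A minimal sketch with illustrative values (this runs inside the same script, where Crawler is already defined):

    crawler = Crawler(0.1)           # 0.1 s between downloads
    crawler.start('风景', 2, 1, 30)  # keyword "风景": 2 pages of 30 starting at page 1, 60 images in total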

[Screenshot: the crawler in use]

If you have any other questions, ask in the comments section.

 

 
