当前位置:   article > 正文

基于selenium的爬取百度和必应图片_selenium百度图片搜索

selenium百度图片搜索

一、前提准备

        1.下载selenium、requests库

  1. pip install requests
  2. pip install selenium

        2.在自己目录下创建好dataset目录,并在其下再创建两个文件夹,分别命名为Image、HrefJson(否则需要自己改动里面的代码)

 注意:使用selenium前需知道自己浏览器对应的版本驱动!!!(我使用的浏览器为谷歌浏览器,若不是谷歌,需自行修改代码)

 二、代码

  1. #导入资源
  2. from selenium.webdriver import Chrome
  3. from selenium.webdriver.chrome.options import Options
  4. from selenium.webdriver.common.by import By
  5. import time,re,json,os,threading,requests,numpy as np
  6. """
  7. 全局静态池
  8. """
  9. #书法网站url name:{url:'',}
  10. ConfigMap = {
  11. '百度图片':{
  12. 'Search':{
  13. 'Connection_Url':'https://image.baidu.com/', #百度引擎的URL
  14. 'Kw':'书法画作品', #所要查询的内容
  15. 'Search_InputID':'kw', #查询的搜索框ID
  16. 'BtnClass':'s_newBtn', #查询按钮ClassName
  17. 'Count':2, #动态下滑次数
  18. 'Destion_Class':'//img[@class="main_img img-hover"]' #爬取的图片资源类名
  19. },
  20. 'Thread':{
  21. 'ThreadID': 1,
  22. 'ThreadName':'task1',
  23. },
  24. 'SaveFileName':'BaiduPhotoData',
  25. 'SRCList':[]
  26. },
  27. '必应图片':{
  28. 'Search':{
  29. 'Connection_Url':'https://cn.bing.com/images/search?q=书法画&first=1',
  30. 'Kw':'作品',
  31. 'Search_InputID':'sb_form_q',
  32. 'BtnClass':'b_searchboxSubmit',
  33. 'Count':2,
  34. 'Destion_Class':'//img[@class="mimg vimgld"]' #必应的书法画作品
  35. },
  36. 'Thread': {
  37. 'ThreadID': 2,
  38. 'ThreadName': 'task2',
  39. },
  40. 'SaveFileName': 'BiYingPhotoData',
  41. 'SRCList': []
  42. }
  43. }
  44. def SearchPhotoMSGGet(threadname,Key,Connection_Url,Kw,Search_InputID,BtnClass,Count,Destion_Class):
  45. web = LoadChormeWeb()
  46. #
  47. web.get(Connection_Url)
  48. search_input = web.find_element_by_id(Search_InputID)
  49. search_input.send_keys(Kw) #获取百度输入框
  50. js = 'window.scrollTo(0,document.body.scrollHeight)' #输入搜索字段
  51. time.sleep(2)
  52. #
  53. btn = web.find_element_by_class_name(BtnClass) #
  54. btn.click()
  55. time.sleep(2)
  56. # 动态下滑次数
  57. for count in range(1,Count):
  58. web.execute_script(js)
  59. time.sleep(2)
  60. #遍历图片ClassName的列表'//img[@class="main_img img-hover"]'
  61. ImgList = web.find_elements(By.XPATH,Destion_Class)
  62. for Img in ImgList:
  63. src_value = Img.get_attribute('src')
  64. #除去不合格的
  65. isHTTPS = AddressFormatHeader(str(src_value))
  66. if isHTTPS:
  67. ConfigMap[Key]['SRCList'].append(src_value)
  68. print(threadname+':'+'-'*15+"图片资源网址获取成功!"+'-'*15)
  69. web.quit()
  70. time.sleep(10)
  71. #把图片资源网址保存成文件
  72. def SavePhotoURLFile(threadname,filename,SRCList):
  73. # 资源链接保存数据
  74. SrcSaveMap = {
  75. 'author': '柒七',
  76. 'description': '书法画数据集',
  77. 'keyword': '书法画作品',
  78. 'data': []
  79. }
  80. for i,src in zip(range(1,len(SRCList)), SRCList):
  81. # 每次新的
  82. srcmap = {'index':i,'href':src}
  83. SrcSaveMap['data'].append(srcmap)
  84. # 为空则创建
  85. if CheckIfFileEmpty(os.path.abspath('dataset/HrefJson/{}.json'.format(filename))):
  86. Write(SrcSaveMap,os.path.abspath('dataset/HrefJson/{}.json'.format(filename)),'w')
  87. #不为空处理
  88. else:
  89. old_data = Read(os.path.abspath('dataset/HrefJson/{}.json'.format(filename)),'r')
  90. for i in SrcSaveMap['data']:
  91. for href in i.values():
  92. if not href in old_data:
  93. old_data['data']['href'].append(i)
  94. print(threadname+':'+'-'*15,"图片资源文件已保存",'-'*15)
  95. time.sleep(10)
  96. # 下载图片
  97. def DownloadPhoto(image_path,folder_path):
  98. ImageIndex = 1
  99. for root,dirs,files in os.walk(folder_path):
  100. if len(files) > 0:
  101. for file in files:
  102. data = Read(f'{folder_path}\\{file}','r')
  103. print('\n', '-' * 15, "开始下载!", '-' * 15)
  104. for Src in data['data']:
  105. response = requests.get(Src['href'])
  106. with open(image_path.format(ImageIndex),'wb') as f:
  107. f.write(response.content)
  108. f.close()
  109. print(f"图片 {Src['href']} 下载完成\n,保存为 {image_path.format(ImageIndex)}")
  110. ImageIndex+=1
  111. time.sleep(1)
  112. """
  113. 工具
  114. """
  115. # 打开浏览器
  116. def LoadChormeWeb():
  117. #
  118. options = Options()
  119. options.add_argument("--headless")
  120. options.headless = False
  121. #
  122. web = Chrome(executable_path="D:/DevelopProgramming/002D-Tool/Anacode3/chromedriver.exe", options=options)
  123. web.implicitly_wait(10)
  124. return web
  125. def CheckIfFileEmpty(filename):
  126. if not os.path.isfile(filename) or os.stat(filename).st_size == 0:
  127. return True
  128. data = Read(filename,'r')
  129. if len(data) > 0:
  130. return False
  131. else:
  132. return True
  133. # 文件的写入
  134. def Write(Data,Filename,mode):
  135. with open(Filename, mode) as f:
  136. json.dump(Data, f,indent=4)
  137. f.close() #释放资源
  138. #文件的数据读取
  139. def Read(Filename,mode):
  140. with open(Filename, mode) as f:
  141. data = json.load(f)
  142. f.close()
  143. return data
  144. """
  145. 线程池
  146. """
  147. def asyn_method():
  148. # 线程列表
  149. tasks=[]
  150. # 图片链接获取任务开启
  151. def Task1():
  152. print('task1:', '-' * 15, '开始工作!', '-' * 15)
  153. SearchPhotoMSGGet(ConfigMap['百度图片']['Thread']['ThreadName'],'百度图片',**ConfigMap['百度图片']['Search'])
  154. print('task1:','-'*15,'百度图片爬取完成!','-'*15)
  155. SavePhotoURLFile(ConfigMap['百度图片']['Thread']['ThreadName'],ConfigMap['百度图片']['SaveFileName'],ConfigMap['百度图片']['SRCList'])
  156. def Task2():
  157. print('task2:', '-' * 15, '开始工作!', '-' * 15)
  158. SearchPhotoMSGGet(ConfigMap['必应图片']['Thread']['ThreadName'],'必应图片',**ConfigMap['必应图片']['Search'])
  159. print('task2:', '-' * 15, '必应图片爬取完成!', '-' * 15)
  160. SavePhotoURLFile(ConfigMap['必应图片']['Thread']['ThreadName'], ConfigMap['必应图片']['SaveFileName'],
  161. ConfigMap['必应图片']['SRCList'])
  162. task1 = threading.Thread(target=Task1,daemon=True,name=ConfigMap['百度图片']['Thread']['ThreadName'])
  163. task2 = threading.Thread(target=Task2,daemon=True,name=ConfigMap['必应图片']['Thread']['ThreadName'])
  164. tasks.append(task1)
  165. tasks.append(task2)
  166. try:
  167. task1.start()
  168. task2.start()
  169. except Exception as e:
  170. print("线程异常:", str(e))
  171. finally:
  172. for th in tasks:
  173. th.join()
  174. if __name__ == '__main__':
  175. asyn_method()
  176. DownloadPhoto(os.path.abspath('dataset/Image/Painting{}.jpg'),os.path.abspath('dataset/HrefJson/'))

今天的内容就到这里,谢谢大家!!!!(注:可以继续深化下去,比如说对图片的处理)

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/爱喝兽奶帝天荒/article/detail/801150
推荐阅读
相关标签
  

闽ICP备14008679号