赞
踩
1.下载selenium、requests库
- pip install requests
- pip install selenium
2.在自己目录下创建好dataset目录,并在其下再创建两个文件夹,分别命名为Image、HrefJson (否则需要自己改动里面的代码)
注意:使用selenium前需知道自己浏览器对应的版本驱动!!!(我使用的浏览器为谷歌浏览器,若不是谷歌,需自行修改代码)
# Imports
import json
import os
import re
import threading
import time

import numpy as np
import requests
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
-
- """
- 全局静态池
- """
- #书法网站url name:{url:'',}
- ConfigMap = {
- '百度图片':{
- 'Search':{
- 'Connection_Url':'https://image.baidu.com/', #百度引擎的URL
- 'Kw':'书法画作品', #所要查询的内容
- 'Search_InputID':'kw', #查询的搜索框ID
- 'BtnClass':'s_newBtn', #查询按钮ClassName
- 'Count':2, #动态下滑次数
- 'Destion_Class':'//img[@class="main_img img-hover"]' #爬取的图片资源类名
- },
- 'Thread':{
- 'ThreadID': 1,
- 'ThreadName':'task1',
- },
- 'SaveFileName':'BaiduPhotoData',
- 'SRCList':[]
- },
- '必应图片':{
- 'Search':{
- 'Connection_Url':'https://cn.bing.com/images/search?q=书法画&first=1',
- 'Kw':'作品',
- 'Search_InputID':'sb_form_q',
- 'BtnClass':'b_searchboxSubmit',
- 'Count':2,
- 'Destion_Class':'//img[@class="mimg vimgld"]' #必应的书法画作品
- },
- 'Thread': {
- 'ThreadID': 2,
- 'ThreadName': 'task2',
- },
- 'SaveFileName': 'BiYingPhotoData',
- 'SRCList': []
- }
-
- }
- def SearchPhotoMSGGet(threadname,Key,Connection_Url,Kw,Search_InputID,BtnClass,Count,Destion_Class):
- web = LoadChormeWeb()
- #
- web.get(Connection_Url)
- search_input = web.find_element_by_id(Search_InputID)
- search_input.send_keys(Kw) #获取百度输入框
- js = 'window.scrollTo(0,document.body.scrollHeight)' #输入搜索字段
- time.sleep(2)
- #
- btn = web.find_element_by_class_name(BtnClass) #
- btn.click()
- time.sleep(2)
-
- # 动态下滑次数
- for count in range(1,Count):
- web.execute_script(js)
- time.sleep(2)
- #遍历图片ClassName的列表'//img[@class="main_img img-hover"]'
- ImgList = web.find_elements(By.XPATH,Destion_Class)
- for Img in ImgList:
- src_value = Img.get_attribute('src')
- #除去不合格的
- isHTTPS = AddressFormatHeader(str(src_value))
- if isHTTPS:
- ConfigMap[Key]['SRCList'].append(src_value)
- print(threadname+':'+'-'*15+"图片资源网址获取成功!"+'-'*15)
- web.quit()
- time.sleep(10)
- #把图片资源网址保存成文件
- def SavePhotoURLFile(threadname,filename,SRCList):
- # 资源链接保存数据
- SrcSaveMap = {
- 'author': '柒七',
- 'description': '书法画数据集',
- 'keyword': '书法画作品',
- 'data': []
- }
- for i,src in zip(range(1,len(SRCList)), SRCList):
- # 每次新的
- srcmap = {'index':i,'href':src}
- SrcSaveMap['data'].append(srcmap)
- # 为空则创建
- if CheckIfFileEmpty(os.path.abspath('dataset/HrefJson/{}.json'.format(filename))):
- Write(SrcSaveMap,os.path.abspath('dataset/HrefJson/{}.json'.format(filename)),'w')
- #不为空处理
- else:
- old_data = Read(os.path.abspath('dataset/HrefJson/{}.json'.format(filename)),'r')
- for i in SrcSaveMap['data']:
- for href in i.values():
- if not href in old_data:
- old_data['data']['href'].append(i)
- print(threadname+':'+'-'*15,"图片资源文件已保存",'-'*15)
- time.sleep(10)
- # 下载图片
- def DownloadPhoto(image_path,folder_path):
- ImageIndex = 1
- for root,dirs,files in os.walk(folder_path):
- if len(files) > 0:
- for file in files:
- data = Read(f'{folder_path}\\{file}','r')
- print('\n', '-' * 15, "开始下载!", '-' * 15)
- for Src in data['data']:
- response = requests.get(Src['href'])
- with open(image_path.format(ImageIndex),'wb') as f:
- f.write(response.content)
- f.close()
- print(f"图片 {Src['href']} 下载完成\n,保存为 {image_path.format(ImageIndex)}")
- ImageIndex+=1
-
- time.sleep(1)
-
- """
- 工具
- """
- # 打开浏览器
- def LoadChormeWeb():
- #
- options = Options()
- options.add_argument("--headless")
- options.headless = False
- #
- web = Chrome(executable_path="D:/DevelopProgramming/002D-Tool/Anacode3/chromedriver.exe", options=options)
- web.implicitly_wait(10)
- return web
- def CheckIfFileEmpty(filename):
- if not os.path.isfile(filename) or os.stat(filename).st_size == 0:
- return True
- data = Read(filename,'r')
- if len(data) > 0:
- return False
- else:
- return True
- # 文件的写入
- def Write(Data,Filename,mode):
- with open(Filename, mode) as f:
- json.dump(Data, f,indent=4)
- f.close() #释放资源
- #文件的数据读取
- def Read(Filename,mode):
- with open(Filename, mode) as f:
- data = json.load(f)
- f.close()
- return data
-
- """
- 线程池
- """
- def asyn_method():
- # 线程列表
- tasks=[]
- # 图片链接获取任务开启
- def Task1():
- print('task1:', '-' * 15, '开始工作!', '-' * 15)
- SearchPhotoMSGGet(ConfigMap['百度图片']['Thread']['ThreadName'],'百度图片',**ConfigMap['百度图片']['Search'])
- print('task1:','-'*15,'百度图片爬取完成!','-'*15)
- SavePhotoURLFile(ConfigMap['百度图片']['Thread']['ThreadName'],ConfigMap['百度图片']['SaveFileName'],ConfigMap['百度图片']['SRCList'])
- def Task2():
- print('task2:', '-' * 15, '开始工作!', '-' * 15)
- SearchPhotoMSGGet(ConfigMap['必应图片']['Thread']['ThreadName'],'必应图片',**ConfigMap['必应图片']['Search'])
- print('task2:', '-' * 15, '必应图片爬取完成!', '-' * 15)
- SavePhotoURLFile(ConfigMap['必应图片']['Thread']['ThreadName'], ConfigMap['必应图片']['SaveFileName'],
- ConfigMap['必应图片']['SRCList'])
-
- task1 = threading.Thread(target=Task1,daemon=True,name=ConfigMap['百度图片']['Thread']['ThreadName'])
- task2 = threading.Thread(target=Task2,daemon=True,name=ConfigMap['必应图片']['Thread']['ThreadName'])
-
- tasks.append(task1)
- tasks.append(task2)
- try:
- task1.start()
- task2.start()
-
- except Exception as e:
- print("线程异常:", str(e))
- finally:
- for th in tasks:
- th.join()
- if __name__ == '__main__':
- asyn_method()
- DownloadPhoto(os.path.abspath('dataset/Image/Painting{}.jpg'),os.path.abspath('dataset/HrefJson/'))

今天的内容就到这里,谢谢大家!!!!(注:可以继续深化下去,比如说对图片的处理)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。