赞
踩
安装爬虫需要的包
pip install requests
然后写入请求 headers(请求头);如果不写,服务器会触发安全验证并返回错误,爬虫就进行不下去了
文件头,必须有,否则会安全验证
# Request headers required by Baidu's image endpoint; without them the
# server's security check rejects the request.
headers = {
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Connection": "keep-alive",
    "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; WOW64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/78.0.3904.108 Safari/537.36"),
    "Host": "image.baidu.com",
    "Referer": "https://image.baidu.com/search/wiseindex?tn=wiseindex",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "X-Requested-With": "XMLHttpRequest",
}
然后利用正则找到你想下载图片的地址
img_url_regex = '"thumbURL":"(.*?)",'
然后就是要在第一页加载完之后加载第二页,再加到&pn=后面
完整代码
import requests # 爬虫库
import re # 正则表达式库
import os # 系统库
import time # 时间库
import uploadImg
headers = {  # Request headers — required, otherwise Baidu's security check rejects the request
    "Accept": "application/json, text/javascript, */*; q=0.01",
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
    'Host': 'image.baidu.com',
    'Referer': 'https://image.baidu.com/search/wiseindex?tn=wiseindex',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'X-Requested-With': 'XMLHttpRequest'
}
url = 'http://image.baidu.com/search/index?tn=baiduimage&fm=result&ie=utf-8&word='  # Baidu image search endpoint
keyword = input("请输入图片关键词:")
# int() instead of eval(): eval on raw user input can execute arbitrary code.
# (eval also accepted expressions like "10*2"; plain integers are what the
# prompt asks for, and int() covers those.)
countmax = int(input("请输入要爬取的图片数量:"))
url = url + keyword + "&pn="  # the page offset is appended after &pn= below
time_start = time.time()  # start timestamp for the elapsed-time report at the end
strhtml = requests.get(url, headers=headers)  # fetch the first result page
string = strhtml.text  # .text is already a str; the original str() wrapper was redundant
# Scrape the "total results" banner out of the page markup.
totalnum = re.findall(
    '<div id="resultInfo" style="font-size: 13px;">(.*?)</div>', string)
if totalnum:  # guard: a page-layout change would otherwise raise IndexError
    print("百度图片" + totalnum[0])
img_url_regex = '"thumbURL":"(.*?)",'  # regex matching thumbnail URLs in the JSON payload
count = 0  # number of images downloaded so far
index = 0  # result offset appended after &pn= for pagination
# Create the output folder once, outside the loop; exist_ok replaces the
# original bare `try: os.mkdir ... except: pass`, which also swallowed
# genuine errors such as permission failures on other OS errors.
os.makedirs(keyword, exist_ok=True)
while True:
    strhtml = requests.get(url + str(index), headers=headers)  # fetch the next result page
    pic_url = re.findall(img_url_regex, strhtml.text)  # thumbnail URLs on this page
    if not pic_url:
        # No more results. Without this break the original looped forever
        # when Baidu ran out of images before countmax was reached
        # (index stopped advancing, re-fetching the same empty page).
        break
    index += len(pic_url)  # advance the &pn= offset to the next page
    for each in pic_url:
        print('正在下载第' + str(count + 1) + '张图片')
        try:
            pic = requests.get(each, timeout=1)
        except BaseException:
            print('错误,当前图片无法下载')
            continue
        # Save as <keyword>/<n>.jpg; `with` closes the file automatically,
        # so the original explicit fp.close() inside the block was redundant.
        with open(keyword + '/' + str(count + 1) + '.jpg', 'wb') as fp:
            fp.write(pic.content)
        count += 1
        if count == countmax:
            break
    if count == countmax:
        break
time_end = time.time()  # end timestamp
print('处理完毕,共耗时:' + str(time_end - time_start) + "秒")
# Hand off to the uploader module; it locates the folder named after the keyword.
uploadImg.uploadImgs(keyword)
接下来就是把爬取的数据上传到服务器
先安装pymysql
然后连接数据库
import pymysql
# Create the database connection (credentials are placeholders — fill in
# real values before running).
Connection = pymysql.connect(
    database='***', host='*****', user='**', password='******')
cursor = Connection.cursor()
接下来是重点:把图片信息上传(插入)到数据库
完整代码
import pymysql
import os
# Create the database connection (credentials are placeholders — fill in
# real values before running).
Connection = pymysql.connect(
    database='***', host='*****', user='**', password='******')
cursor = Connection.cursor()
# Seed img_id with the current row count of `picture`, so new ids continue
# after the existing ones.
# NOTE(review): SELECT * transfers every row just to count them; a
# SELECT COUNT(*) + fetch would be cheaper — confirm before changing, since
# img_id here relies on execute() returning the number of selected rows.
sql = "SELECT * from picture"  # used only for its row count
img_id = cursor.execute(sql)  # pymysql execute() returns the affected/selected row count
def uploadImgs(fileName):
    """Insert one `picture` row per file found in ./<fileName>/.

    Each insert is committed individually; a failed insert is printed and
    rolled back, and the loop continues with the next file. Only a generated
    server-side path string is stored — the image bytes are never read or
    uploaded (the open() below effectively just checks the file is readable).
    The module-level connection is closed at the end, so this is a one-shot
    call per process.
    """
    files = os.listdir(fileName)  # image filenames produced by the crawler
    global img_id  # module-level counter; rebinding it requires the global declaration
    for item in files:
        with open('./' + fileName + '/' + item, 'rb') as f:
            try:
                img_id += 1
                sql = "INSERT INTO picture(id,img_src) VALUES (%s,%s)"
                # Parameterized insert of the id and the public image path.
                cursor.execute(
                    sql, [img_id, '/public/img/picture_' + fileName + str(img_id) + '.jpg'])
                Connection.commit()  # commit each row as it is inserted
            except Exception as e:
                print(str(e))
                Connection.rollback()  # undo the failed insert only
    # Close once, AFTER the loop: the flattened original placed close() where
    # it could run inside the loop, which would kill the connection after the
    # first file and make every later insert fail.
    Connection.close()
    print('success')
咱也不是专业搞python的,就是闲来无事,参考网上的搞一搞,希望对你有帮助
如果有问题请留言~
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。