赞
踩
使用Scrapy爬取网站图片并下载(使用XPATH路径来获取图片链接)。
对爬取成功的图片链接入库。
# Files pipeline download root (defaults to a folder named after the bot
# on the user's desktop). os.path.join keeps the path portable instead of
# hard-coding the Windows "\" separator.
FILES_STORE = os.path.join(os.path.expanduser('~'), 'Desktop', BOT_NAME)
# Images pipeline download root (same desktop folder by default).
IMAGES_STORE = os.path.join(os.path.expanduser('~'), 'Desktop', BOT_NAME)
# Minimum image height accepted by the images pipeline (0 = no filtering).
IMAGES_MIN_HEIGHT = 0
# Minimum image width accepted by the images pipeline (0 = no filtering).
IMAGES_MIN_WIDTH = 0
# MySQL server host
HOST_SQL = '127.0.0.1'
# MySQL server port (kept as a string; converted with int() at connect time)
PORT_SQL = '3306'
# MySQL user name
USER_SQL = 'root'
# MySQL password
PASSWORD_SQL = '123456'
# Database name
DB_SQL = 'scrapyspider'
# Table receiving successfully scraped media rows
TABLE_SQL = 'scrapy_media'
# Table receiving error records
TABLE_ERROR_SQL = 'spider_error'
class RandomUserAgentMiddleware:
    """Downloader middleware that attaches a random User-Agent header.

    It also mirrors the request's own URL into the ``referer`` header.
    """

    def __init__(self):
        # Provider of random user-agent strings (fake_useragent.UserAgent).
        self.agent = UserAgent()

    @classmethod
    def from_crawler(cls, crawler):
        """Scrapy entry point; no settings are consumed."""
        return cls()

    def process_request(self, request, spider):
        # Respect a User-Agent already present on the request; otherwise
        # pick a fresh random one for each request.
        request.headers.setdefault('User-Agent', self.agent.random)
        request.headers['referer'] = request.url
def __init__(self, hostSql: str, portSql: str, userSql: str, passwordSql: str,
             dbSql: str, tableSql: str, tableErrorSql: str):
    """Open the shared MySQL connection used by the pipeline.

    :param hostSql: MySQL server host (fixed typo: was ``hotsSql``)
    :param portSql: MySQL server port as a string; converted with int()
    :param userSql: user name
    :param passwordSql: password
    :param dbSql: database name
    :param tableSql: table for successfully scraped media rows
    :param tableErrorSql: table for error records
    """
    # utf8mb4 so 4-byte characters (emoji, rare CJK) survive insertion.
    self.mySQL = MySQLdb.connect(hostSql, userSql, passwordSql, dbSql,
                                 charset='utf8mb4', port=int(portSql))
    self.myCursor = self.mySQL.cursor()
    self.tableSql = tableSql
    self.tableErrorSql = tableErrorSql

@classmethod
def from_crawler(cls, crawler):
    """Scrapy entry point: build the pipeline from the crawler settings."""
    return cls(
        crawler.settings['HOST_SQL'],
        crawler.settings['PORT_SQL'],
        crawler.settings['USER_SQL'],
        crawler.settings['PASSWORD_SQL'],
        crawler.settings['DB_SQL'],
        crawler.settings['TABLE_SQL'],
        crawler.settings['TABLE_ERROR_SQL'],
    )
def process_item(self, item, spider):
    """Persist one scraped item into the MySQL media table and pass it on."""
    # Serialize the whole item as a one-element JSON array.
    serialized = json.dumps([ItemAdapter(item).asdict()])
    # Only the table name is interpolated; all values go through
    # parameterized placeholders.
    sql = (
        f"""insert into {self.tableSql}(url,media_url,items) values ("""
        f"""%s,%s,%s)"""
    )
    params = [item['url'], item['mediaUrl'], serialized]
    self.myCursor.execute(sql, params)
    self.mySQL.commit()
    return item
class MyImagesPipeline(ImagesPipeline): def get_media_requests(self, item, info): """ 1、获取需要下载的图片URL进行下载 :param item: :param info: :return: """ yield scrapy.Request(url=item['mediaUrl'], dont_filter=True) def file_path(self, request, response=None, info=None, *, item=None): """ 1、自定义文件路径 :param request: :param response: :param info: :param item: :return: """ # 图片下载路径(默认桌面路径) self.imagesStore = self.store.basedir if item['mediaDownloadPath'] == '' else item['mediaDownloadPath'] # 保证是一个目录 if (os.path.splitext(self.imagesStore))[1] != '': self.imagesStore = (os.path.splitext(self.imagesStore))[0] if not os.path.isdir(self.imagesStore): os.mkdir(self.imagesStore) # 图片名称(默认使用时间戳) iamgeName = str(time.time(
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。