In Python Web Scraping in Action (6) we scraped reviews of Aquaman. The scraping and parsing process in this post is exactly the same; the only difference is how the data is stored. Last time we wrote to files (csv, txt, json, excel, etc.); this time we store the extracted reviews in a MySQL database. Here is the Aquaman review API:
http://m.maoyan.com/mmdb/comments/movie/249342.json?_v_=yes&offset=0&startTime=2019-01-01%2000:00:00
Here http://m.maoyan.com is the Maoyan Movies site, followed by a path; 249342 is the movie id of Aquaman; startTime is the starting time of the comments, in year-month-day hour:minute:second format, with %20 encoding the space.
The API returns JSON, a series of key-value pairs; each entry in the cmts array is a single review. The fields we care about are cityName, content, id, score, and startTime.
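Incidentally, you don't have to hand-encode the space in startTime; the standard library can do it. A minimal sketch (urllib.parse.quote is standard Python; the movie id and time are the ones from the URL above):

from urllib.parse import quote

movie_id = '249342'
start_time = '2019-01-01 00:00:00'
# encode only the space as %20 and leave the colons alone, matching the URL format above
url = ('http://m.maoyan.com/mmdb/comments/movie/' + movie_id +
       '.json?_v_=yes&offset=0&startTime=' + quote(start_time, safe=':'))
print(url)  # ...startTime=2019-01-01%2000:00:00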
First, set up the main framework of the scraper:
import requests
import json
import time
from datetime import datetime
from datetime import timedelta
import pymysql

def get_page(url):
    pass

def parse_page(html):
    pass


def write_tofile(comments):
    pass


def create_db():
    pass

if __name__ == '__main__':
    create_db()  # create the MySQL database and table
    '''
    Aquaman review API:
    url = 'http://m.maoyan.com/mmdb/comments/movie/249342.json?_v_=yes&offset=0&startTime=2019-01-01%2000:00:00'
    '''

    id = '249342'  # Aquaman movie id
    start_time = '2019-01-01 00:00:00'  # start time (we scrape backwards from here)
    end_time = '2018-01-01 00:00:00'  # end time
    while start_time > end_time:
        url = 'http://m.maoyan.com/mmdb/comments/movie/'+id+'.json?_v_=yes&offset=0&startTime='+start_time.replace(' ','%20')  # replace the space with %20

        # send the request and get the response
        try:
            html = get_page(url)
        except Exception:
            time.sleep(0.5)  # on an exception, wait 0.5s to avoid requesting too frequently
            html = get_page(url)
        else:
            time.sleep(0.1)  # no exception: wait 0.1s
        # parse the response
        comments = parse_page(html)
        # get the timestamp of the last (oldest) comment on the page
        start_time = comments[-1]['startTime']
        # convert the str to datetime and subtract 1s to avoid scraping duplicates
        start_time = datetime.strptime(start_time,'%Y-%m-%d %H:%M:%S') + timedelta(seconds=-1)
        # convert the time back to a string
        start_time = datetime.strftime(start_time,'%Y-%m-%d %H:%M:%S')
        # save the data
        write_tofile(comments)
We mainly want the year of reviews from 2019 back into 2018, so starting from start_time we request the page repeatedly, each time moving the start time back to one second before the oldest comment just fetched.
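To make that stepping concrete, here is the conversion in isolation (a standalone sketch; the timestamp is illustrative):

from datetime import datetime, timedelta

last_seen = '2018-12-31 23:58:30'  # startTime of the oldest comment on a page (illustrative)
t = datetime.strptime(last_seen, '%Y-%m-%d %H:%M:%S') + timedelta(seconds=-1)
print(datetime.strftime(t, '%Y-%m-%d %H:%M:%S'))  # 2018-12-31 23:58:29, the next request's startTime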
Connect to MySQL and create the database and table:
def create_db():
    # create the maoyan database
    db = pymysql.connect(host='localhost', user='root', password='', port=3306)
    cursor = db.cursor()
    cursor.execute('CREATE DATABASE IF NOT EXISTS maoyan DEFAULT CHARACTER SET utf8')
    # connect to the maoyan database
    db = pymysql.connect(host='localhost', user='root', password='', port=3306, db='maoyan')
    cursor = db.cursor()
    cursor.execute("DROP TABLE IF EXISTS comments")  # drop the comments table if it already exists
    # create the comments table and its columns
    cursor.execute(
        'CREATE TABLE IF NOT EXISTS comments (id int NOT NULL auto_increment, uid VARCHAR(255), cityName VARCHAR(255), content VARCHAR(255), score float, startTime datetime, PRIMARY KEY (id))')
    db.close()
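If you want to sanity-check the schema without opening a GUI client, a quick sketch using the same connection defaults as above:

import pymysql

db = pymysql.connect(host='localhost', user='root', password='', port=3306, db='maoyan')
cursor = db.cursor()
cursor.execute('DESCRIBE comments')  # list the columns of the comments table
for column in cursor.fetchall():
    print(column)
db.close()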
Send the request and get the response by writing the get_page(url) function:
def get_page(url):

    # add a User-Agent header to pose as a browser
    headers = {
        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    response = requests.get(url,headers=headers)
    if response.status_code == 200:
        response.encoding = response.apparent_encoding
        return response.text
    return None
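The main loop above retries once after a failure; if you want more systematic retries, requests can delegate them to urllib3. A sketch, not part of the original tutorial (the retry counts and timeout are arbitrary choices):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
# retry up to 3 times on connection errors and 5xx responses, with exponential backoff
retries = Retry(total=3, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504])
session.mount('http://', HTTPAdapter(max_retries=retries))

def get_page_with_retries(url, headers):  # hypothetical variant of get_page
    response = session.get(url, headers=headers, timeout=10)
    if response.status_code == 200:
        return response.text
    return None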
Now parse the response. Previously we used regular expressions to pick apart HTML; here the data is JSON, which is easy to parse because it is just key-value pairs that can be accessed like a dictionary:
def parse_page(html):
    data = json.loads(html)['cmts']  # parse the JSON string; cmts is the outermost key
    comments = []
    for item in data:
        comment = {
            'id':item['id'],
            'cityName':item['cityName'] if 'cityName' in item else '',
            'content':item['content'].replace('\n',' ').replace('\r',' ').replace('\t',' '),  # flatten line breaks inside the comment text
            'score':item['score'],
            'startTime':item['startTime']
        }
        comments.append(comment)
    return comments
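A quick check with a hand-made response (the field values are made up, but the shape matches the cmts structure parse_page expects):

sample = '{"cmts": [{"id": 1, "cityName": "Beijing", "content": "Great\\nmovie", "score": 9.0, "startTime": "2018-12-31 23:58:30"}]}'
print(parse_page(sample))
# [{'id': 1, 'cityName': 'Beijing', 'content': 'Great movie', 'score': 9.0, 'startTime': '2018-12-31 23:58:30'}]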
Store the data in the MySQL database:
def write_tofile(comments):
    db = pymysql.connect(host='localhost', user='root', password='', port=3306, db='maoyan')  # connect to the maoyan database
    cursor = db.cursor()
    # insert and store the data
    sql = 'INSERT INTO comments(uid, cityName, content, score, startTime) values (%s, %s, %s, %s, %s)'
    for item in comments:
        try:
            # dict insertion order matches the column order: id->uid, cityName, content, score, startTime
            cursor.execute(sql,list(item.values()))
            db.commit()
        except Exception:
            db.rollback()
    db.close()
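Committing one row at a time is simple but slow. pymysql's executemany can insert a whole page of comments in a single round trip; a sketch under the same schema (note it becomes all-or-nothing per page):

def write_tofile_batch(comments):  # hypothetical batch variant of write_tofile
    db = pymysql.connect(host='localhost', user='root', password='', port=3306, db='maoyan')
    cursor = db.cursor()
    sql = 'INSERT INTO comments(uid, cityName, content, score, startTime) values (%s, %s, %s, %s, %s)'
    rows = [list(item.values()) for item in comments]  # same column order as the dicts
    try:
        cursor.executemany(sql, rows)  # one statement for the whole page
        db.commit()
    except Exception:
        db.rollback()  # on any failure the whole page is rolled back
    finally:
        db.close()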
The results of the scrape. First, make sure the MySQL database and the Navicat GUI management tool are installed:
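The original post shows the populated table in a Navicat screenshot; the same check from Python looks roughly like this:

import pymysql

db = pymysql.connect(host='localhost', user='root', password='', port=3306, db='maoyan')
cursor = db.cursor()
cursor.execute('SELECT COUNT(*) FROM comments')
print('rows stored:', cursor.fetchone()[0])
cursor.execute('SELECT uid, cityName, score, startTime FROM comments LIMIT 3')
for row in cursor.fetchall():
    print(row)
db.close()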
Full code:
import requests
import json
import time
from datetime import datetime
from datetime import timedelta
import pymysql

def get_page(url):

    # add a User-Agent header to pose as a browser
    headers = {
        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    response = requests.get(url,headers=headers)
    if response.status_code == 200:
        response.encoding = 'utf-8'
        return response.text
    return None

def parse_page(html):
    data = json.loads(html)['cmts']  # parse the JSON string; cmts is the outermost key
    comments = []
    for item in data:
        comment = {
            'id':item['id'],
            'cityName':item['cityName'] if 'cityName' in item else '',
            'content':item['content'].replace('\n',' ').replace('\r',' ').replace('\t',' '),  # flatten line breaks inside the comment text
            'score':item['score'],
            'startTime':item['startTime']
        }
        comments.append(comment)
    return comments


def write_tofile(comments):
    db = pymysql.connect(host='localhost', user='root', password='', port=3306, db='maoyan')  # connect to the maoyan database
    cursor = db.cursor()
    # insert and store the data
    sql = 'INSERT INTO comments(uid, cityName, content, score, startTime) values (%s, %s, %s, %s, %s)'
    for item in comments:
        try:
            cursor.execute(sql,list(item.values()))
            db.commit()
        except Exception:
            db.rollback()
    db.close()


def create_db():
    # create the maoyan database
    db = pymysql.connect(host='localhost', user='root', password='', port=3306)
    cursor = db.cursor()
    cursor.execute('CREATE DATABASE IF NOT EXISTS maoyan DEFAULT CHARACTER SET utf8')
    # connect to the maoyan database
    db = pymysql.connect(host='localhost', user='root', password='', port=3306, db='maoyan')
    cursor = db.cursor()
    cursor.execute("DROP TABLE IF EXISTS comments")  # drop the comments table if it already exists
    # create the comments table and its columns
    cursor.execute(
        'CREATE TABLE IF NOT EXISTS comments (id int NOT NULL auto_increment, uid VARCHAR(255), cityName VARCHAR(255), content VARCHAR(255), score float, startTime datetime, PRIMARY KEY (id))')
    db.close()


if __name__ == '__main__':
    create_db()  # create the MySQL database and table
    '''
    Aquaman review API:
    url = 'http://m.maoyan.com/mmdb/comments/movie/249342.json?_v_=yes&offset=0&startTime=2019-01-01%2000:00:00'
    '''

    id = '249342'  # Aquaman movie id
    start_time = '2019-01-01 00:00:00'  # start time (we scrape backwards from here)
    end_time = '2018-01-01 00:00:00'  # end time
    while start_time > end_time:
        url = 'http://m.maoyan.com/mmdb/comments/movie/'+id+'.json?_v_=yes&offset=0&startTime='+start_time.replace(' ','%20')  # replace the space with %20

        # send the request and get the response
        try:
            html = get_page(url)
        except Exception:
            time.sleep(0.5)  # on an exception, wait 0.5s to avoid requesting too frequently
            html = get_page(url)
        else:
            time.sleep(0.1)  # no exception: wait 0.1s
        # parse the response
        comments = parse_page(html)
        # get the timestamp of the last (oldest) comment on the page
        start_time = comments[-1]['startTime']
        # convert the str to datetime and subtract 1s to avoid scraping duplicates
        start_time = datetime.strptime(start_time,'%Y-%m-%d %H:%M:%S') + timedelta(seconds=-1)
        # convert the time back to a string
        start_time = datetime.strftime(start_time,'%Y-%m-%d %H:%M:%S')
        # save the data
        write_tofile(comments)