当前位置:   article > 正文

Python爬虫实战 | (10) 爬取猫眼电影《海王》影评并存入MySql数据库

之前在Python爬虫实战(6)中我们曾爬取过《海王》影评,本篇博客的爬取解析过程和之前是完全一样的,唯一不同的是数据存储方式,之前是存储到文件中(csv,txt,json,excel等),这次我们将提取的影评存储到MySql数据库中。下面是海王影评接口:

http://m.maoyan.com/mmdb/comments/movie/249342.json?_v_=yes&offset=0&startTime=2019-01-01%2000:00:00

其中http://m.maoyan.com是猫眼电影的网址,后面是一个路径,249342是电影《海王》的id,startTime是评论的起始时间,年月日时分秒,%20代表空格。

接口的数据是json格式,一系列的键值对,上图阴影部分是一条影评 。我们关心的字段有cityName,content,id,score,startTime。

首先搭建爬虫主体框架:

  1. import requests
  2. import json
  3. import time
  4. from datetime import datetime
  5. from datetime import timedelta
  6. import pymysql
  7. def get_page(url):
  8. pass
  9. def parse_page(html):
  10. pass
  11. def write_tofile(comments):
  12. pass
  13. def create_db():
  14. pass
  15. if __name__ == '__main__':
  16. create_db() #创建mysql数据库
  17. '''
  18. 海王影评接口
  19. url = 'http://m.maoyan.com/mmdb/comments/movie/249342.json?_v_=yes&offset=0&startTime=2019-01-01%2000:00:00'
  20. '''
  21. id = '249342' #海王电影id
  22. start_time = '2019-01-01 00:00:00' #开始时间
  23. end_time = '2018-01-01 00:00:00' #结束时间
  24. while start_time > end_time:
  25. url = 'http://m.maoyan.com/mmdb/comments/movie/'+id+'.json?_v_=yes&offset=0&startTime='+start_time.replace(' ','%20') #将空格替换为%20
  26. #发送请求,获取响应
  27. try:
  28. html = get_page(url)
  29. except Exception:
  30. time.sleep(0.5) #发生异常时 延时0.5s 避免访问过于频繁
  31. html = get_page(url)
  32. else:
  33. time.sleep(0.1) #没有发生异常时 延时0.1s
  34. #解析响应内容
  35. comments = parse_page(html)
  36. #获取末尾评论的时间
  37. start_time = comments[14]['startTime']
  38. #把时间从str转换为datetime类型 减去1s 避免爬取重复数据
  39. start_time = datetime.strptime(start_time,'%Y-%m-%d %H:%M:%S') + timedelta(seconds=-1)
  40. #再把时间转换为字符串形式
  41. start_time = datetime.strftime(start_time,'%Y-%m-%d %H:%M:%S')
  42. #保存数据
  43. write_tofile(comments)

我们主要爬取18年到19年这一年的影评,所以我们从起始时间开始,每隔1s刷新一下页面,进行爬取。

连接mysql,创建数据库和表:

  1. def create_db():
  2. # 创建maoyan数据库
  3. db = pymysql.connect(host='localhost', user='root', password='', port=3306)
  4. cursor = db.cursor()
  5. cursor.execute('CREATE DATABASE IF NOT EXISTS maoyan DEFAULT CHARACTER SET utf8')
  6. # 连接maoyan数据库
  7. db = pymysql.connect(host='localhost', user='root', password='', port=3306, db='maoyan')
  8. cursor = db.cursor()
  9. cursor.execute("DROP TABLE IF EXISTS comments") #删除所有的表
  10. #创建comments表以及各个字段
  11. cursor.execute(
  12. 'CREATE TABLE IF NOT EXISTS comments (id int NOT NULL auto_increment, uid VARCHAR(255), cityName VARCHAR(255) , content VARCHAR(255), score float ,startTime datetime, PRIMARY KEY (id))')
  13. db.close()

发送请求,获取响应,编写get_page(url)函数:

  1. def get_page(url):
  2. #添加User-Agent,放在headers中,伪装成浏览器
  3. headers = {
  4. 'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
  5. }
  6. response = requests.get(url,headers=headers)
  7. if response.status_code == 200:
  8. response.encoding = response.apparent_encoding
  9. return response.text
  10. return None

解析响应,之前都是用正则表达式来解析HTML代码,现在是json格式,很容易解析,因为其内部都是一些键值对,可以像字典一样访问:

  1. def parse_page(html):
  2. data = json.loads(html)['cmts'] #将str转换为json cmts是最外层的键名
  3. comments = []
  4. for item in data:
  5. comment = {
  6. 'id':item['id'],
  7. 'cityName':item['cityName'] if 'cityName' in item else '',
  8. 'content':item['content'].replace('\n',' ').replace('\r',' ').replace('\t',' ') #处理评论内容换行的情况
  9. 'score':item['score'],
  10. 'startTime':item['startTime']
  11. }
  12. comments.append(comment)
  13. return comments

将数据存储到Mysql数据库:

  1. def write_tofile(comments):
  2. db = pymysql.connect(host='localhost', user='root', password='', port=3306, db='maoyan') #连接Mysql maoyan数据库
  3. cursor = db.cursor()
  4. #插入数据 存储
  5. sql = 'INSERT INTO comments(uid, cityName, content, score, startTime) values (%s, %s, %s, %s, %s)'
  6. for item in comments:
  7. try:
  8. cursor.execute(sql,list(item.values()))
  9. db.commit()
  10. except:
  11. db.rollback()
  12. db.close()

爬取效果,首先确保已经安装好MySql数据库和可视化管理工具Navicat:

完整代码:

  1. import requests
  2. import json
  3. import time
  4. from datetime import datetime
  5. from datetime import timedelta
  6. import pymysql
  7. def get_page(url):
  8. #添加User-Agent,放在headers中,伪装成浏览器
  9. headers = {
  10. 'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
  11. }
  12. response = requests.get(url,headers=headers)
  13. if response.status_code == 200:
  14. response.encoding = 'utf-8'
  15. return response.text
  16. return None
  17. def parse_page(html):
  18. data = json.loads(html)['cmts'] #将str转换为json cmts是最外层的键名
  19. comments = []
  20. for item in data:
  21. comment = {
  22. 'id':item['id'],
  23. 'cityName':item['cityName'] if 'cityName' in item else '',
  24. 'content':item['content'].replace('\n',' ').replace('\r',' ').replace('\t',' '), #处理评论内容换行的情况
  25. 'score':item['score'],
  26. 'startTime':item['startTime']
  27. }
  28. comments.append(comment)
  29. return comments
  30. def write_tofile(comments):
  31. db = pymysql.connect(host='localhost', user='root', password='', port=3306, db='maoyan') #连接Mysql maoyan数据库
  32. cursor = db.cursor()
  33. #插入数据 存储
  34. sql = 'INSERT INTO comments(uid, cityName, content, score, startTime) values (%s, %s, %s, %s, %s)'
  35. for item in comments:
  36. try:
  37. cursor.execute(sql,list(item.values()))
  38. db.commit()
  39. except:
  40. db.rollback()
  41. db.close()
  42. def create_db():
  43. # 创建maoyan数据库
  44. db = pymysql.connect(host='localhost', user='root', password='', port=3306)
  45. cursor = db.cursor()
  46. cursor.execute('CREATE DATABASE IF NOT EXISTS maoyan DEFAULT CHARACTER SET utf8')
  47. # 连接maoyan数据库
  48. db = pymysql.connect(host='localhost', user='root', password='', port=3306, db='maoyan')
  49. cursor = db.cursor()
  50. cursor.execute("DROP TABLE IF EXISTS comments") #删除所有的表
  51. #创建comments表以及各个字段
  52. cursor.execute(
  53. 'CREATE TABLE IF NOT EXISTS comments (id int NOT NULL auto_increment, uid VARCHAR(255), cityName VARCHAR(255) , content VARCHAR(255), score float ,startTime datetime, PRIMARY KEY (id))')
  54. db.close()
  55. if __name__ == '__main__':
  56. create_db() #创建mysql数据库
  57. '''
  58. 海王影评接口
  59. url = 'http://m.maoyan.com/mmdb/comments/movie/249342.json?_v_=yes&offset=0&startTime=2019-01-01%2000:00:00'
  60. '''
  61. id = '249342' #海王电影id
  62. start_time = '2019-01-01 00:00:00' #开始时间
  63. end_time = '2018-01-01 00:00:00' #结束时间
  64. while start_time > end_time:
  65. url = 'http://m.maoyan.com/mmdb/comments/movie/'+id+'.json?_v_=yes&offset=0&startTime='+start_time.replace(' ','%20') #将空格替换为%20
  66. #发送请求,获取响应
  67. try:
  68. html = get_page(url)
  69. except Exception:
  70. time.sleep(0.5) #发生异常时 延时0.5s 避免访问过于频繁
  71. html = get_page(url)
  72. else:
  73. time.sleep(0.1) #没有发生异常时 延时0.1s
  74. #解析响应内容
  75. comments = parse_page(html)
  76. #获取末尾评论的时间
  77. start_time = comments[14]['startTime']
  78. #把时间从str转换为datetime类型 减去1s 避免爬取重复数据
  79. start_time = datetime.strptime(start_time,'%Y-%m-%d %H:%M:%S') + timedelta(seconds=-1)
  80. #再把时间转换为字符串形式
  81. start_time = datetime.strftime(start_time,'%Y-%m-%d %H:%M:%S')
  82. #保存数据
  83. write_tofile(comments)

 

 

 

 

 

 

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/菜鸟追梦旅行/article/detail/141395
推荐阅读
相关标签
  

闽ICP备14008679号