# 赞  (stray "upvote" button label pasted in from the source web page — not Python code)
# 踩  (stray "downvote" button label — commented out so the file parses)
# _*_ coding:UTF-8 _*_
import contextlib
import io
import os
import re  # regular expressions
import sys
import time
import urllib.request

import requests
from bs4 import BeautifulSoup
os.chdir(r'D:\python')
################################################################爬取《寄生虫》电影短评
def get_urls(urls, num):
    """Fetch one page of Douban short comments for *Parasite* and print each.

    Parameters
    ----------
    urls : str
        Full URL of one comments page.
    num : int
        Zero-based page index. Kept for backward compatibility with
        existing callers; only useful for optional progress numbering.
    """
    req = urllib.request.urlopen(urls)
    # BeautifulSoup accepts the raw response object directly; if this were
    # requests.get() you would pass req.text instead.
    html = BeautifulSoup(req, features='html.parser')
    # Iterate over however many comments the page actually contains —
    # the original hard-coded range(0, 20) and indexed find_all(...)[i],
    # which raised IndexError whenever a page had fewer than 20 comments.
    for idx, span in enumerate(html.find_all('span', class_='short')):
        review = span.text
        # print(idx + num * 20)  # global comment number, if needed
        # Round-trip through GBK with errors='ignore' to strip characters
        # GBK cannot encode (e.g. emoji) so the text prints cleanly.
        # Equivalent to the original 4-step bytes/decode/encode/decode chain:
        # str already is Unicode, so encode directly.
        review = review.encode('gbk', 'ignore').decode('gbk')
        print(review)
# Driver: scrape pages 0..10 (11 pages x 20 comments) and capture all
# printed reviews into a UTF-8 text file by redirecting stdout.
# The with-blocks guarantee the file is closed and sys.stdout is restored
# even if a page fetch raises — the original closed/restored manually and
# leaked both on any error.
with open("寄生虫短评.txt", 'w', encoding='utf-8') as outputfile:
    with contextlib.redirect_stdout(outputfile):
        for k in range(11):  # same pages as the original while k <= 10 loop
            urls = ('https://movie.douban.com/subject/27010768/comments?start='
                    + str(k * 20) + '&limit=20&sort=new_score&status=P')
            get_urls(urls, k)
            time.sleep(2)  # be polite: pause between requests
# Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。  (stray site footer pasted in from the source web page — commented out so the script runs)