赞
踩
按照我们老师的要求,用 Python 做一个关于数据分析的小项目。
——基于baidu的编程haha
我选的是:爬取豆瓣 Top250 数据,然后分析豆瓣用户的观影喜好。
PPT部分截图:
PPT 不知道怎么上传 =.= 有需要的请留言。
爬虫部分代码:
#-*- coding: utf-8 -*- import io import sys from urllib.request import urlopen from bs4 import BeautifulSoup from collections import defaultdict import pandas as pd import time import re from multiprocessing import pool, Pool class DoubanMovieTop(): sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030') def __init__(self): # 得到url地址,分析分页规律, self.top_urls = ['https://movie.douban.com/top250?start={0}&filter='.format(x*25) for x in range(10)] self.data = defaultdict(list) self.columns = ['title', 'link', 'score', 'score_cnt', 'top_no', 'director', 'writers', 'actors', 'types', 'edit_location', 'language', 'dates', 'play_location', 'length', 'rating_per', 'betters', 'had_seen', 'want_see', 'tags', 'short_review', 'review', 'ask', 'discussion'] self.df = None def get_bsobj(self, url): html = urlopen(url).read().decode('utf-8') #将字节转字符串 bsobj = BeautifulSoup(html, 'lxml') return bsobj def get_info(self): for url in self.top_urls: bsobj = self.get_bsobj(url) main = bsobj.find('ol', {'class': 'grid_view'}) # 标题及链接信息 title_objs = main.findAll('div', {'class': 'hd'}) titles = [i.find('span').text for i in title_objs] links = [i.find('a')['href'] for i in title_objs] # 评分信息 score_objs = main.findAll('div', {'class': 'star'}) scores = [i.find('span', {'class': 'rating_num'}).text for i in score_objs] score_cnts = [i.findAll('span')[-1].text for i in score_objs] for title, link, score, score_cnt in zip(titles, links, scores, score_cnts): self.data[title].extend([title, link, score, score_cnt]) bsobj_more = self.get_bsobj(link) more_data = self.get_more_info(bsobj_more) self.data[title].extend(more_data) print(self.data[title]) print(len(self.data)) time.sleep(0) def get_more_info(self, bsobj): # 榜单排名 top_no = bsobj.find('span', {'class': 'top250-no'}).text.split('.&
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。