赞
踩
实验内容:基于给定的图书出版数据完成以下任务(book_list.csv):
1. 根据数据分析和可视化的需求,对给定的图书数据进行预处理,并将处理后的数据保存到新构建的数据文件 processedBookInfo.csv 中。
实验代码:
import csv

# Fixed exchange rate used to convert US dollar prices to RMB.
USD_TO_RMB_RATE = 7
# Number of leading characters dropped from the first "/"-separated field
# before the publisher name is read.
# NOTE(review): presumably this skips a 6-character label in front of the
# publisher name -- confirm against book_list.csv.
PUBLISH_LABEL_WIDTH = 6
PROCESSED_HEADER = ['序号', '书名', '评分', '评价人数', '作者', '出版社', '出版年份', '价格', '货币单位', '人民币价格']


def split_price(raw_price, usd_rate=USD_TO_RMB_RATE):
    """Split a raw price field into (price text, currency unit, RMB price).

    The currency defaults to yuan.  "元"/"CNY" markers are removed for yuan
    prices; "$"/"USD" markers are removed for dollar prices, which are
    converted with ``usd_rate`` and rounded to two decimals.  When a dollar
    amount is not numeric the RMB price is returned as an empty string
    instead of aborting the whole run.
    """
    price = raw_price.strip()
    unit = "元"
    rmb_price = price
    if "元" in price or "CNY" in price:
        price = price.replace("元", "").replace("CNY", "").strip()
        rmb_price = price
    if "$" in price or "USD" in price:
        price = price.replace("$", "").replace("USD", "").strip()
        unit = "美元"
        try:
            rmb_price = round(float(price) * usd_rate, 2)
        except ValueError:
            rmb_price = ""
    return price, unit, rmb_price


def parse_book_row(row):
    """Turn one input row into an output row, or return None to skip it.

    Column 5 is expected to hold "publisher / date / price".  Rows that are
    too short, or whose column 5 does not contain at least three
    "/"-separated fields, are skipped rather than raising IndexError.
    """
    if len(row) < 6:
        return None
    publish_text = row[5]
    fields = publish_text.split('/')
    # The length test is kept from the original filter; the field-count test
    # is what actually protects the indexing below.
    if len(publish_text) <= 8 or len(fields) < 3:
        return None
    publisher = fields[0][PUBLISH_LABEL_WIDTH:].strip()
    publish_year = fields[1].strip()[:4]
    price, unit, rmb_price = split_price(fields[2])
    return [row[0], row[1], row[2], row[3], row[4],
            publisher, publish_year, price, unit, rmb_price]


def run_preprocessing(source_path="book_list.csv", target_path="processedBookInfo.csv"):
    """Read the raw book list and write the processed CSV file.

    Both files use the platform default encoding so that the output can be
    read back by code that also opens it with the default encoding.
    """
    with open(source_path, 'r', newline='') as source, \
            open(target_path, 'w', newline='') as target:
        writer = csv.writer(target)
        writer.writerow(PROCESSED_HEADER)
        for row in csv.reader(source):
            processed = parse_book_row(row)
            if processed is None:
                continue
            # Progress output: every column except author and RMB price.
            print(processed[:4] + processed[5:9])
            writer.writerow(processed)


if __name__ == "__main__":
    run_preprocessing()
运行结果:
2. 使用折线图对清华大学出版社、电子工业出版社和人民邮电出版社在[2005,2015]年间每年出版图书量的变化情况进行分析。
实验代码:
import csv

# Publishers compared in the line chart, in legend order.
TRACKED_PUBLISHERS = ('清华大学出版社', '电子工业出版社', '人民邮电出版社')
# One line style per tracked publisher, reused cyclically if more are given.
PUBLISHER_LINE_STYLES = (
    {'color': 'r', 'marker': '*'},
    {'color': 'b', 'marker': 'o'},
    {'color': 'c', 'marker': 'v', 'linestyle': '--'},
)


def count_yearly_publications(rows, publishers=TRACKED_PUBLISHERS,
                              first_year=2005, last_year=2015):
    """Count books per publisher for every year in [first_year, last_year].

    ``rows`` are data rows of the processed CSV (publisher in column 5,
    publication year in column 6).  Returns ``(years, series)`` where
    ``years`` is the list of years and ``series`` maps each publisher name to
    a list of counts aligned with ``years``.  Years without any book stay at
    0 so that x and y always have the same length.

    A row is credited to a publisher when the publisher name occurs in
    column 5.  An empty or partial publisher cell is therefore not counted
    for any publisher.  Rows with a missing or non-numeric year are skipped.
    """
    years = list(range(first_year, last_year + 1))
    tallies = {name: dict.fromkeys(years, 0) for name in publishers}
    for row in rows:
        if len(row) < 7:
            continue
        try:
            year = int(row[6])
        except ValueError:
            continue
        if not first_year <= year <= last_year:
            continue
        for name in publishers:
            if name in row[5]:
                tallies[name][year] += 1
    series = {name: [tallies[name][year] for year in years] for name in publishers}
    return years, series


def plot_yearly_publications(years, series):
    """Draw one line per publisher showing its yearly publication count."""
    # Imported here so the counting helper can be used without matplotlib.
    import matplotlib.pyplot as plt

    # Use a font with CJK glyphs so Chinese labels render correctly.
    plt.rcParams['font.family'] = 'sans-serif'
    plt.rcParams['font.sans-serif'] = [u'SimHei']

    for index, name in enumerate(series):
        style = PUBLISHER_LINE_STYLES[index % len(PUBLISHER_LINE_STYLES)]
        plt.plot(years, series[name], **style)
    plt.xlabel('出版年份', fontsize=14)
    plt.ylabel('出版书籍数量', fontsize=14)
    plt.title('出版社出版量统计', fontsize=14)
    peak = max((max(counts) for counts in series.values() if counts), default=0)
    if peak <= 15:
        # Integer ticks 0..15, as long as the data fits that range.
        plt.yticks(list(range(16)), fontproperties='SimHei', fontsize=14)
    else:
        # Larger counts: keep automatic tick positions, only style the labels.
        plt.yticks(fontproperties='SimHei', fontsize=14)
    plt.legend(list(series), fontsize=12)
    plt.show()


def run_line_chart(path="processedBookInfo.csv"):
    """Read the processed CSV and show the yearly publication line chart."""
    with open(path, newline='') as handle:
        reader = csv.reader(handle)
        next(reader, None)  # skip the header row
        years, series = count_yearly_publications(reader)
    plot_yearly_publications(years, series)


if __name__ == "__main__":
    run_line_chart()
运行结果:
3. 对出版量前五的出版社的优秀图书的分布情况进行分析,要求使用气泡图:x,y和size分别为出版社、评分和评价数量;优秀图书的评价标准:评分在8分以上,评价数量在100以上。
实验代码:
import csv
import numpy as np


def select_top_publisher_books(rows, top_n=5, min_rating=8, min_comments=100):
    """Collect the excellent books of the ``top_n`` most productive publishers.

    ``rows`` are data rows of the processed CSV (rating in column 2, number
    of ratings in column 3, publisher in column 5).  A book is excellent when
    its rating is above ``min_rating`` and its number of ratings is above
    ``min_comments``.  Every row counts towards its publisher's output; a row
    whose rating or rating count is not numeric is counted but cannot be
    excellent.

    Returns three parallel lists: publisher name, rating and rating count,
    one entry per excellent book.
    """
    stats = {}
    for row in rows:
        entry = stats.setdefault(row[5], {'pubNum': 0, 'goodPub': []})
        entry['pubNum'] += 1
        try:
            rating = float(row[2])
            comments = int(row[3])
        except ValueError:
            continue
        if rating > min_rating and comments > min_comments:
            entry['goodPub'].append((rating, comments))

    # Items look like ('name', {'pubNum': 1, 'goodPub': [(8.4, 457), ...]}).
    ranked = sorted(stats.items(), key=lambda item: item[1]['pubNum'], reverse=True)

    publishers, ratings, comment_counts = [], [], []
    for name, entry in ranked[:top_n]:
        for rating, comments in entry['goodPub']:
            publishers.append(name)
            ratings.append(rating)
            comment_counts.append(comments)
    return publishers, ratings, comment_counts


def scale_bubble_sizes(values, low=0, high=100):
    """Linearly rescale ``values`` from [min, max] to [low, high].

    The bounds are taken from the data instead of being hard-coded, so the
    mapping stays correct when the input file changes.  An empty input gives
    an empty array; when all values are equal every bubble gets ``high``.
    """
    counts = np.asarray(values, dtype=float)
    if counts.size == 0:
        return counts
    smallest = counts.min()
    span = counts.max() - smallest
    if span == 0:
        return np.full(counts.shape, float(high))
    return (counts - smallest) * (high - low) / span + low


def plot_publisher_bubbles(publishers, ratings, sizes):
    """Draw the bubble chart: x = publisher, y = rating, size = rating count."""
    # Imported here so the data helpers can be used without matplotlib.
    import matplotlib.pyplot as plt

    # Use a font with CJK glyphs so Chinese labels render correctly.
    plt.rcParams['font.family'] = ['sans-serif']
    plt.rcParams['font.sans-serif'] = ['SimHei']

    plt.figure(figsize=(8, 6))
    plt.scatter(publishers, ratings, c='r', s=sizes)
    plt.xlabel("出版社")
    plt.ylabel("图书评分")
    plt.show()


def run_bubble_chart(path="processedBookInfo.csv"):
    """Read the processed CSV and show the excellent-book bubble chart."""
    with open(path, newline='') as handle:
        reader = csv.reader(handle)
        next(reader, None)  # skip the header row
        publishers, ratings, comment_counts = select_top_publisher_books(reader)

    if not comment_counts:
        # max()/min() of an empty list would raise; there is nothing to plot.
        print("没有符合条件的优秀图书")
        return
    print("最大评论数:", max(comment_counts))
    print("最小评论数:", min(comment_counts))
    plot_publisher_bubbles(publishers, ratings, scale_bubble_sizes(comment_counts))


if __name__ == "__main__":
    run_bubble_chart()
运行结果:
4. 使用饼状图对各图书评分区间的图书数量分布情况进行统计:[6,7),[7,8),[8,9),[9,10),要求显示图例并对比例最高的部分进行突出显示。
实验代码:
# Pie chart of how many books fall into each rating interval
# [6,7), [7,8), [8,9), [9,10); a legend is shown and the largest
# wedge is pulled out of the pie.

import csv

# Half-open rating intervals [low, high), in wedge order.
RATING_BINS = ((6, 7), (7, 8), (8, 9), (9, 10))
RATING_BIN_COLORS = ['greenyellow', 'lightcyan', 'lightcoral', 'moccasin']


def read_grades(rows):
    """Yield the rating (column 2) of every row that has a numeric rating."""
    for row in rows:
        try:
            yield float(row[2])
        except (IndexError, ValueError):
            continue


def count_rating_bins(grades, bins=RATING_BINS):
    """Return how many grades fall into each half-open interval of ``bins``.

    Grades outside every interval (for example below 6) are ignored instead
    of being added to a neighbouring interval.
    """
    counts = [0] * len(bins)
    for grade in grades:
        for index, (low, high) in enumerate(bins):
            if low <= grade < high:
                counts[index] += 1
                break
    return counts


def explode_largest(counts, offset=0.1):
    """Return pie offsets that pull out only the first largest wedge."""
    if not counts:
        return ()
    peak = counts.index(max(counts))
    return tuple(offset if index == peak else 0 for index in range(len(counts)))


def plot_rating_pie(counts, bins=RATING_BINS):
    """Draw the pie chart with percentages, a legend and the top wedge offset."""
    # Imported here so the counting helpers can be used without matplotlib.
    import matplotlib.pyplot as plt

    # Use a font with CJK glyphs so Chinese labels render correctly.
    plt.rcParams['font.family'] = ['sans-serif']
    plt.rcParams['font.sans-serif'] = ['SimHei']

    labels = ['[{},{})'.format(low, high) for low, high in bins]
    # autopct prints the percentage inside each wedge.
    pie = plt.pie(counts, colors=RATING_BIN_COLORS, explode=explode_largest(counts),
                  labels=labels, autopct='%1.1f%%')
    for outer_text in pie[1]:  # label texts outside the pie
        outer_text.set_size(10)
    for inner_text in pie[2]:  # percentage texts inside the pie
        inner_text.set_size(12)
    plt.axis('equal')  # keep the pie circular
    plt.title(u'各图书评分区间的图书数量', fontsize=12)
    plt.legend(bbox_to_anchor=(0.82, 1), prop={'family': 'SimHei'})
    plt.show()


def run_pie_chart(path="processedBookInfo.csv"):
    """Read the processed CSV and show the rating-distribution pie chart."""
    with open(path, newline='') as handle:
        reader = csv.reader(handle)
        next(reader, None)  # skip the header row
        counts = count_rating_bins(read_grades(reader))

    if sum(counts) == 0:
        # A pie of all zeros cannot be drawn meaningfully.
        print("没有落在评分区间内的图书")
        return
    plot_rating_pie(counts)


if __name__ == "__main__":
    run_pie_chart()
运行结果:
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。