当前位置:   article > 正文

实验2 探索性数据分析和可视化_一个包含书籍名称、评分、评分人数、价格等信息的数据集,饼图:绘制不同评分区间(

一个包含书籍名称、评分、评分人数、价格等信息的数据集,饼图:绘制不同评分区间(

实验内容:基于给定的图书出版数据完成以下任务(book_list.csv):

1. 根据数据分析和可视化需求对给定的图书数据进行预处理,构建新的数据文件processedBookInfo.csv对处理后的数据进行保存。

实验代码:

  # Task 1: clean the raw book list (book_list.csv) and save the result to
  # processedBookInfo.csv for the later analysis scripts.
  import csv

  SOURCE_CSV = "book_list.csv"
  TARGET_CSV = "processedBookInfo.csv"
  # Conversion factor applied to US-dollar prices (RMB per USD).
  USD_TO_RMB = 7
  HEADER = ['序号', '书名', '评分', '评价人数', '作者', '出版社', '出版年份',
            '价格', '货币单位', '人民币价格']


  def parse_price(raw_price):
      """Split a raw price field into ``(price, currency_unit, rmb_price)``.

      The currency markers "元"/"CNY" and "$"/"USD" are removed from the
      price text. The unit defaults to "元". For US-dollar prices the RMB
      price is the numeric price times ``USD_TO_RMB`` rounded to two decimals,
      or an empty string when the remaining text is not a number. For all
      other prices the RMB price is the cleaned price text itself.
      """
      price = raw_price.strip()
      unit = "元"
      if "元" in price or "CNY" in price:
          price = price.replace("元", "").replace("CNY", "").strip()
      rmb_price = price
      if "$" in price or "USD" in price:
          price = price.replace("$", "").replace("USD", "").strip()
          unit = "美元"
          try:
              # Rounding keeps float noise (e.g. 279.93000000000006) out of the CSV.
              rmb_price = round(float(price) * USD_TO_RMB, 2)
          except ValueError:
              # Malformed amount: keep the row but leave the RMB price blank.
              rmb_price = ""
      return price, unit, rmb_price


  def parse_publish_info(pub_info):
      """Parse a "publisher / date / price" field.

      Returns ``(publisher, year, price, currency_unit, rmb_price)``, or
      ``None`` when the field is too short (8 characters or fewer, which also
      filters out a header cell) or does not contain at least three
      '/'-separated parts.
      """
      fields = pub_info.split('/')
      if len(pub_info) <= 8 or len(fields) < 3:
          return None
      # The first six characters of the publisher part are dropped, presumably
      # a fixed label prefix in the source data -- TODO confirm against the CSV.
      publisher = fields[0][6:].strip()
      # Only the leading four characters of the date (the year) are kept.
      year = fields[1].strip()[:4]
      return (publisher, year) + parse_price(fields[2])


  def main():
      """Read SOURCE_CSV, clean every usable row and write it to TARGET_CSV."""
      with open(SOURCE_CSV, 'r') as r_file, open(TARGET_CSV, 'w', newline='') as w_file:
          csvwriter = csv.writer(w_file)
          csvwriter.writerow(HEADER)
          for line in csv.reader(r_file):
              if len(line) < 6:
                  # Blank or truncated row: nothing to parse.
                  continue
              parsed = parse_publish_info(line[5])
              if parsed is None:
                  continue
              publisherName, publishYear, bookPrice, currencyUnit, RMBPrice = parsed
              print([line[0], line[1], line[2], line[3],
                     publisherName, publishYear, bookPrice, currencyUnit])
              csvwriter.writerow([line[0], line[1], line[2], line[3], line[4],
                                  publisherName, publishYear, bookPrice,
                                  currencyUnit, RMBPrice])


  if __name__ == "__main__":
      main()

运行结果:

2. 使用折线图对清华大学出版社、电子工业出版社和人民邮电出版社在[2005,2015]年间每年出版图书量的变化情况进行分析。

实验代码:

  # Task 2: line chart of the number of books published per year (2005-2015)
  # by three publishers, read from processedBookInfo.csv.
  import csv
  import matplotlib.pyplot as plt

  # Use a CJK-capable font so the Chinese labels are rendered correctly.
  plt.rcParams['font.family'] = 'sans-serif'
  plt.rcParams['font.sans-serif'] = [u'SimHei']

  PUBLISHERS = ['清华大学出版社', '电子工业出版社', '人民邮电出版社']
  LINE_STYLES = [
      {'color': 'r', 'marker': '*'},
      {'color': 'b', 'marker': 'o'},
      {'color': 'c', 'marker': 'v', 'linestyle': '--'},
  ]
  FIRST_YEAR, LAST_YEAR = 2005, 2015
  years = list(range(FIRST_YEAR, LAST_YEAR + 1))

  # Pre-fill every (publisher, year) pair with 0 so that each line has exactly
  # one y value per x value even when a publisher released nothing that year.
  counts = {name: {year: 0 for year in years} for name in PUBLISHERS}

  # Read and count inside a with-block so the file is closed before the
  # (blocking) plot window opens.
  with open("processedBookInfo.csv") as file:
      cont = csv.reader(file)
      next(cont, None)  # skip the header row
      for line in cont:
          if len(line) < 7:
              continue
          publisher = line[5].strip()
          # Exact match: a substring test such as `publisher in name` would also
          # count empty or partial publisher names for every publisher.
          if publisher not in counts:
              continue
          try:
              pubYear = int(line[6])
          except ValueError:
              # Non-numeric year field: the row cannot be placed on the x axis.
              continue
          if FIRST_YEAR <= pubYear <= LAST_YEAR:
              counts[publisher][pubYear] += 1

  # One list of yearly totals per publisher, ordered like `years`.
  series = [[counts[name][year] for year in years] for name in PUBLISHERS]

  # Draw the line chart.
  for totals, style in zip(series, LINE_STYLES):
      plt.plot(years, totals, **style)
  plt.xlabel('出版年份', fontsize=14)
  plt.ylabel('出版书籍数量', fontsize=14)
  plt.title('出版社出版量统计', fontsize=14)
  # Integer ticks from 0 to at least 15, extended when a count exceeds 15.
  peak = max(max(totals) for totals in series)
  plt.yticks(list(range(0, max(15, peak) + 1)), fontproperties='SimHei', fontsize=14)
  plt.legend(PUBLISHERS, fontsize=12)
  plt.show()

 运行结果:

3. 对出版量前五的出版社优秀图书的分布情况进行分析,要求使用气泡图:x,y和size分别为出版社、评分和评价数量;优秀图书的评价标准:评分在8分以上,评价数量在100以上。

实验代码:

  # Task 3: bubble chart of the "excellent" books of the five publishers with
  # the most titles. x = publisher, y = rating, bubble size = number of reviews.
  import csv
  import matplotlib.pyplot as plt
  import numpy as np

  # Use a CJK-capable font so the Chinese labels are rendered correctly.
  plt.rcParams['font.family'] = ['sans-serif']
  plt.rcParams['font.sans-serif'] = ['SimHei']

  TOP_N = 5            # number of publishers shown
  MIN_RATE = 8         # an excellent book is rated above this ...
  MIN_COMMENTS = 100   # ... and has more reviews than this
  MAX_BUBBLE = 100     # bubble size given to the most-reviewed book

  # publisher -> {'pubNum': number of titles, 'goodPub': [(rating, reviews), ...]}
  publisherCount = {}
  with open("processedBookInfo.csv") as file:
      cont = csv.reader(file)
      next(cont, None)  # skip the header row
      for line in cont:
          publisher = line[5]
          rate = float(line[2])
          commentNumber = int(line[3])
          stats = publisherCount.setdefault(publisher, {'pubNum': 0, 'goodPub': []})
          stats['pubNum'] += 1
          if rate > MIN_RATE and commentNumber > MIN_COMMENTS:
              stats['goodPub'].append((rate, commentNumber))

  # Keep the TOP_N publishers with the most titles; each item is
  # (publisher, {'pubNum': ..., 'goodPub': [...]}).
  topPublishers = sorted(publisherCount.items(),
                         key=lambda item: item[1]['pubNum'],
                         reverse=True)[:TOP_N]

  # Flatten into the three parallel series needed by scatter().
  bookPublisher = []        # x: publisher name
  bookRate = []             # y: rating
  bookCommentsNumber = []   # size: number of reviews
  for name, stats in topPublishers:
      for rate, commentNumber in stats['goodPub']:
          bookPublisher.append(name)
          bookRate.append(rate)
          bookCommentsNumber.append(commentNumber)

  if not bookCommentsNumber:
      # max()/min() would raise on an empty list; stop with a clear message.
      raise SystemExit("没有满足条件的优秀图书,无法绘图")

  largest = max(bookCommentsNumber)
  smallest = min(bookCommentsNumber)
  print("最大评论数:", largest)
  print("最小评论数:", smallest)

  # Min-max normalisation of the review counts to [0, MAX_BUBBLE]:
  #     v = (x - smallest) * MAX_BUBBLE / (largest - smallest)
  # The bounds are taken from the data instead of being hard-coded for one run.
  sizes = np.array(bookCommentsNumber, dtype=float)
  if largest > smallest:
      sizes = (sizes - smallest) * MAX_BUBBLE / (largest - smallest)
  else:
      # All books have the same review count: draw equally sized bubbles.
      sizes = np.full(len(bookCommentsNumber), float(MAX_BUBBLE))

  plt.figure(figsize=(8, 6))
  # Draw the bubble chart.
  plt.scatter(bookPublisher, bookRate, c='r', s=sizes)
  plt.xlabel("出版社")
  plt.ylabel("图书评分")
  plt.show()

  运行结果:

4. 使用饼状图对各图书评分区间的图书数量分布情况进行统计:[6,7),[7,8),[8,9),[9,10),要求显示图例并对比例最高的部分进行突出显示。

实验代码:

  # Task 4: pie chart of the number of books per rating interval
  # [6,7), [7,8), [8,9), [9,10), with a legend and with the largest slice
  # pulled out of the pie.
  import csv
  import matplotlib.pyplot as plt

  # Use a CJK-capable font so the Chinese labels are rendered correctly.
  plt.rcParams['font.family'] = ['sans-serif']
  plt.rcParams['font.sans-serif'] = ['SimHei']

  label = ['[6,7)', '[7,8)', '[8,9)', '[9,10)']                    # slice labels
  color = ['greenyellow', 'lightcyan', 'lightcoral', 'moccasin']   # slice colours
  size = [0, 0, 0, 0]                                              # books per interval

  # Count the books in each rating interval.
  with open("processedBookInfo.csv") as file:
      cont = csv.reader(file)
      next(cont, None)  # skip the header row
      for line in cont:
          grade = float(line[2])
          # Only ratings inside [6, 10) are counted; lower ratings must not
          # leak into the [7,8) bucket.
          if 6 <= grade < 10:
              size[int(grade) - 6] += 1

  # Offset (in fractions of the radius) of each slice from the centre: only the
  # slice with the highest count is pulled out.
  biggest = size.index(max(size))
  explode = [0.1 if index == biggest else 0 for index in range(len(size))]

  # Draw the pie; autopct prints the percentage inside each slice.
  pie = plt.pie(size, colors=color, explode=explode, labels=label, autopct='%1.1f%%')
  for font in pie[1]:    # pie[1]: label texts outside the pie
      font.set_size(10)
  for digit in pie[2]:   # pie[2]: percentage texts inside the pie
      digit.set_size(12)
  # Equal axis scaling keeps the pie a perfect circle.
  plt.axis('equal')
  plt.title(u'各图书评分区间的图书数量', fontsize=12)
  # Show the legend; the font is passed as a dict of FontProperties keywords.
  plt.legend(bbox_to_anchor=(0.82, 1), prop={'family': 'SimHei'})
  plt.show()

  运行结果:

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/运维做开发/article/detail/874877
推荐阅读
相关标签
  

闽ICP备14008679号