赞
踩
import pandas as pd
import numpy as np
import seaborn as sns
import itertools
import io
import base64
import os
import folium
import folium.plugins
import matplotlib.pyplot as plt
from matplotlib import rc,animation
from mpl_toolkits.mplot3d import Axes3D
from wordcloud import WordCloud,STOPWORDS
# Load the three World Cup CSV files from the current working directory:
# one row per match, one row per player appearance, one row per tournament.
matches, players, cups = (
    pd.read_csv(csv_path)
    for csv_path in (
        './WorldCupMatches.csv',
        './WorldCupPlayers.csv',
        './WorldCups.csv',
    )
)
# Preview the first three match rows.
matches.head(3)
Year | Datetime | Stage | Stadium | City | Home Team Name | Home Team Goals | Away Team Goals | Away Team Name | Win conditions | Attendance | Half-time Home Goals | Half-time Away Goals | Referee | Assistant 1 | Assistant 2 | RoundID | MatchID | Home Team Initials | Away Team Initials | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1930.0 | 13 Jul 1930 - 15:00 | Group 1 | Pocitos | Montevideo | France | 4.0 | 1.0 | Mexico | 4444.0 | 3.0 | 0.0 | LOMBARDI Domingo (URU) | CRISTOPHE Henry (BEL) | REGO Gilberto (BRA) | 201.0 | 1096.0 | FRA | MEX | |
1 | 1930.0 | 13 Jul 1930 - 15:00 | Group 4 | Parque Central | Montevideo | USA | 3.0 | 0.0 | Belgium | 18346.0 | 2.0 | 0.0 | MACIAS Jose (ARG) | MATEUCCI Francisco (URU) | WARNKEN Alberto (CHI) | 201.0 | 1090.0 | USA | BEL | |
2 | 1930.0 | 14 Jul 1930 - 12:45 | Group 2 | Parque Central | Montevideo | Yugoslavia | 2.0 | 1.0 | Brazil | 24059.0 | 2.0 | 0.0 | TEJADA Anibal (URU) | VALLARINO Ricardo (URU) | BALWAY Thomas (FRA) | 201.0 | 1093.0 | YUG | BRA |
# Preview the first three rows of the players table.
players.iloc[:3]
RoundID | MatchID | Team Initials | Coach Name | Line-up | Shirt Number | Player Name | Position | Event | |
---|---|---|---|---|---|---|---|---|---|
0 | 201 | 1096 | FRA | CAUDRON Raoul (FRA) | S | 0 | Alex THEPOT | GK | NaN |
1 | 201 | 1096 | MEX | LUQUE Juan (MEX) | S | 0 | Oscar BONFIGLIO | GK | NaN |
2 | 201 | 1096 | FRA | CAUDRON Raoul (FRA) | S | 0 | Marcel LANGILLER | NaN | G40' |
# Preview the first three rows of the tournaments table.
cups.iloc[:3]
Year | Country | Winner | Runners-Up | Third | Fourth | GoalsScored | QualifiedTeams | MatchesPlayed | Attendance | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1930 | Uruguay | Uruguay | Argentina | USA | Yugoslavia | 70 | 13 | 18 | 590.549 |
1 | 1934 | Italy | Italy | Czechoslovakia | Germany | Austria | 70 | 16 | 17 | 363.000 |
2 | 1938 | France | Italy | Hungary | Brazil | Sweden | 84 | 15 | 18 | 375.700 |
先按MatchID字段去掉世界杯比赛数据中的重复记录,再根据Year字段对Attendance(观众人数)进行累加,最后使用Seaborn和Matplotlib进行可视化
# Count missing values per column (the result is not stored).
matches.isnull().sum()
sns.set_style("darkgrid")

# Keep only the first row for each MatchID so that a match which appears more
# than once does not have its attendance summed repeatedly.
matches = matches.drop_duplicates(subset="MatchID", keep="first")
# Drop rows without a Year. The explicit copy makes `matches` an independent
# frame, so later column assignments do not raise SettingWithCopyWarning.
matches = matches[matches["Year"].notnull()].copy()

# Total attendance per tournament year.
att = matches.groupby("Year")["Attendance"].sum().reset_index()
att["Year"] = att["Year"].astype(int)

plt.figure(figsize=(12, 7))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally. A single "k" edge colour replaces the repeated-character
# string, which current matplotlib does not accept as a colour sequence.
sns.barplot(x=att["Year"], y=att["Attendance"], linewidth=1, edgecolor="k")
plt.grid(True)
plt.title("Attendance by year", color='b')
plt.show()
考虑到赛制改变等因素,历届世界杯的比赛场次数量存在一定的差异,因此进一步计算每届世界杯的场均观众数,以便更公平地比较历届世界杯的影响力
# Mean of the Attendance column over the rows of each tournament year.
att1 = matches.groupby("Year")["Attendance"].mean().reset_index()
att1["Year"] = att1["Year"].astype(int)

plt.figure(figsize=(12, 7))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
ax = sns.pointplot(x=att1["Year"], y=att1["Attendance"], color="w")
# White points and lines are drawn on a black axes background.
ax.set_facecolor("k")
plt.grid(True, color="grey", alpha=.3)
plt.title("Average attendance by year", color='b')
plt.show()
总体呈上升趋势,1994年最高,2006-2014年总人数稳定在较高水平
接下来,计算各个比赛城市的平均观众人数,并用可视化的形式展示平均值最高的20个城市。
# Mean attendance per host city, sorted from highest to lowest.
ct_at = matches.groupby("City")["Attendance"].mean().reset_index()
ct_at = ct_at.sort_values(by="Attendance", ascending=False)
# Only the 20 cities with the highest mean attendance are plotted.
top_cities = ct_at.head(20)

plt.figure(figsize=(10, 10))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally. A single "k" edge colour replaces the repeated-character
# string, which current matplotlib does not accept as a colour sequence.
ax = sns.barplot(
    x="Attendance",
    y="City",
    data=top_cities,
    linewidth=1,
    edgecolor="k",
    palette="Spectral_r",
)
# Write the rounded mean inside each bar; bar i sits at y position i.
rounded_means = np.around(top_cities["Attendance"], 0).astype(str)
for i, mean_text in enumerate(rounded_means):
    ax.text(.7, i, " Average attendance : " + mean_text, fontsize=12)
plt.grid(True)
plt.title("Average attendance by city", color='b')
plt.show()
一个城市可能有多个场馆,各个场馆的观众数可能也不相同:计算各个场馆的平均观众人数,并取观众数最多的14个场馆进行可视化,与上面的结果略有出入。
# Convert Year to int. (Original author's note: learn this way of converting
# data types; it is also used in machine learning.)
matches["Year"] = matches["Year"].astype(int)
# Keep only the part of Datetime before the first "-" (the date portion in
# values such as "13 Jul 1930 - 15:00").
matches["Datetime"] = matches["Datetime"].str.split("-").str[0]

# Map two spellings of the same stadium onto one name. regex=False makes these
# literal substring replacements regardless of the pandas version's default.
# NOTE(review): the second pattern contains U+FFFD replacement characters;
# presumably it mirrors mis-decoded text in the CSV — confirm against the
# file's encoding.
matches["Stadium"] = matches["Stadium"].str.replace(
    'Estadio do Maracana', "Maracanã Stadium", regex=False
)
matches["Stadium"] = matches["Stadium"].str.replace(
    'Maracan� - Est�dio Jornalista M�rio Filho', "Maracanã Stadium", regex=False
)

# Mean attendance per (Stadium, City) pair, highest first.
std = (
    matches.groupby(["Stadium", "City"])["Attendance"]
    .mean()
    .reset_index()
    .sort_values(by="Attendance", ascending=False)
)
plt.figure(figsize=(8
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。