当前位置:   article > 正文

python 数据分析_python处理数据demo

python处理数据demo

数据分组(按照定类数据对定量数据进行切割,并输出到指定路径)

  import pandas as pd

  # Split the table by every combination of the two categorical columns and
  # write each group to its own Excel workbook.
  df = pd.read_csv('hospital2_data1_pro.csv')
  grouped = df.groupby(['既往应用节育器情况', '使用节育器型号情况'])
  for group_name, group_data in grouped:
      # group_name is a tuple holding one value per grouping column.
      file_name = f"{group_name[0]}_{group_name[1]}.xlsx"
      file_path = f"D:\\Desktop\\{file_name}"
      group_data.to_excel(file_path, index=False)

快速数据处理(剔除、变换)

  # Keep only the rows whose 'Delivering city' equals DC and whose
  # 'Receiving city' equals RC. df, DC and RC must be defined beforehand.
  mask = (df['Delivering city'] == DC) & (df['Receiving city'] == RC)
  df = df.loc[mask]

快速数据处理_时间(提取)

  import pandas as pd

  # Keep the rows whose timestamp lies inside [startTime, endTime] (inclusive)
  # and save them. df must have been loaded beforehand.
  startTime = '2022-07-01'
  endTime = '2022-09-30'
  start_date = pd.to_datetime(startTime)
  end_date = pd.to_datetime(endTime)
  # NOTE(review): the original listing compared the 'Delivering city' and
  # 'Receiving city' columns with the dates, which looks like a copy-paste
  # slip from the city filter. The real date column is not shown anywhere in
  # the article, so the name below is a placeholder.
  time_col = 'date'  # TODO: set to the name of the date column in df
  timestamps = pd.to_datetime(df[time_col])
  mask = (timestamps >= start_date) & (timestamps <= end_date)
  df = df.loc[mask]
  df.to_csv('result.csv', index=False)

折线图小技巧

  # Rotate the x tick labels by 90 degrees and set their font size.
  plt.xticks(rotation=90, fontsize=12)
  # Put a tick on every 5th position, starting at 1 (ind is defined elsewhere).
  plt.xticks(range(1, len(ind), 5))
  plt.legend(fontsize=18)
  # Label the point (maxs, tAveData[maxs]) with PreThreshold[maxs],
  # rounded to two decimal places.
  plt.text(maxs, tAveData[maxs], round(PreThreshold[maxs], 2), fontsize=20)

缺失值处理

  import pandas as pd

  # Print the percentage of missing cells in every column of the CSV file.
  df = pd.read_csv('anormal.csv')
  # Number of missing cells per column.
  missing_values = df.isna().sum()
  # Fraction of rows that are missing, rounded to 4 decimals, as a percentage.
  missing_value_ratio = (missing_values / len(df)).round(4) * 100
  for column, ratio in missing_value_ratio.items():
      print(f"{column}: {ratio:.2f}%")
  import pandas as pd
  import matplotlib.pyplot as plt
  import matplotlib

  # Draw a bar chart showing, for every column, the percentage of rows that
  # are NOT missing.
  # Presumably selected so that the Chinese column names render -- confirm the
  # font is installed on the target machine.
  matplotlib.rc("font", family='Microsoft YaHei')
  df = pd.read_excel("附件.xlsx")
  missing_counts = df.isnull().sum()
  total_rows = len(df)
  non_missing_percentages = (1 - (missing_counts / total_rows)) * 100
  colors = ['#ff9999', '#66b3ff', '#99ff99', '#ffcc99', '#c2c2f0',
            '#ffb3e6', '#ffdd99', '#ff6666', '#c2f0c2', '#e6ac00']
  plt.figure(figsize=(10, 6))
  plt.bar(range(len(non_missing_percentages)), non_missing_percentages, color=colors)
  plt.xticks(range(len(non_missing_percentages)), df.columns, rotation=45, ha="right")
  for i, percentage in enumerate(non_missing_percentages):
      # Write the value just above the top of each bar.
      plt.text(i, percentage + 1, f"{percentage:.0f}%", ha="center")
  plt.tight_layout()
  plt.show()
一、频数直方图(换算成比率)
  import pandas as pd
  from pandas import DataFrame
  import numpy as np
  import numpy.random as npr
  import matplotlib.pyplot as plt
  from scipy.stats import norm

  # Plot a density-normalised histogram of the 'profit' column and overlay the
  # fitted normal probability density function.
  data = pd.read_csv('订单1.csv')  # just change the file name
  # Drop missing values first: NaN would break min/max, the histogram range
  # and the normal fit below.
  x = data['profit'].dropna().tolist()
  plt.rcParams['figure.figsize'] = 10, 6
  # density=True scales the bars so the total area is 1, which makes them
  # comparable with the fitted pdf.
  plt.hist(x, bins=50, density=True, color='SkyBlue', edgecolor='b', alpha=0.6)
  overlay = np.linspace(min(x), max(x), 2000)
  mean, std = norm.fit(x)
  pdf = norm.pdf(overlay, mean, std)
  plt.plot(overlay, pdf, 'r-')
  plt.savefig("profit.png")
二、词云图
  import jieba
  import wordcloud
  import urllib.request
  from imageio import imread

  # Build a word cloud image from word.txt, shaped by background.jpg.
  # Context managers make sure both text files are closed after reading.
  with open('word.txt', 'r', encoding='utf-8') as infile:
      t = infile.read()
  # One stop word per line.
  with open("delete.txt", "rt", encoding="utf-8") as TextFile:
      stopwords = set(TextFile.read().splitlines())
  mask = imread("background.jpg")
  ls = jieba.lcut(t)
  # Join the segmented words with spaces: WordCloud splits its input with a
  # word regex, so joining with "" would glue the segments back together and
  # throw away the jieba segmentation.
  txt = " ".join(ls)
  # NOTE: per the WordCloud documentation, width/height are ignored when a
  # mask is supplied; the mask's shape is used instead.
  w = wordcloud.WordCloud(font_path="msyh.ttc", mask=mask, width=1000, height=700,
                          background_color="white", stopwords=stopwords)
  w.generate(txt)
  w.to_file("result.png")
三、数据标准化处理
  import numpy as np

  # Min-max scaling of tAveData (defined elsewhere): the minimum maps to 0 and
  # the maximum maps to 1.
  minValue = np.min(tAveData)
  maxValue = np.max(tAveData)
  # NOTE: when all values are equal the denominator is zero.
  AveData = (tAveData - minValue) / (maxValue - minValue)

 四、指标正向化处理代码

  1. import numpy as np
  2. import pandas as pd
  def nor_interval(data):
      """Rescale *data* linearly so its minimum maps to 0 and its maximum to 1.

      Args:
          data: Numeric pandas Series (or any object with ``min``/``max`` and
              element-wise arithmetic).

      Returns:
          ``(data - min) / (max - min)``, same shape as *data*. When all values
          are equal the denominator is zero and the result is not finite.
      """
      min_val = data.min()
      max_val = data.max()
      normal = (data - min_val) / (max_val - min_val)
      return normal
  def nor_max(data):
      """Scale a larger-is-better indicator by dividing by its maximum.

      Args:
          data: Numeric pandas Series (or any object with ``max`` and
              element-wise division).

      Returns:
          ``data / data.max()``, same shape as *data*; the largest value maps
          to 1.
      """
      max_val = data.max()
      normal = data / max_val
      return normal
  def nor_min(data):
      """Turn a smaller-is-better indicator into a larger-is-better score.

      Args:
          data: Numeric pandas Series (or any object with ``min``/``max`` and
              element-wise arithmetic).

      Returns:
          ``1 - (data - min) / (max - min)``, same shape as *data*: the
          smallest value maps to 1 and the largest to 0. When all values are
          equal the denominator is zero and the result is not finite.
      """
      min_val = data.min()
      max_val = data.max()
      normal = 1 - (data - min_val) / (max_val - min_val)
      return normal
  def nor_zscore(data, bestnum):
      """Score an intermediate-type indicator: closer to *bestnum* is better.

      The original formula, ``(z + |z.min()|) / (z.max() - z.min())`` with
      ``z = (data - bestnum) / std``, reduces to plain min-max scaling whenever
      ``bestnum >= data.min()``, so *bestnum* had no effect on the result. This
      version uses the usual intermediate-type transformation instead.

      Args:
          data: Numeric pandas Series or NumPy array.
          bestnum: The ideal value of the indicator.

      Returns:
          ``1 - |data - bestnum| / max(|data - bestnum|)``, same shape as
          *data*: values equal to *bestnum* score 1 and the value farthest
          from it scores 0. If every value equals *bestnum*, all scores are 1.
      """
      deviation = abs(data - bestnum)
      max_dev = deviation.max()
      if max_dev == 0:
          # Every value already sits on the ideal point; avoid 0 / 0.
          return deviation + 1.0
      return 1 - deviation / max_dev
  # Apply the matching normalisation to every indicator column and save the
  # result to an Excel workbook.
  infodata = pd.read_csv('1_390.csv')
  xname = infodata.columns
  # One code per column: 1 = interval type, 2 = larger-is-better,
  # 3 = smaller-is-better, 4 = intermediate type. The entry for column 0 (-1)
  # is never read because the loop starts at index 1.
  # Named indicator_types rather than `type` to avoid shadowing the builtin.
  indicator_types = [-1, 1, 2, 2, 4, 3, 3, 3]
  data = pd.DataFrame()
  for i in range(1, len(indicator_types)):
      column_name = xname[i]
      column = infodata[column_name]
      kind = indicator_types[i]
      if kind == 1:
          data[column_name] = nor_interval(column)
      elif kind == 2:
          data[column_name] = nor_max(column)
      elif kind == 3:
          data[column_name] = nor_min(column)
      elif kind == 4:
          # 40 is passed as the ideal value for the intermediate-type indicator.
          data[column_name] = nor_zscore(column, 40)
  data.to_excel('result.xlsx', index=False)
五、文件类型转化
  import numpy as np
  import pandas as pd
  import csv

  # Convert the whitespace-delimited text file train_data.txt into
  # train_data.csv (the output file is created or overwritten).
  # The context manager closes both files, which guarantees that the rows
  # written through the csv writer are flushed to disk.
  # newline='' lets the csv module control the line endings itself.
  with open("train_data.txt", "r") as f, open('train_data.csv', 'w', newline='') as out:
      csv_writer = csv.writer(out, dialect='excel')
      for line in f:
          # split() with no argument splits on any run of whitespace (spaces
          # and tabs alike), so each field becomes one CSV cell.
          csv_writer.writerow(line.split())
六、计算相关性系数(Spearman和Pearson)

Spearman相关系数

  from scipy.stats import spearmanr

  # Spearman rank correlation between two sequences defined elsewhere;
  # spearmanr returns the coefficient and its p-value.
  corr, pValue = spearmanr(PreThreshold, AveDataSquare)
  print("斯皮尔曼相关系数:", corr, " P值:", pValue)

Pearson相关系数

  from scipy.stats import pearsonr

  # Pearson correlation between two sequences defined elsewhere;
  # pearsonr returns the coefficient and its p-value.
  corr, pValue = pearsonr(A, B)
  print("皮尔逊相关系数:", corr, " P值:", pValue)

具体相关实例

  import pandas as pd
  import numpy as np
  import matplotlib
  import matplotlib.pyplot as plt

  # For each of columns 1-4 of the workbook, compute the Spearman correlation
  # with every column from index 5 onwards, draw the four sets of coefficients
  # as horizontal bar charts side by side, and save them to result.xlsx.
  matplotlib.rc("font", family='Microsoft YaHei')
  infodata = pd.read_excel('data2_Stand.xlsx')
  xname = infodata.columns
  # Names of the columns from index 5 onwards.
  txname = list(xname[5:])

  df = pd.DataFrame()
  df_result = pd.DataFrame()
  df_result['variable'] = txname
  for i in range(5, len(xname)):
      # Those columns are stored as X1, X2, ...
      df['X' + str(i - 4)] = infodata[xname[i]].tolist()

  ind = np.arange(0, len(txname))
  linelst = []
  for i in range(1, 5):
      # 'self' is overwritten on every pass with the next of columns 1-4.
      df['self'] = infodata[xname[i]].tolist()
      Hotdata = df.corr(method='spearman')
      line = Hotdata['self'].tolist()
      # 'self' is the last column, so the last entry is its correlation with
      # itself; drop it.
      line = line[0:-1]
      linelst.append(line)
      df_result[str(xname[i])] = line

  # Create exactly one figure with four side-by-side panels. The original
  # listing opened two extra empty figures and discarded the result of
  # plt.subplots.
  plt.style.use('ggplot')
  fig, axes = plt.subplots(ncols=4, figsize=(20, 14))
  for panel, ax in enumerate(axes):
      # Only the first panel uses the variable names as y labels; the others
      # use the numeric positions.
      y_values = txname if panel == 0 else ind
      ax.barh(y_values, linelst[panel], color='LightSkyBlue', edgecolor='b', alpha=0.8)
      ax.tick_params(axis='y', labelsize=4 if panel == 0 else 6)
      ax.tick_params(axis='x', labelsize=6)
  plt.show()
  df_result.to_excel('result.xlsx')
七、数据缺失值剔除
  import pandas as pd

  # Load the data set.
  data = pd.read_excel('Monohulled_Sailboats.xlsx')
  # Compute and print the percentage of missing values in each column.
  print("The proportion of missing values per column:")
  null_percentages = (data.isnull().sum() / len(data)) * 100
  print(null_percentages.apply(lambda x: '{:.2f}%'.format(x)))
  # Keep only the columns in which at least 90% of the rows are non-missing,
  # i.e. drop every column whose missing ratio exceeds 10%.
  # NOTE(review): the original comment described removing the missing values
  # of columns with less than 10% missing, but this call removes whole columns
  # and deletes no rows; the kept columns may still contain missing cells.
  clean_data = data.dropna(thresh=len(data)*0.9, axis=1)

 

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/盐析白兔/article/detail/282695
推荐阅读
相关标签
  

闽ICP备14008679号