赞
踩
数据分组(按照定类数据对定量数据进行切割,并输出到指定路径)
-
import pandas as pd

# Split the table by two categorical columns and write every group to its
# own Excel workbook.
df = pd.read_csv('hospital2_data1_pro.csv')
grouped = df.groupby(['既往应用节育器情况', '使用节育器型号情况'])
for group_name, group_data in grouped:
    # group_name is a tuple holding one value per grouping column; both
    # values are combined into the output file name.
    file_name = f"{group_name[0]}_{group_name[1]}.xlsx"
    file_path = f"D:\\Desktop\\{file_name}"
    group_data.to_excel(file_path, index=False)
快速数据处理(剔除、变换)
-
# Keep only the rows whose delivering city equals DC and whose receiving
# city equals RC.  `df`, `DC` and `RC` must already be defined.
mask = (df['Delivering city'] == DC) & (df['Receiving city'] == RC)
df = df.loc[mask]
快速数据处理_时间(提取)
-
import pandas as pd

# Inclusive bounds of the time window to extract.
startTime = '2022-07-01'
endTime = '2022-09-30'
start_date = pd.to_datetime(startTime)
end_date = pd.to_datetime(endTime)

# NOTE(review): the earlier version compared the 'Delivering city' and
# 'Receiving city' columns against the timestamps, which cannot select a
# time range.  The name of the real date column is not known here, so
# `date_column` is a placeholder -- set it to the datetime column of `df`.
date_column = 'Date'
dates = pd.to_datetime(df[date_column])

mask = (dates >= start_date) & (dates <= end_date)
df = df.loc[mask]
df.to_csv('result.csv', index=False)
折线图小技巧
-
# Rotate the x tick labels by 90 degrees and set their font size.
plt.xticks(rotation=90, fontsize=12)
# Place x ticks at positions 1, 6, 11, ... below len(ind).
plt.xticks(range(1, len(ind), 5))
# Enlarge the legend text.
plt.legend(fontsize=18)
# Write PreThreshold[maxs], rounded to two decimals, at the point
# (maxs, tAveData[maxs]).
plt.text(maxs, tAveData[maxs], round(PreThreshold[maxs], 2), fontsize=20)
缺失值处理
-
import pandas as pd

# Print the percentage of missing values in every column.
df = pd.read_csv('anormal.csv')
missing_values = df.isna().sum()
# No intermediate rounding: the f-string below already formats the value
# to two decimals, and rounding twice can shift the last digit.
missing_value_ratio = missing_values / len(df) * 100
for column, ratio in missing_value_ratio.items():
    print(f"{column}: {ratio:.2f}%")
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

# Use a font that contains CJK glyphs so the column names render.
matplotlib.rc("font", family='Microsoft YaHei')

df = pd.read_excel("附件.xlsx")

# Percentage of non-missing cells per column.
missing_counts = df.isnull().sum()
total_rows = len(df)
non_missing_percentages = (1 - (missing_counts / total_rows)) * 100

colors = ['#ff9999', '#66b3ff', '#99ff99', '#ffcc99', '#c2c2f0',
          '#ffb3e6', '#ffdd99', '#ff6666', '#c2f0c2', '#e6ac00']
positions = range(len(non_missing_percentages))

plt.figure(figsize=(10, 6))
plt.bar(positions, non_missing_percentages, color=colors)
plt.xticks(positions, df.columns, rotation=45, ha="right")

# Label every bar with its percentage, placed slightly above the bar top.
for i, percentage in enumerate(non_missing_percentages):
    plt.text(i, percentage + 1, f"{percentage:.0f}%", ha="center")

plt.tight_layout()
plt.show()
-
import pandas as pd
from pandas import DataFrame
import numpy as np
import numpy.random as npr
import matplotlib.pyplot as plt
from scipy.stats import norm

# Histogram of one numeric column with a fitted normal density on top.
data = pd.read_csv('订单1.csv')  # change the file name as needed
# Drop missing values first: NaN would break both the histogram range
# detection and the normal fit.
x = data['profit'].dropna().tolist()

plt.rcParams['figure.figsize'] = 10, 6
# density=True puts the histogram on the same scale as the density curve.
plt.hist(x, bins=50, density=True, color='SkyBlue', edgecolor='b', alpha=0.6)

# Evaluate the fitted normal density on a grid spanning the data range.
overlay = np.linspace(min(x), max(x), 2000)
mean, std = norm.fit(x)
pdf = norm.pdf(overlay, mean, std)
plt.plot(overlay, pdf, 'r-')

plt.savefig("profit.png")
-
import jieba
import wordcloud
import urllib.request
from imageio import imread

# Text to visualise.
with open('word.txt', 'r', encoding='utf-8') as infile:
    t = infile.read()

# Stop words, one per line.
with open("delete.txt", "rt", encoding="utf-8") as TextFile:
    stopwords = set(TextFile.read().splitlines())

# Image passed to WordCloud as the drawing mask.
mask = imread("background.jpg")

# Segment the text with jieba and join the tokens with spaces.  Joining
# with an empty string would rebuild the unsegmented text, so the word
# cloud could not tell the individual words apart.
ls = jieba.lcut(t)
txt = " ".join(ls)

w = wordcloud.WordCloud(
    font_path="msyh.ttc",
    mask=mask,
    width=1000,
    height=700,
    background_color="white",
    stopwords=stopwords,
)
w.generate(txt)
w.to_file("result.png")
-
import numpy as np

# Min-max scaling of `tAveData` (defined elsewhere): the smallest value maps
# to 0 and the largest to 1.  If all values are equal the denominator is 0.
minValue = np.min(tAveData)
maxValue = np.max(tAveData)
AveData = (tAveData - minValue) / (maxValue - minValue)
四、指标正向化处理代码
- import numpy as np
- import pandas as pd
-
def nor_interval(data):
    """Min-max scale *data* so its minimum maps to 0 and its maximum to 1.

    *data* must provide ``min()`` / ``max()`` and element-wise arithmetic
    (e.g. a pandas Series).  When every value is identical the denominator
    is zero, so the result is not meaningful.
    """
    min_val = data.min()
    max_val = data.max()
    normal = (data - min_val) / (max_val - min_val)
    return normal
-
def nor_max(data):
    """Scale *data* by its maximum, so the largest value becomes 1.

    *data* must provide ``max()`` and element-wise division (e.g. a pandas
    Series).  A maximum of zero makes the division undefined.
    """
    max_val = data.max()
    normal = data / max_val
    return normal
-
def nor_min(data):
    """Reverse min-max scaling: the minimum maps to 1 and the maximum to 0.

    *data* must provide ``min()`` / ``max()`` and element-wise arithmetic
    (e.g. a pandas Series).  When every value is identical the denominator
    is zero, so the result is not meaningful.
    """
    min_val = data.min()
    normal = 1 - (data - min_val) / (data.max() - min_val)
    return normal
-
def nor_zscore(data, bestnum):
    """Score *data* by closeness to the ideal value *bestnum*.

    Used for "intermediate-best" indicators: a value equal to *bestnum*
    scores 1 and the value farthest from it scores 0, computed as
    ``1 - |x - bestnum| / max|x - bestnum|``.

    The earlier formula shifted and rescaled ``(data - bestnum) / std``.
    That is a linear, monotonically increasing map of the data, so the
    highest score went to the largest value rather than to the value
    nearest *bestnum*.

    Args:
        data: values supporting element-wise arithmetic and ``max()``
            (e.g. a pandas Series).
        bestnum: the ideal value of the indicator.

    Returns:
        Scores in [0, 1], same shape as *data*.
    """
    deviation = abs(data - bestnum)
    max_deviation = deviation.max()
    if max_deviation == 0:
        # Every value already equals bestnum: all of them are optimal.
        return deviation * 0 + 1.0
    return 1 - deviation / max_deviation
-
infodata = pd.read_csv('1_390.csv')
xname = infodata.columns
# Indicator type per column: 1 - interval, 2 - larger-is-better,
# 3 - smaller-is-better, 4 - intermediate-best.  Entry 0 belongs to the
# first column, which the loop below skips.
# (Named col_types so the builtin `type` is not shadowed.)
col_types = [-1, 1, 2, 2, 4, 3, 3, 3]
data = pd.DataFrame()
for i in range(1, len(col_types)):
    column = xname[i]
    kind = col_types[i]
    if kind == 1:
        data[column] = nor_interval(infodata[column])
    elif kind == 2:
        data[column] = nor_max(infodata[column])
    elif kind == 3:
        data[column] = nor_min(infodata[column])
    elif kind == 4:
        # 40 is passed as the ideal value of the intermediate-best column.
        data[column] = nor_zscore(infodata[column], 40)

data.to_excel('result.xlsx', index=False)
-
import numpy as np
import pandas as pd
import csv


def txt_to_csv(txt_path, csv_path):
    """Convert a whitespace-delimited text file into a CSV file.

    Every input line is split on runs of whitespace (spaces or tabs) and
    written as one CSV row in the ``excel`` dialect.  A blank input line
    produces an empty row.

    Args:
        txt_path: path of the text file to read.
        csv_path: path of the CSV file to create or overwrite.
    """
    # `with` closes both files even if a row fails to write.  newline=''
    # lets the csv module control the line endings itself.
    with open(txt_path, "r") as src, open(csv_path, "w", newline="") as out:
        csv_writer = csv.writer(out, dialect="excel")
        for line in src:
            # split() already treats spaces and tabs alike, so no prior
            # replacement of separators is needed.
            csv_writer.writerow(line.split())


if __name__ == "__main__":
    txt_to_csv("train_data.txt", "train_data.csv")
Spearman相关系数
-
from scipy.stats import spearmanr

# Spearman rank correlation between two equally long sequences
# (`PreThreshold` and `AveDataSquare` are defined elsewhere).
corr, pValue = spearmanr(PreThreshold, AveDataSquare)
print("斯皮尔曼相关系数:", corr, " P值:", pValue)
Pearson相关系数
-
from scipy.stats import pearsonr

# Pearson correlation between two equally long sequences
# (`A` and `B` are defined elsewhere).
corr, pValue = pearsonr(A, B)
print("皮尔逊相关系数:", corr, " P值:", pValue)
具体相关实例
-
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

# Use a font that contains CJK glyphs so the variable names render.
matplotlib.rc("font", family='Microsoft YaHei')

infodata = pd.read_excel('data2_Stand.xlsx')
xname = infodata.columns
# Columns 1-4 are each correlated against every column from index 5 on.
txname = list(xname[5:])

df = pd.DataFrame()
df_result = pd.DataFrame()
df_result['variable'] = txname
for i in range(5, len(xname)):
    # Index by the label itself so non-string column labels also work.
    df['X' + str(i - 4)] = infodata[xname[i]].tolist()

# Spearman correlation of every X column with each of the four targets.
linelst = []
for i in range(1, 5):
    df['self'] = infodata[xname[i]].tolist()
    Hotdata = df.corr(method='spearman')
    # 'self' is the last column; drop its correlation with itself.
    line = Hotdata['self'].tolist()[0:-1]
    linelst.append(line)
    df_result[str(xname[i])] = line

# A single figure with four side-by-side panels.  Creating several figures
# and then switching back to the first one left empty windows on show().
plt.style.use('ggplot')
fig, axes = plt.subplots(ncols=4, figsize=(20, 14))
ind = np.arange(0, len(txname))
for panel, (ax, line) in enumerate(zip(axes, linelst)):
    # Only the first panel shows the variable names; the others use indices.
    labels = txname if panel == 0 else ind
    ax.barh(labels, line, color='LightSkyBlue', edgecolor='b', alpha=0.8)
    ax.tick_params(axis='y', labelsize=4 if panel == 0 else 6)
    ax.tick_params(axis='x', labelsize=6)
plt.show()

df_result.to_excel('result.xlsx')
-
import math

import pandas as pd

# Load the data set.
data = pd.read_excel('Monohulled_Sailboats.xlsx')

# Print the percentage of missing values in every column.
print("The proportion of missing values per column:")
null_percentages = (data.isnull().sum() / len(data)) * 100
print(null_percentages.apply(lambda x: '{:.2f}%'.format(x)))

# Keep only the columns in which at least 90% of the rows are non-missing;
# columns with more than 10% missing values are dropped.  `thresh` is a
# count of non-missing values, so the fractional bound is rounded up to an
# integer (counts are whole numbers, so the cut-off is unchanged).
# NOTE(review): an earlier comment described dropping the missing *rows* of
# low-missing columns instead -- confirm which behaviour is wanted.
clean_data = data.dropna(thresh=math.ceil(len(data) * 0.9), axis=1)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。