赞
踩
pandas 是处理结构化数据的利器,利用 Python 的数据以及数据结构完成对结构化数据的处理和分析功能。
处理结构化数据集所需的工具集:Series 和 DataFrame
import pandas as pd  # use pandas

# 1. Create a Series: from a list with an explicit index, and from a dict
#    (the dict keys become the index).
s1 = pd.Series([1, 2, 3, 4], index=("a", "b", "c", "d"))
s2 = pd.Series({"a": 1, "b": 2, "c": 3, "d": 4})
print(s2)
print(":" * 100)
print(s1)  # shows the index, the data and the dtype
print(type(s1))  # <class 'pandas.core.series.Series'>

# 2. Query by index label
print(s1["a"])

# 3. Update by index label
s1["a"] = 100
print(s1)

# 4. Print attribute information
print(s2.index)  # Index(['a', 'b', 'c', 'd'], dtype='object')
print(s2.values)  # [1 2 3 4]
print(s2.shape)  # (4,)
print(s2.ndim)  # 1
print(s2.size)  # 4
print(s2.dtype)  # int64
print(s2.head())  # first 5 rows

# The index can carry a name ...
s2.index.name = "ok"
print(s2)
# ok
# a    1
# b    2
# c    3
# d    4
# dtype: int64
print(s2.index.name)

# ... and so can the Series itself (the values array has no name attribute).
s2.name = "hello"
print(s2)
# ok
# a    1
# b    2
# c    3
# d    4
# Name: hello, dtype: int64

s3 = pd.Series([-1, -2, 3, -1, -4, 3, 0, 0, 1])
print(s3)
print(s3.unique())  # [-1 -2  3 -4  0  1] -- de-duplicates only
dataframe是数据框? ===> DataFrame是一个表格型的数据结构,它含有一组有序的列,每列可以是不同类型的值。DataFrame既有行索引也有列索引,它可以被看做是由Series组成的字典(共用同一个索引),数据是以二维结构存放的。
dataFrame是处理二维以上数据组织形式
pd.DataFrame方式进行定义,指定行row或列columns
增删改查部分操作
iloc和loc方法(掌握)
总结:可以输入给DataFrame构造器数据的有如下类型:
(1)二维ndarray
(2)由数组、列表、元组组成的字典
(3)由Series组成的字典
(4)由字典组成的字典
(5)另外一个DataFrame
import numpy as np
import pandas as pd

# Build a DataFrame from nested lists with explicit row and column labels.
df1 = pd.DataFrame(
    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
    index=("a", "b", "c"),
    columns=("A", "B", "C"),
)
print(df1)
#    A  B  C
# a  1  2  3
# b  4  5  6
# c  7  8  9

df2 = pd.DataFrame(
    np.random.randn(4, 4),
    index=("a", "b", "c", "d"),
    columns=("A", "B", "C", "D"),
)
print(df2)

# Attribute information
print(df1.shape)
print(df1.head(1))
print(df1.ndim)
df1.info()  # important summary; info() prints by itself and returns None
# <class 'pandas.core.frame.DataFrame'>
# Index: 3 entries, a to c
# Data columns (total 3 columns):
# A    3 non-null int64
# B    3 non-null int64
# C    3 non-null int64
# dtypes: int64(3)
# memory usage: 96.0+ bytes
print(df1.columns)  # Index(['A', 'B', 'C'], dtype='object')
print(df1.index)  # Index(['a', 'b', 'c'], dtype='object')

# Query
print(df1["A"])  # select a column with []
print("::" * 100)
print(df1.A)  # attribute access to a column
print("::" * 100)

# The old .ix indexer has been removed from pandas; use .loc for labels and
# .iloc for positions instead.
print(df1.loc["a", "A"])  # single cell by row label and column label
print("::" * 100)

# loc selects by row/column labels
print(df1.loc[:, "A"])
print(df1.loc["a", :])
print(df1.loc[:, :])
print("::" * 100)

# iloc selects by integer position
print(df1.iloc[0, 0])
print(df1.iloc[:, 0])
print(df1.iloc[0, :])
print("+" * 100)
print(df1.iloc[0:2])  # iloc slices include the start but exclude the stop
#    A  B  C
# a  1  2  3
# b  4  5  6
print(df1.iloc[0:2, 0:2])
print(df1.loc["a":"c"])  # loc slices include both endpoints
print(df1.loc["a":"b"])
print(df1.loc["a":"b", "A":"B"])
print(df1.loc[:, lambda df: ["A", "B"]])  # the callable receives the frame
print("+" * 100)

# Date index
df = pd.DataFrame(
    np.random.randn(5, 4),
    columns=list("ABCD"),
    index=pd.date_range("20190101", periods=5),  # periods=5: five days
)
print(df)
# Example output (values are random):
#                    A         B         C         D
# 2019-01-01  1.075770 -0.109050  1.643563 -1.469388
# 2019-01-02  0.357021 -0.674600 -1.776904 -0.968914
# 2019-01-03 -1.294524  0.413738  0.276662 -0.472035
# 2019-01-04 -0.013960 -0.362543 -0.006154 -0.923061
# 2019-01-05  0.895717  0.805244 -1.206412  2.565646
print(df.loc["20190102":"20190104"])
print(df.loc["20190102":"20190104", "A":"C"])

# Update a single cell
df1.loc["a", "A"] = 100
print(df1)

# Delete: `del` removes a whole column; it cannot remove a row or a single
# value.
del df1["A"]
print(df1)

# Add a column
df1["D"] = [1, 2, 3]
print(df1)
import pandas as pd

# Note 1: Index objects are immutable.
obj = pd.Series(range(3), index=["a", "b", "c"])
index = obj.index
print(index)
print(index[1:])
try:
    index[1] = "d"  # raises: an Index does not support item assignment
except TypeError as exc:
    # Immutability is what lets one Index object be shared safely between
    # several data structures.
    print(exc)

# Note 2: re-indexing a pandas object.
obj = pd.Series(range(3), index=["a", "b", "c"])
obj2 = obj.reindex(["c", "a", "b", "f"])  # new label 'f' becomes NaN
obj3 = obj.reindex(["c", "a", "b", "f"], fill_value=0)
obj4 = obj.reindex(["c", "a", "b", "f"], method="ffill")
# `method` options:
#   ffill or pad      -- fill forward
#   bfill or backfill -- fill backward

# Note 3: drop entries from an axis by index label.
obj = pd.Series(range(3), index=["a", "b", "c"])
new_obj = obj.drop("c")
print(new_obj)
new_obj2 = obj.drop(["a", "b"])
import numpy as np
import pandas as pd

# Arithmetic aligns on labels: labels present on only one side give NaN with
# the `+` operator, while `.add(..., fill_value=...)` substitutes the fill
# value for the missing side first.
s1 = pd.Series(range(5), index=range(5))
s2 = pd.Series(range(10), index=range(10))
series_sum = s1 + s2
print(series_sum)
series_sum_filled = s1.add(s2, fill_value=100)
print(series_sum_filled)

# The same alignment applies to DataFrames, on both rows and columns.
df1 = pd.DataFrame(np.ones((2, 2)), columns=("a", "b"))
df2 = pd.DataFrame(np.ones((3, 3)), columns=("a", "b", "c"))
frame_sum = df1 + df2
print(frame_sum)
frame_sum_filled = df1.add(df2, fill_value=100)
print(frame_sum_filled)
import numpy as np
import pandas as pd

print(np.random.randn(3))
df1 = pd.DataFrame(
    [np.random.randn(3), [np.nan, 2, 2], [1, np.nan, 3]],
    columns=("A", "B", "C"),
    index=("a", "b", "c"),
)
print(df1)

# isnull: boolean mask of missing values
print(df1.isnull())

# fillna: replace missing values
print(df1.fillna(100))

# dropna
print(df1.dropna(axis=0))  # drop rows containing NaN
# Example output (values are random); only row 'a' has no NaN:
#          A         B         C
# a -0.85841 -0.647983 -1.414457
print(df1.dropna(axis=1))  # drop columns containing NaN
# Only column 'C' has no NaN:
#           C
# a -1.414457
# b  2.000000
# c  3.000000

# drop: drop specified labels from rows or columns.
print(np.random.randn(3))
df1 = pd.DataFrame(
    [np.random.randn(3), [np.nan, 2, 2], [1, np.nan, 3]],
    columns=("A", "B", "C"),
    index=("a", "b", "c"),
)
print(df1.drop(["A"], axis=1))  # drop column 'A'
# df1.drop([0]) would fail: the row labels are 'a', 'b', 'c', not integers.
print(df1.drop(["a"]))  # drop row 'a'
import numpy as np
import pandas as pd

# A Series with a two-level (hierarchical) index: outer letters, inner numbers.
s1 = pd.Series(
    np.random.randn(12),
    index=[
        ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d", "d"],
        [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3],
    ],
)
print(s1)

# Address one element with the full (outer, inner) key. Chained indexing such
# as s1["a"][1] = 100 assigns into a temporary object and is not guaranteed to
# modify s1, so a single .loc call is used instead.
print(s1.loc[("a", 1)])
s1.loc[("a", 1)] = 100
print(s1)

# Swap the two index levels
print(s1.swaplevel())

# Swap the two index levels and sort. Series.sortlevel() has been removed
# from pandas; sort_index() is the replacement.
print(s1.swaplevel().sort_index())
# Row-wise / column-wise calculations and descriptive statistics
import numpy as np
import pandas as pd

df1 = pd.DataFrame(
    np.random.randn(5, 4),
    index=("a", "b", "c", "d", "e"),
    columns=("A", "B", "C", "D"),
)
print(df1)

# sum, mean, max, min, ...
#   axis=0 (default): aggregate down the rows -> one result per column
#   axis=1:           aggregate across the columns -> one result per row
#   skipna:           exclude missing values, True by default
print(df1.sum(axis=1))
print(df1.sum(axis=0))
print(df1.mean(axis=1))
print(df1.mean(axis=0))
print(df1.max(axis=0))

# describe: summary statistics for each column
print(df1.describe())
# Example output (values are random):
#               A         B         C         D
# count  5.000000  5.000000  5.000000  5.000000   number of samples
# mean  -0.196643  0.510580 -0.076933  0.144531   mean
# std    0.759045  0.882974  0.542685  1.179448   standard deviation
# min   -1.342553 -0.751580 -0.815040 -1.210786   minimum
# 25%   -0.460303  0.352631 -0.388602 -0.983421   first quartile
# 50%   -0.137811  0.460187  0.049356  0.623713   median
# 75%    0.422771  0.789292  0.168337  0.816850   third quartile
# max    0.534681  1.702369  0.601287  1.476297   maximum
print(df1)

# query: conditional selection with a boolean expression
print(df1.query("A>B"))

# count: number of non-NaN values
print(df1.count())

# Other statistics:
#   min / max        smallest / largest value
#   idxmin / idxmax  index label of the smallest / largest value
#                    (the DataFrame counterpart of argmin / argmax)
#   quantile         sample quantile, q in [0, 1]
print(df1.quantile(0.65))
#   mean             mean
print(df1.mean(axis=0))
#   median           median
#   var              sample variance
#   std              sample standard deviation
#   skew             sample skewness (third moment); 0 for a normal distribution
#   kurt             sample kurtosis (fourth moment); pandas reports excess
#                    kurtosis, so a normal distribution gives 0 rather than 3
#   cumsum           cumulative sum
print(df1.cumsum())
pd.read_csv(file,sep="")
import pandas as pd

# Load the sample data set. The file is expected to have the columns
# height, house, car, handsome, job, is_date (see the sample output below).
file = pd.read_csv("./SklearnTest.txt")
print(file)

# 1. Select the height column
print(file["height"])
print(file.height)

# 2. Select the height and house columns.
#    The old .ix indexer has been removed from pandas; use .loc / .iloc.
print(file.iloc[:, 0:2])
print(file.loc[:, "height":"house"])

# 3. Select rows 0-5. iloc excludes the stop position while loc includes the
#    stop label, hence 0:6 versus 0:5.
print(file.iloc[0:6, :])
print(file.loc[0:5, :])

# 4. Select the feature columns and the class-label column
X1 = file.loc[:, "height":"job"]
print(X1)
print(type(X1))
Y1 = file.loc[:, "is_date"]
print(Y1)
print(type(Y1))  # <class 'pandas.core.series.Series'>

# Select just the columns that are needed
print(file[["height", "car"]])

# 5. Split on the value of is_date
new_Date = file.query("is_date==-1")
data = file.query("is_date!=-1")
print(new_Date)
#    height  house  car  handsome  job  is_date
# 8    1.65      0    1       6.6    0       -1
print(data)
#    height  house  car  handsome  job  is_date
# 0    1.80      1    0       6.5    2        1
# 1    1.62      1    0       5.5    0        1
# 2    1.71      0    1       8.5    1        1
# 3    1.58      1    1       6.3    1        1
# 4    1.68      0    1       5.1    0        0
# 5    1.63      1    0       5.3    1        0
# 6    1.78      0    0       4.5    0        0
# 7    1.64      0    0       7.8    2        0

# 6. Split the cleaned data into X and y
X = data.iloc[:, 0:5]
y = data.iloc[:, 5]
print(X)
print(type(X))  # <class 'pandas.core.frame.DataFrame'>
print(y)
print(type(y))  # <class 'pandas.core.series.Series'>

# Equivalent way to get X: drop the label column
X = data.drop("is_date", axis=1)
print(X)
print(type(X))  # <class 'pandas.core.frame.DataFrame'>

# Ways to reach the label column: data["is_date"], data.is_date,
# data.loc[...], data.iloc[...]

# Write the features out and read them back. to_csv also writes the row
# index as the first column, so it is read back as the index.
X.to_csv("result.txt", sep=",")
datafile = pd.read_csv("result.txt", index_col=0)
print(datafile)
import pandas as pd

# Round-trip a small DataFrame through a pickle file.
# NOTE: only unpickle files from a trusted source.
original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
print(original_df)

original_df.to_pickle("./dummy.pkl")

unpickled_df = pd.read_pickle("./dummy.pkl")
print(unpickled_df)
import numpy as np
import pandas as pd

# Binning continuous values with pd.cut (equal-width bins) and pd.qcut
# (equal-frequency bins).
samples = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1])

# retbins=True also returns the computed bin edges.
print(pd.cut(samples, bins=3, retbins=True))
# (0.19, 3.367],(3.367, 6.533], (6.533, 9.7]
# ([(0.191, 3.367], (0.191, 3.367], (0.191, 3.367], (3.367, 6.533],
# (6.533, 9.7], (0.191, 3.367]]
# Categories (3, object): [(0.191, 3.367] < (3.367, 6.533] < (6.533, 9.7]],
# array([ 0.1905 , 3.36666667, 6.53333333, 9.7 ]))

# Name the bins instead of showing intervals (result is not printed here).
pd.cut(samples, bins=3, labels=["good", "medium", "bad"])
# [good, good, good, medium, bad, good]
# Categories (3, object): [good < medium < bad]

# labels=False returns the integer bin codes (result is not printed here).
pd.cut(np.ones(5), bins=4, labels=False)
# array([1, 1, 1, 1, 1], dtype=int64)

print(pd.qcut(range(5), q=3, labels=["good", "medium", "bad"]))
import numpy as np
import pandas as pd

# Concatenating Series
s1 = pd.Series(['a', 'b'])
s2 = pd.Series(['c', 'd'])
print(s1)
print(s2)
print(pd.concat([s1, s2], axis=0))  # stack vertically into one Series
print(pd.concat([s1, s2], axis=1))  # place side by side as two columns

# Concatenating DataFrames
df1 = pd.DataFrame([['a', 1], ['b', 2]], columns=['letter', 'number'])
print(df1)
df2 = pd.DataFrame([['c', 3], ['d', 4]], columns=['letter', 'number'])
print(df2)
# ignore_index=True renumbers the labels along the concatenation axis.
print(pd.concat([df1, df2], ignore_index=True))
print(pd.concat([df1, df2], ignore_index=True, axis=1))

# NumPy counterparts on the underlying values:
# print(np.vstack((df1, df2)))
# print(np.hstack((df1, df2)))
# One-hot encoding
import pandas as pd

s = pd.Series(list('abca'))
print(s)
# 0    a
# 1    b
# 2    c
# 3    a
print(pd.get_dummies(s))
# Indicator columns (shown as 0/1; recent pandas versions print True/False):
#    a  b  c
# 0  1  0  0
# 1  0  1  0
# 2  0  0  1
# 3  1  0  0

df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
                   'C': [1, 2, 3]})
print(df)
#    A  B  C
# 0  a  b  1
# 1  b  a  2
# 2  a  c  3
print(pd.get_dummies(df, prefix=['col1', 'col2']))
# prefix: a string, a list of strings, or a dict of strings, default None.
# It is prepended to the generated column names. When calling get_dummies on
# a DataFrame, pass a list whose length equals the number of columns being
# encoded, or a dict mapping column names to prefixes.
#    C  col1_a  col1_b  col2_a  col2_b  col2_c
# 0  1       1       0       0       1       0
# 1  2       0       1       1       0       0
# 2  3       1       0       0       0       1

# See also pd.factorize(), which encodes values as integer codes and returns
# the codes together with the unique values.
import pandas as pd

# Strings that look like numbers are converted; the result dtype is float64.
s = pd.Series(['1.0', '2', -3])
print(pd.to_numeric(s))

s = pd.Series(['apple', '1.0', '2', -3])
# With the default errors='raise', the unparseable 'apple' raises an error.
# errors='ignore' (return the input unchanged on failure) is deprecated in
# pandas, so the exception is caught explicitly to get the same effect.
try:
    unchanged = pd.to_numeric(s)
except (ValueError, TypeError):
    unchanged = s
print(unchanged)

# errors='coerce' turns unparseable values into NaN.
coerced = pd.to_numeric(s, errors='coerce')
print(coerced)
聚合操作
# Homework: learn how groupby is used.
# General forms:
#   data.groupby(func).mean()
#   data.groupby(['col1', 'col2'])['col3'].mean()
import numpy as np
import pandas as pd

data = pd.DataFrame(
    [[1, 2, 3], [1, 3, 3], [4, 5, 6], [4, 3, 6]],
    columns=["one", "two", "three"],
)
print(data)
#    one  two  three
# 0    1    2      3
# 1    1    3      3
# 2    4    5      6
# 3    4    3      6

# Selecting a column from a groupby gives a lazy SeriesGroupBy object.
print(data.groupby(by=["one"])["two"])
# <pandas.core.groupby.groupby.SeriesGroupBy object at 0x0000003EDB248748>

# Mean of column "two" within each group of "one"
print(data.groupby(by=["one"])["two"].mean())
# one
# 1    2.5
# 4    4.0
# Name: two, dtype: float64

# Equivalent spelling: group the column by another Series
print(data["two"].groupby(by=data["one"]).mean())
# one
# 1    2.5
# 4    4.0
# Name: two, dtype: float64
# Grouping by several keys: data.groupby(['one', 'two']).mean()

# Group by "one", then take the mean of every remaining column
print(data.groupby(['one']).mean())
#      two  three
# one
# 1    2.5    3.0
# 4    4.0    6.0
print("==" * 100)

# transform returns a result with the same shape as the input: every row
# receives the mean of its group.
print(data.groupby(["one"]).transform(lambda col: col.mean()))
#    two  three
# 0  2.5    3.0
# 1  2.5    3.0
# 2  4.0    6.0
# 3  4.0    6.0

# Standardise within each group using the population standard deviation
# (ddof=0). Column "three" is constant within each group, so it becomes
# 0/0 = NaN.
print(data.groupby(["one"]).transform(
    lambda col: (col - col.mean()) / col.std(ddof=0)))
#    two  three
# 0 -1.0    NaN
# 1  1.0    NaN
# 2  1.0    NaN
# 3 -1.0    NaN
print("==" * 100)

# Group by "one", then aggregate one chosen column ("two") with apply
print(data.groupby(["one"]).apply(lambda group: group["two"].mean()))
# one
# 1    2.5
# 4    4.0
# dtype: float64

# Random floats truncated to integers. The conversion is done with astype:
# passing dtype="int32" to the constructor together with non-integer float
# data is not reliably honoured across pandas versions.
df = pd.DataFrame(np.random.randn(3, 3), columns=["a", "b", "c"]).astype("int32")
# Before truncation the values look like (random):
#           a         b         c
# 0 -0.029638  1.081563  1.280300
# 1  0.647747  0.831136 -1.549481
# 2  0.513416 -0.884417  0.195343

# apply: run a function on each row or column (a one-dimensional vector).
#   df.apply(lambda x: x.max(), axis=0)
#   axis=0 (default) passes each column; axis=1 passes each row.
# applymap: run a function on every single element of the DataFrame, e.g.
#   df.applymap(lambda x: '%.2f' % x)
print("****************************")

# Drop column "a"; df.drop(["a"], axis=1) is equivalent.
df2 = df.drop("a", axis=1)
print(df2)
# Example output (values are random):
#    b  c
# 0  0  0
# 1  0 -1
# 2  0  0
import numpy as np
import pandas as pd

df3 = pd.DataFrame(
    np.random.randn(3, 3),
    columns=["a", "b", "c"],
    index=["one", "two", "three"],
)
print(df3)
# Example output (values are random):
#               a         b         c
# one    1.022522 -2.295879  0.052196
# two   -1.263835 -1.057730  0.335293
# three  0.517390  1.520645 -0.881983

# reindex reorders the rows to follow the given label order.
df4 = df3.reindex(["three", "two", "one"])
print(df4)
#               a         b         c
# three  0.517390  1.520645 -0.881983
# two   -1.263835 -1.057730  0.335293
# one    1.022522 -2.295879  0.052196

# To pick a single row by label use df4.loc["two"].
# Load the data
import pandas as pd

# Data set columns used below:
#   movieId: id of each movie
#   title:   movie title
#   genres:  movie categories

# Movie data set
movies_data = pd.read_csv('movies.csv')
# Rating data set
ratings_data = pd.read_csv('ratings.csv')

# Inspect the data
print(movies_data)
print(ratings_data)

# Merge the two data sets (joins on the column names they share)
data = pd.merge(movies_data, ratings_data)

# Drop a column that is not needed for the analysis
data.drop(columns='timestamp', inplace=True)

# The 20 movies with the most ratings
print(data.title.value_counts()[:20])

# Best-rated movies: size is the number of ratings per movie, mean is the
# average rating.
movies_ratings = data.groupby('title').agg({'rating': ['size', 'mean']})
print(movies_ratings.head())

# Top 5 by average rating
print(movies_ratings.sort_values([('rating', 'mean')], ascending=False).head(5))

# A 5.0 average from a single rating is not objective, so restrict the
# ranking to movies with at least 150 ratings.
popular = movies_ratings[movies_ratings['rating']['size'] >= 150]
print(popular.sort_values([('rating', 'mean')], ascending=False).head(5))

# Recommend movies of the same genre as a movie chosen by the user
name = input("Please input the movie name(or keywords): ")
name = name.title()  # capitalise the first letter of every word
# regex=False: treat the user's text literally rather than as a pattern.
mov = movies_data[movies_data['title'].str.contains(name, regex=False)]
if mov.empty:
    print("No movie matches: " + name)
else:
    # Take the genres value of the first matching movie directly instead of
    # parsing the printed representation of the Series.
    res = mov['genres'].iloc[0]
    movies_lists = movies_data[movies_data['genres'] == res]
    print(movies_lists)
import matplotlib.pyplot as plt

# Smallest possible line plot: one segment from (1, 2) to (2, 1).
xs = [1, 2]
ys = [2, 1]
plt.plot(xs, ys)
plt.show()
__version__
进行输出u"标题内容",fontproperties="SimHei"
import matplotlib as pl
import matplotlib.pyplot as plt
import numpy as np

# 1. Print the Matplotlib version
print(pl.__version__)  # 2.2.2

# 2. Draw the two lines y = x + 5 and y = 2x + 5
x = np.linspace(1, 10, 50)
y1 = x + 5
y2 = 2 * x + 5
for y_values in (y1, y2):
    plt.plot(x, y_values)

# fontproperties selects a font that can render the Chinese characters.
plt.title(u"This is y=X 函数", fontproperties="SimHei")
plt.savefig("sen.jpg")

# Display the figure
plt.show()
# Task: create subplots and draw the straight lines for +-x against +-x
# (all four panels share the same x values).
import matplotlib.pyplot as plt
import numpy as np

# --- Using a Figure object ---
fig = plt.figure()
ax1 = fig.add_subplot(2, 2, 1)  # 2 rows, 2 columns, first panel
ax2 = fig.add_subplot(2, 2, 2)
ax3 = fig.add_subplot(2, 2, 3)
ax4 = fig.add_subplot(2, 2, 4)

# Draw on each subplot
x = np.arange(1, 100)
ax1.plot(x, x)
ax2.plot(x, -x)
ax3.plot(-x, x)
ax4.plot(-x, -x)
plt.show()

# --- Using the pyplot state machine ---
x = np.arange(1, 100)
plt.subplot(221)
plt.plot(x, x)
plt.subplot(222)
plt.plot(x, -x)
plt.show()

# ===================================================
# Two figures: a 2x2 grid and a single full-size panel.
fig = plt.figure(figsize=(10, 20))
# First panel is 221: position 212 would cover the whole bottom half and
# overlap panels 223 and 224.
ax1 = fig.add_subplot(221)
ax2 = fig.add_subplot(222)
ax3 = fig.add_subplot(223)
ax4 = fig.add_subplot(224)
ax1.plot([1, 2], [2, 1])
ax2.plot([1, 2], [2, 1])
ax3.plot([1, 2], [2, 1])
ax4.plot([1, 2], [2, 1])

fig2 = plt.figure(figsize=(10, 20))
ax2 = fig2.add_subplot(111)
ax2.plot([1, 2], [2, 1])
plt.show()
import matplotlib.pyplot as plt

# Three equivalent ways to fill a 2x2 grid of subplots. Each variant starts
# its own figure so the variants do not draw on top of each other.

# Variant 1: Figure.add_subplot
fig = plt.figure()
ax1 = fig.add_subplot(2, 2, 1)
ax2 = fig.add_subplot(2, 2, 2)
ax3 = fig.add_subplot(2, 2, 3)
ax4 = fig.add_subplot(2, 2, 4)
ax1.plot([1, 2], [2, 1])
ax2.plot([1, -2], [-2, 1])
ax3.plot([-1, -2], [-2, -1])
ax4.plot([1, -2], [2, -1])

# Variant 2: plt.subplot returns the Axes, which are then drawn on directly
plt.figure()
ax1 = plt.subplot(221)
ax2 = plt.subplot(222)
ax3 = plt.subplot(223)
ax4 = plt.subplot(224)
ax1.plot([1, 2], [2, 1])
ax2.plot([1, -2], [-2, 1])
ax3.plot([-1, -2], [-2, -1])
ax4.plot([1, -2], [2, -1])

# Variant 3: make each Axes current with plt.sca, then use plt.plot
plt.figure()
ax1 = plt.subplot(221)
plt.sca(ax1)
plt.plot([1, 2], [2, 1])
ax2 = plt.subplot(222)
plt.sca(ax2)
plt.plot([1, -2], [-2, 1])
ax3 = plt.subplot(223)
plt.sca(ax3)
plt.plot([-1, -2], [-2, -1])
ax4 = plt.subplot(224)
plt.sca(ax4)
plt.plot([1, -2], [2, -1])

plt.show()
# Task: create subplots through a Figure and draw a histogram, a scatter
# plot, a line plot, a pie chart, a box plot and a violin plot.
import matplotlib.pyplot as plt
import numpy as np

# A 2x3 grid gives every chart its own panel.
fig = plt.figure()
ax1 = fig.add_subplot(2, 3, 1)
ax2 = fig.add_subplot(2, 3, 2)
ax3 = fig.add_subplot(2, 3, 3)
ax4 = fig.add_subplot(2, 3, 4)
ax5 = fig.add_subplot(2, 3, 5)
ax6 = fig.add_subplot(2, 3, 6)

# Histogram: hist
ax1.hist(np.random.randn(100), bins=10, color='b', alpha=0.3)

# Scatter plot: scatter
x = np.arange(1, 2 * np.pi, 0.1)
y = np.cos(x)
ax2.scatter(x, y)

# Line plot: the quadratic y = x**2 on [-10, 10].
# Use 10 points instead of 100 to make the individual segments visible.
a = np.linspace(-10, 10, 100)  # 100 evenly spaced points
b = a ** 2
ax3.plot(a, b)

# Pie chart: shows each item's share of the total.
#   explode pushes a wedge away from the centre (here wedge B)
#   shadow  draws a shadow under the pie
#   autopct='%.1f%%' could be added to print each wedge's percentage
fracs = [0.2, 0.4, 0.2, 0.2]
labels = ["A", "B", "C", "D"]
ax4.pie(fracs, explode=[0, 0.4, 0, 0], labels=labels, shadow=True)

# Box plot
data = np.random.normal(size=1000, loc=0, scale=1)
#   sym='o' sets the marker used for outlier points
#   whis controls the whisker length as a multiple of the interquartile
#   range (default 1.5); try 0.5 and 100 to see the effect
ax5.boxplot(data, sym='o', whis=1.5)

# Violin plot of the same sample
ax6.violinplot(data)

plt.show()
参考代码:https://matplotlib.org/gallery/statistics/barchart_demo.html
import matplotlib.pyplot as plt
import numpy as np

# Scatter plot of y = x**2 with a customised grid.
fig, ax1 = plt.subplots()
x = np.linspace(1, 10, 50)
ax1.scatter(x, x ** 2)

# Grid styling: red dashed lines drawn with a width of 10.
grid_style = {"color": "r", "linestyle": "--", "linewidth": 10}
ax1.grid(**grid_style)
plt.show()
import matplotlib.pyplot as plt
import numpy as np

# Three power curves with a legend.
x = np.arange(1, 11)
curves = (("Normal", 2), ("Fast", 3), ("Faster", 4))
for curve_label, power in curves:
    plt.plot(x, x ** power, label=curve_label)

# Option 1: reuse the label= given to each plot call.
#   plt.legend(loc=2, ncol=3)
#   loc: 1 upper right, 2 upper left, 3 lower left, 4 lower right, 0 best
#   ncol: number of columns the entries are laid out in (3 gives a flat row)
# Option 2: pass the legend texts explicitly.
plt.legend([curve_label for curve_label, _ in curves])
plt.show()
# ax.plot(x, y, 'r--') is equivalent to
# ax.plot(x, y, linestyle='--', color='r').
# Example:
import matplotlib.pyplot as plt
import numpy as np

fig, axes = plt.subplots(2)
axes[0].plot(np.random.randint(0, 100, 50), 'ro--')
# Equivalent, spelled out with keyword arguments
axes[1].plot(np.random.randint(0, 100, 50), color='r', linestyle='dashed', marker='o')
plt.show()

# Commonly used markers, colors and line styles:
#
# marker
#   .   point
#   ,   pixel
#   o   circle
#   v   triangle pointing down
#   ^   triangle pointing up
#   <   triangle pointing left
#
# color
#   b: blue   g: green   r: red     c: cyan
#   m: magenta  y: yellow  k: black   w: white
#
# linestyle
#   -  or solid     solid line
#   -- or dashed    dashed line
#   -. or dashdot   dash-dotted line
#   :  or dotted    dotted line
#   'None'          draw nothing
#   ' ' or ''       draw nothing
方式一 :
import matplotlib.pyplot as plt
import numpy as np

# Object-oriented style: every call goes through an explicit Axes object.
X = np.linspace(-10, 10, 100)
y1 = np.sin(X)
y2 = np.sign(X)
y3 = np.tanh(X)

fig = plt.figure(figsize=(20, 20))

# Top-left: sin(x), zoomed to x in [2, 8]
ax1 = fig.add_subplot(221)
ax1.plot(X, y1, "r--", linewidth=2)
ax1.set_title("Y=sin(x)")
ax1.set_xlim(2, 8)
ax1.grid(color="b")
ax1.legend(loc=0, labels=["Y=sin(x)"])

# Top-right: sign(x). The label is a plain string; a list would be shown
# in the legend as "['Y=sign(x)']".
ax2 = fig.add_subplot(222)
ax2.plot(X, y2, "r-.", linewidth=5, label="Y=sign(x)")
ax2.set_title("Y=SIGN(X)")
ax2.set_xlim(-5, 5)
ax2.set_ylim(-2, 2)
ax2.legend(loc=0)

# Bottom row (spans both columns): tanh(x)
ax3 = fig.add_subplot(212)
ax3.plot(X, y3)
ax3.set_xlabel("X")
ax3.set_ylabel("Tanh(X)")
ax3.grid()
ax3.legend(loc=1, labels=["Y=tanh(X)"])

plt.show()
方式二 :
import matplotlib.pyplot as plt
import numpy as np

# pyplot style: every call acts on the current subplot.
X = np.linspace(-10, 10, 100)
y1 = np.sin(X)
y2 = np.sign(X)
y3 = np.tanh(X)

plt.figure(figsize=(15, 15))

# Top-left: sin(x), zoomed to x in [2, 8]
plt.subplot(221)
plt.plot(X, y1, "r--", linewidth=2)
plt.title("Y=sin(x)")
plt.xlim(2, 8)
plt.grid(color="b")
plt.legend(loc=0, labels=["Y=sin(x)"])

# Top-right: sign(x). The label is a plain string; a list would be shown
# in the legend as "['Y=sign(x)']".
plt.subplot(222)
plt.plot(X, y2, "r-.", linewidth=5, label="Y=sign(x)")
plt.title("Y=SIGN(X)")
plt.xlim(-5, 5)
plt.ylim(-2, 2)
plt.legend(loc=0)

# Bottom row (spans both columns): tanh(x)
plt.subplot(212)
plt.plot(X, y3)
plt.xlabel("X")
plt.ylabel("Tanh(X)")
plt.grid()
plt.legend(loc=1, labels=["Y=tanh(X)"])

plt.show()
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

sns.set(style="darkgrid")
tips = sns.load_dataset("tips")

# 1. Basic relational plots: relationship between total_bill and tip.
#    Simpler variants:
# sns.relplot(x="total_bill", y="tip", data=tips)
# sns.relplot(x="total_bill", y="tip", hue="smoker", data=tips)
sns.relplot(x="total_bill", y="tip", hue="smoker", col="time", data=tips)

# 2. Time-series data:
# df = pd.DataFrame(dict(time=pd.date_range("2017-1-1", periods=500),
#                        value=np.random.randn(500).cumsum()))
# print(df.head())
# g = sns.relplot(x="time", y="value", kind="line", data=df)
# g.fig.autofmt_xdate()

# 3. Categorical variables:
# sns.catplot(x="day", y="total_bill", data=tips)
# sns.catplot(x="day", y="total_bill", hue="sex", kind="swarm", data=tips)
# sns.catplot(x="smoker", y="tip", order=["No", "Yes"], data=tips)
# sns.catplot(x="day", y="total_bill", kind="box", data=tips)
# sns.boxplot(x="day", y="total_bill", hue="smoker", data=tips)

plt.show()
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="darkgrid")
titanic = sns.load_dataset("titanic")

# Other categorical plots on the same data:
# sns.catplot(x="sex", y="survived", hue="class", kind="bar", data=titanic)
# sns.catplot(x="deck", kind="count", palette="ch:.25", data=titanic)

# Count plot: one bar per deck on the y axis, split by passenger class.
sns.catplot(
    data=titanic,
    y="deck",
    hue="class",
    kind="count",
    palette="pastel",
    edgecolor=".6",
)
plt.show()
import matplotlib.pyplot as plt
import numpy as np
from scipy import signal

# The sample image was moved from scipy.misc to scipy.datasets and the old
# location has been removed from SciPy; fall back to it only on older
# releases. (scipy.datasets needs the optional `pooch` package to fetch the
# image.)
try:
    from scipy.datasets import ascent
except ImportError:
    from scipy.misc import ascent

image = ascent()  # two-dimensional grayscale sample image

# Filter kernel: all zeros except two unit impulses.
w = np.zeros((50, 50))
w[0][0] = 1.0  # change these entries to adjust the filter
w[49][25] = 1.0  # can be tuned as needed

image_new = signal.fftconvolve(image, w)  # convolution computed with the FFT

plt.figure()
plt.imshow(image_new)  # show the filtered image
plt.gray()
plt.title("Filtered image!")
plt.show()
import numpy as np
from sklearn import linear_model

# Training data: two identical features per sample, target equal to them.
x = [[0, 0], [1, 1], [2, 2]]
y = [0, 1, 2]

# Fit the linear model y = w1*x1 + w2*x2 + w0 (fit returns the estimator).
reg = linear_model.LinearRegression().fit(x, y)

print(reg.coef_)  # [0.5 0.5] -> w1 = 0.5, w2 = 0.5
print(reg.intercept_)  # w0 = 0

# The intercept is only approximately zero in floating point.
print(np.allclose(0, reg.intercept_))

samples = [[0, 0], [1, 1]]
print(reg.predict(samples))
Scipy的API官网 : https://docs.scipy.org
利用svds和eigs等进行矩阵分解,从而实现矩阵分解方式的推荐系统
import numpy as np
from scipy.sparse import csc_matrix
from scipy.sparse import linalg
# Alternative import: from scipy.sparse.linalg import svds, eigs

A = csc_matrix([[1, 0, 0], [5, 0, 2], [0, -1, 0], [0, 0, 3]], dtype=float)

# Truncated SVD: the k=2 largest singular values and their vectors.
u, s, vt = linalg.svds(A, k=2)
print(u)
# [[-1.73323831e-01  1.56782328e-01]
#  [-2.27856346e-01  9.54078802e-01]
#  [-7.09160926e-19  2.32081766e-19]
#  [ 9.58144214e-01  2.55250744e-01]]
print(s)
# [2.75193379 5.6059665 ]
print(vt)
# [[-4.76975707e-01  1.95156391e-18  8.78916478e-01]
#  [ 8.78916478e-01 -1.30104261e-18  4.76975707e-01]]

# Cross-check: eigs(...)[0] holds the eigenvalues (not the eigenvectors) of
# A * A^T, and the square roots of those eigenvalues are the singular values
# of A.
singular_from_eigs = np.sqrt(linalg.eigs(A.dot(A.T), k=2)[0]).real
print(singular_from_eigs)
# array([ 5.6059665 ,  2.75193379])
基于协同过滤系统的Python实现
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import pairwise_distances
# sklearn.cross_validation has been removed; train_test_split lives in
# sklearn.model_selection.
from sklearn.model_selection import train_test_split

# Load the ratings file: tab-separated, no header row.
header = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('ml-100k/u.data', sep='\t', names=header)

n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items))
# Number of users = 943 | Number of movies = 1682

train_data, test_data = train_test_split(df, test_size=0.25)

# Create two user-item matrices, one for training and another for testing.
# itertuples() yields (index, user_id, item_id, rating, timestamp); the ids
# are shifted by one to become zero-based row/column positions.
# NOTE(review): this assumes ids are contiguous and start at 1 -- confirm
# for other data sets.
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1] - 1, line[2] - 1] = line[3]

test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1] - 1, line[2] - 1] = line[3]

# NOTE(review): pairwise_distances with metric='cosine' returns cosine
# *distance* (1 - cosine similarity), although the variables are named
# "similarity". Confirm which one the downstream prediction step expects.
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。