Instantiate the DictVectorizer class
Call fit_transform to feed in the data and convert it; note the return format
DictVectorizer(sparse=True, …)
DictVectorizer.fit_transform(X)
    X: a dict, or an iterable of dicts
    Returns: a sparse matrix
DictVectorizer.inverse_transform(X)
    X: an array or a sparse matrix
    Returns: the data in its pre-transform format
DictVectorizer.get_feature_names_out()
    Returns: the category (feature) names
DictVectorizer.transform(X)
    Transforms new data using the layout learned during fitting
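That last method is easy to gloss over: transform reuses the column layout learned by an earlier fit_transform instead of learning a new one. A minimal sketch with made-up samples:

from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)
train = [{'city': '北京', 'temperature': 100}, {'city': '上海', 'temperature': 60}]
dv.fit_transform(train)            # learns the column layout from the training dicts
new = dv.transform([{'city': '上海', 'temperature': 80}])
print(dv.get_feature_names_out())  # ['city=上海' 'city=北京' 'temperature']
print(new)                         # [[ 1.  0. 80.]]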
from sklearn.feature_extraction.text import CountVectorizer

# Instantiate CountVectorizer
vector = CountVectorizer()
# Call fit_transform to feed in and transform the data
res = vector.fit_transform(["life is short,i like python", "life is too long,i dislike python"])
print(vector.get_feature_names_out())
print(res.toarray())
from sklearn.feature_extraction import DictVectorizer

def dictvec():
    """
    :return: None
    """
    # Instantiate (sparse=True by default)
    dv = DictVectorizer()
    # Call fit_transform
    data = dv.fit_transform([{'city': '北京', 'temperature': 100},
                             {'city': '上海', 'temperature': 60},
                             {'city': '深圳', 'temperature': 30}])
    print(data)  # prints a scipy sparse matrix
    return None

if __name__ == "__main__":
    dictvec()
from sklearn.feature_extraction import DictVectorizer

def dictvec():
    """
    :return: None
    """
    # Instantiate with sparse=False to get a plain ndarray back
    dv = DictVectorizer(sparse=False)
    # Call fit_transform
    data = dv.fit_transform([{'city': '北京', 'temperature': 100},
                             {'city': '上海', 'temperature': 60},
                             {'city': '深圳', 'temperature': 30}])
    print(data)  # one-hot encoded cities plus the numeric temperature column
    return None

if __name__ == "__main__":
    dictvec()
from sklearn.feature_extraction import DictVectorizer

def dictvec():
    """
    :return: None
    """
    # Instantiate with sparse=False
    dv = DictVectorizer(sparse=False)
    # Call fit_transform
    data = dv.fit_transform([{'city': '北京', 'temperature': 100},
                             {'city': '上海', 'temperature': 60},
                             {'city': '深圳', 'temperature': 30}])
    print(dv.get_feature_names_out())  # one column per city category plus 'temperature'
    print(data)
    return None

if __name__ == "__main__":
    dictvec()
Purpose: convert text data into numeric features
Class: sklearn.feature_extraction.text.CountVectorizer
CountVectorizer(max_df=1.0, min_df=1, …)
    Returns a word-frequency (document-term) matrix
CountVectorizer.fit_transform(X)
    X: text, or an iterable of text strings
    Returns: a sparse matrix
CountVectorizer.inverse_transform(X)
    X: an array or a sparse matrix
    Returns: the data in its pre-transform format
CountVectorizer.get_feature_names_out()
    Returns: the list of words
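The max_df/min_df parameters listed above prune the vocabulary by document frequency: integers are absolute document counts, floats are fractions of the corpus. A small sketch with an illustrative corpus:

from sklearn.feature_extraction.text import CountVectorizer

corpus = ["python is great", "python is fast", "go is fast"]
# min_df=2: keep only words that appear in at least two documents
cv = CountVectorizer(min_df=2)
cv.fit(corpus)
print(cv.get_feature_names_out())  # ['fast' 'is' 'python'] -- 'go' and 'great' are dropped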
from sklearn.feature_extraction.text import CountVectorizer

def countvec():
    """
    Convert text into numeric features
    :return: None
    """
    cv = CountVectorizer()
    data = cv.fit_transform(["life is short,i like python",
                             "life is too long,i dislike python"])
    print(cv.get_feature_names_out())
    print(data.toarray())

if __name__ == "__main__":
    countvec()
from sklearn.feature_extraction.text import CountVectorizer

def countvec():
    """
    Convert text into numeric features
    :return: None
    """
    cv = CountVectorizer()
    # Chinese has no spaces between words, so the text must be pre-segmented
    data = cv.fit_transform(["人生 苦短,我 喜欢 python",
                             "人生 漫长,不用 python"])
    print(cv.get_feature_names_out())
    print(data.toarray())

if __name__ == "__main__":
    countvec()
Import jieba for word segmentation:
from sklearn.feature_extraction.text import CountVectorizer
import jieba

def cutword():
    con1 = jieba.cut("今天很残酷,明天更残酷,后天很美好,但绝对大部分是死在明天晚上,所以每个人不要放弃今天。")
    con2 = jieba.cut("我们看到的从很远星系来的光是在几百万年之前发出的,这样当我们看到宇宙时,我们是在看它的过去。")
    con3 = jieba.cut("如果只用一种方式了解某样事物,你就不会真正了解它。了解事物真正含义的秘密取决于如何将其与我们所了解的事物相联系。")
    # Convert the generators to lists
    conten1 = list(con1)
    conten2 = list(con2)
    conten3 = list(con3)
    # Join each list into a space-separated string
    c1 = ' '.join(conten1)
    c2 = ' '.join(conten2)
    c3 = ' '.join(conten3)
    return c1, c2, c3

def hangzivec():
    """
    Convert Chinese text into numeric features
    :return: None
    """
    c1, c2, c3 = cutword()
    print(c1, c2, c3)
    cv = CountVectorizer()
    data = cv.fit_transform([c1, c2, c3])
    print(cv.get_feature_names_out())
    print(data.toarray())
    return None

if __name__ == "__main__":
    hangzivec()
TF (term frequency): how often a word appears in a document
IDF (inverse document frequency):
    log(total number of documents / number of documents containing the word)
    log(x): the smaller the input, the smaller the result
TF * IDF measures how important a word is to a document in the corpus
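To make the formula concrete, here is the naive IDF computed by hand (for reference: sklearn's TfidfVectorizer actually uses a smoothed variant, ln((1 + n) / (1 + df)) + 1, so its numbers will differ slightly):

import math

docs = [["life", "is", "short"], ["life", "is", "long"]]
n = len(docs)

def idf(term):
    # log(total number of documents / number of documents containing the word)
    df = sum(term in doc for doc in docs)
    return math.log(n / df)

print(idf("life"))   # appears in both docs: log(2/2) = 0.0, carries no signal
print(idf("short"))  # appears in one doc:  log(2/1) ≈ 0.693, more distinctive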
from sklearn.feature_extraction.text import TfidfVectorizer
import jieba

def cutword():
    con1 = jieba.cut("今天很残酷,明天更残酷,后天很美好,但绝对大部分是死在明天晚上,所以每个人不要放弃今天。")
    con2 = jieba.cut("我们看到的从很远星系来的光是在几百万年之前发出的,这样当我们看到宇宙时,我们是在看它的过去。")
    con3 = jieba.cut("如果只用一种方式了解某样事物,你就不会真正了解它。了解事物真正含义的秘密取决于如何将其与我们所了解的事物相联系。")
    # Convert the generators to lists
    conten1 = list(con1)
    conten2 = list(con2)
    conten3 = list(con3)
    # Join each list into a space-separated string
    c1 = ' '.join(conten1)
    c2 = ' '.join(conten2)
    c3 = ' '.join(conten3)
    return c1, c2, c3

def tfidfivec():
    """
    Convert Chinese text into TF-IDF features
    :return: None
    """
    c1, c2, c3 = cutword()
    print(c1, c2, c3)
    tf = TfidfVectorizer()
    data = tf.fit_transform([c1, c2, c3])
    print(tf.get_feature_names_out())
    print(data.toarray())
    return None

if __name__ == "__main__":
    tfidfivec()
Feature preprocessing API: sklearn.preprocessing
sklearn normalization API: sklearn.preprocessing.MinMaxScaler
Use normalization when all features are equally important.
Goal: prevent any single feature from having an outsized influence on the final result.
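Min-max scaling maps each feature column to [0, 1] via x' = (x - min) / (max - min); with feature_range=(mi, mx) the result is further rescaled to x'' = x' * (mx - mi) + mi. As a hand check on the first column of the data used below, [90, 60, 75]:

col = [90, 60, 75]
mn, mx = min(col), max(col)
print([(x - mn) / (mx - mn) for x in col])  # [1.0, 0.0, 0.5] -- matches MinMaxScaler's first output column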
from sklearn.preprocessing import MinMaxScaler

def mm():
    """
    Min-max normalization
    :return: None
    """
    mm = MinMaxScaler()
    data = mm.fit_transform([[90, 2, 10, 40], [60, 4, 15, 45], [75, 3, 13, 46]])
    print(data)
    return None

if __name__ == "__main__":
    mm()
from sklearn.preprocessing import MinMaxScaler

def mm():
    """
    Min-max normalization, rescaled to the range (2, 3)
    :return: None
    """
    mm = MinMaxScaler(feature_range=(2, 3))  # map each feature to [2, 3] instead of the default [0, 1]
    data = mm.fit_transform([[90, 2, 10, 40], [60, 4, 15, 45], [75, 3, 13, 46]])
    print(data)
    return None

if __name__ == "__main__":
    mm()
Normalization summary:
Note that in a given scenario the maximum and minimum can change over time, and both are easily distorted by outliers, so this method is not robust; it only suits traditional small-data settings with precise values.
Standardization characteristics: transforms the original data so that each feature has mean 0 and variance 1, i.e. x' = (x - mean) / std.
from sklearn.preprocessing import StandardScaler

def stand():
    """
    Standard scaling
    :return: None
    """
    std = StandardScaler()
    data = std.fit_transform([[1., -1., 3.], [2., 4., 2.], [4., 6., -1.]])
    print(data)
    return None

if __name__ == "__main__":
    stand()
With enough samples the mean and standard deviation are fairly stable, which makes standardization well suited to modern, noisy, large-scale data.
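A quick numeric check of that claim, reusing the data from the example above:

import numpy as np
from sklearn.preprocessing import StandardScaler

data = StandardScaler().fit_transform([[1., -1., 3.], [2., 4., 2.], [4., 6., -1.]])
print(np.round(data.mean(axis=0), 10))  # effectively zero for every column
print(data.std(axis=0))                 # [1. 1. 1.] -- unit variance per column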
sklearn missing-value API: sklearn.preprocessing.Imputer
Imputer(missing_values='NaN', strategy='mean', axis=0)
    Fills in missing values
Imputer.fit_transform(X)
    X: data as a numpy array of shape [n_samples, n_features]
    Returns: the transformed array with the same shape
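Note that the Imputer documented above belongs to older scikit-learn releases; it was removed in version 0.22 in favor of sklearn.impute.SimpleImputer, which encodes missing values as np.nan and drops the axis parameter. A minimal sketch with made-up data:

import numpy as np
from sklearn.impute import SimpleImputer

imp = SimpleImputer(missing_values=np.nan, strategy='mean')
data = imp.fit_transform([[1, 2], [np.nan, 3], [7, 6]])
print(data)  # the NaN is replaced by its column mean: [[1. 2.] [4. 3.] [7. 6.]]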