机器学习(二)特征工程
Python 机器学习2018年3天快速入门python机器学习【黑马程序员】(二)特征工程1.字典特征抽取from sklearn.feature_extraction import DictVectorizerdef dict_demo():'''字典特征抽取:return:'''data = [{'city': '北京', '...
·
Python 机器学习
(二)特征工程
1.字典特征抽取
from sklearn.feature_extraction import DictVectorizer
def dict_demo():
    """
    Dictionary feature extraction with DictVectorizer.

    One-hot encodes the categorical 'city' field (one column per distinct
    value) and passes the numeric 'temperature' field through unchanged.
    :return: None (results are printed)
    """
    data = [{'city': '北京', 'temperature': 100},
            {'city': '上海', 'temperature': 60},
            {'city': '深圳', 'temperature': 30}]
    # 1. Instantiate a transformer (returns a sparse matrix by default)
    transfer = DictVectorizer()
    # 2. Fit and transform in one call
    data_new = transfer.fit_transform(data)
    print('data_new:\n', data_new)
    # get_feature_names() was removed in scikit-learn 1.2;
    # get_feature_names_out() is the supported replacement.
    print('特征名字: \n', transfer.get_feature_names_out())


if __name__ == '__main__':
    dict_demo()
data_new:
(0, 1) 1.0
(0, 3) 100.0
(1, 0) 1.0
(1, 3) 60.0
(2, 2) 1.0
(2, 3) 30.0
特征名字:
['city=上海', 'city=北京', 'city=深圳', 'temperature']
Process finished with exit code 0
这里输出的类型是sparse矩阵类型
- sparse矩阵:稀疏矩阵,指出不为零的数的坐标
- 可以在参数中增加sparse=False来得到更清晰的结果(通常不这样做)
下面展示一下非稀疏矩阵的输出效果
transfer = DictVectorizer(sparse=False) # sparse默认为True,返回一个稀疏矩阵,仅指出有值的位置(坐标)
data_new:
[[ 0. 1. 0. 100.]
[ 1. 0. 0. 60.]
[ 0. 0. 1. 30.]]
特征名字:
['city=上海', 'city=北京', 'city=深圳', 'temperature']
2.文本特征抽取
通用流程:
- 1.实例化一个转换器类
- 2.调用 fit_transform
2.1 CountVectorizer
2.1.1 英文文本.通过空格区分单词
from sklearn.feature_extraction.text import CountVectorizer
def count_demo():
    """
    Text feature extraction with CountVectorizer (bag of words).

    English text is tokenized on whitespace/punctuation automatically;
    single-character tokens like 'i' are dropped by the default tokenizer.
    :return: None (results are printed)
    """
    data = ["life is short,i like like python", "life is too long,i dislike python"]
    # 1. Instantiate a transformer
    transfer = CountVectorizer()
    # 2. Fit + transform; the result is sparse, toarray() densifies it for display
    data_new = transfer.fit_transform(data)
    print('data_new:\n', data_new.toarray())
    # get_feature_names() was removed in scikit-learn 1.2
    print('特征名字:\n', transfer.get_feature_names_out())


if __name__ == '__main__':
    count_demo()
data_new:
[[0 1 1 2 0 1 1 0]
[1 1 1 0 1 1 0 1]]
特征名字:
['dislike', 'is', 'life', 'like', 'long', 'python', 'short', 'too']
Process finished with exit code 0
2.1.2 中文文本判断
from sklearn.feature_extraction.text import CountVectorizer
def count_chinese_demo():
    """
    Chinese text feature extraction with CountVectorizer.

    CountVectorizer splits on whitespace, so the Chinese input must be
    pre-segmented by hand (spaces inserted between words).
    :return: None (results are printed)
    """
    data = ["我爱 北京 天安门", '天安门 上 太阳 升']
    # 1. Instantiate a transformer
    transfer = CountVectorizer()
    # 2. Fit + transform
    data_new = transfer.fit_transform(data)
    print('data_new:\n', data_new.toarray())
    # get_feature_names() was removed in scikit-learn 1.2
    print('特征名字:\n', transfer.get_feature_names_out())


if __name__ == '__main__':
    count_chinese_demo()
data_new:
[[1 1 0 1]
[0 1 1 0]]
特征名字:
['北京', '天安门', '太阳', '我爱']
Process finished with exit code 0
但是这是我们手动加空格才可以使得它区分的,那么有没有什么办法可以自动给中文分词呢,下面就使用jieba库(需要安装)
2.1.3 中文文本自动分词
from sklearn.feature_extraction.text import CountVectorizer
import jieba
def cut_word(text):
    """
    Segment Chinese text with jieba, e.g. '我爱北京天安门' -> '我 爱 北京 天安门'.

    :param text: raw (unsegmented) Chinese string
    :return: the same text with tokens separated by single spaces
    """
    # jieba.cut returns a generator; str.join consumes it directly,
    # so the intermediate list() of the original is unnecessary.
    return ' '.join(jieba.cut(text))
def count_chinese_demo2():
    """
    Chinese text feature extraction with automatic word segmentation.

    Each sentence is segmented via cut_word (jieba) before being fed to
    CountVectorizer, so no manual spacing is required.
    :return: None (results are printed)
    """
    data = ["一种还是一种今天很残酷,明天更残酷,后天很美好,但绝对大部分是死在明天晚上,所以每个人不要放弃今天。",
            "我们看到的从很远星系来的光是在几百万年之前发出的,这样当我们看到宇宙时,我们是在看它的过去。",
            "如果只用一种方式了解某样事物,你就不会真正了解它。了解事物真正含义的秘密取决于如何将其与我们所了解的事物相联系。"]
    # 1. Segment every sentence
    data_new = [cut_word(sent) for sent in data]
    # 2. Instantiate a transformer.
    #    stop_words lists tokens that must NOT become features.
    transfer = CountVectorizer(stop_words=['一种', '所以'])
    # 3. Fit + transform
    data_final = transfer.fit_transform(data_new)
    print('data_new:\n', data_final.toarray())
    # get_feature_names() was removed in scikit-learn 1.2
    print('特征名字:\n', transfer.get_feature_names_out())


if __name__ == '__main__':
    count_chinese_demo2()
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\14360\AppData\Local\Temp\jieba.cache
Loading model cost 0.668 seconds.
Prefix dict has been built successfully.
data_new:
[[0 1 0 0 0 2 0 0 0 0 0 1 0 1 0 0 0 0 1 0 2 0 1 0 2 1 0 0 0 1 1 0 0 1 0]
[0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 1 3 0 0 0 1 0 0 0 0 2 0 0 0 0 0 1 0 1]
[1 0 0 4 3 0 0 0 0 1 1 0 1 0 1 1 0 1 0 1 0 0 0 1 0 0 0 2 1 0 0 1 0 0 0]]
特征名字:
['不会', '不要', '之前', '了解', '事物', '今天', '光是在', '几百万年', '发出', '取决于', '只用', '后天', '含义', '大部分', '如何', '如果', '宇宙', '我们', '放弃', '方式', '明天', '星系', '晚上', '某样', '残酷', '每个', '看到', '真正', '秘密', '绝对', '美好', '联系', '过去', '还是', '这样']
Process finished with exit code 0
2.2 TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import jieba
def cut_word(text):
    """
    Segment Chinese text with jieba, e.g. '我爱北京天安门' -> '我 爱 北京 天安门'.

    :param text: raw (unsegmented) Chinese string
    :return: the same text with tokens separated by single spaces
    """
    # jieba.cut returns a generator; str.join consumes it directly,
    # so the intermediate list() of the original is unnecessary.
    return ' '.join(jieba.cut(text))
def tfidf_demo():
    """
    Text feature extraction with TF-IDF weighting.

    Same pipeline as the CountVectorizer demo, but TfidfVectorizer weights
    each term by how distinctive it is across the corpus instead of raw counts.
    :return: None (results are printed)
    """
    data = ["一种还是一种今天很残酷,明天更残酷,后天很美好,但绝对大部分是死在明天晚上,所以每个人不要放弃今天。",
            "我们看到的从很远星系来的光是在几百万年之前发出的,这样当我们看到宇宙时,我们是在看它的过去。",
            "如果只用一种方式了解某样事物,你就不会真正了解它。了解事物真正含义的秘密取决于如何将其与我们所了解的事物相联系。"]
    # 1. Segment every sentence
    data_new = [cut_word(sent) for sent in data]
    # 2. Instantiate a transformer; stop_words lists tokens excluded from features
    transfer = TfidfVectorizer(stop_words=['一种', '所以'])
    # 3. Fit + transform
    data_final = transfer.fit_transform(data_new)
    print('data_new:\n', data_final.toarray())
    # get_feature_names() was removed in scikit-learn 1.2
    print('特征名字:\n', transfer.get_feature_names_out())


if __name__ == '__main__':
    tfidf_demo()
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\14360\AppData\Local\Temp\jieba.cache
Loading model cost 0.721 seconds.
Prefix dict has been built successfully.
data_new:
[[0. 0.21821789 0. 0. 0. 0.43643578
0. 0. 0. 0. 0. 0.21821789
0. 0.21821789 0. 0. 0. 0.
0.21821789 0. 0.43643578 0. 0.21821789 0.
0.43643578 0.21821789 0. 0. 0. 0.21821789
0.21821789 0. 0. 0.21821789 0. ]
[0. 0. 0.2410822 0. 0. 0.
0.2410822 0.2410822 0.2410822 0. 0. 0.
0. 0. 0. 0. 0.2410822 0.55004769
0. 0. 0. 0.2410822 0. 0.
0. 0. 0.48216441 0. 0. 0.
0. 0. 0.2410822 0. 0.2410822 ]
[0.15895379 0. 0. 0.63581516 0.47686137 0.
0. 0. 0. 0.15895379 0.15895379 0.
0.15895379 0. 0.15895379 0.15895379 0. 0.12088845
0. 0.15895379 0. 0. 0. 0.15895379
0. 0. 0. 0.31790758 0.15895379 0.
0. 0.15895379 0. 0. 0. ]]
特征名字:
['不会', '不要', '之前', '了解', '事物', '今天', '光是在', '几百万年', '发出', '取决于', '只用', '后天', '含义', '大部分', '如何', '如果', '宇宙', '我们', '放弃', '方式', '明天', '星系', '晚上', '某样', '残酷', '每个', '看到', '真正', '秘密', '绝对', '美好', '联系', '过去', '还是', '这样']
Process finished with exit code 0
3.特征预处理
3.1 归一化
代码:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
def minmax_demo():
    """
    Min-max normalization demo.

    Sensitive to outliers: a corrupted column max or min skews every
    rescaled value, which is why standardization is usually preferred.
    :return: None (results are printed)
    """
    # Keep only the first three columns; the last column is not a feature
    data = pd.read_csv('dating.txt').iloc[:, :3]
    # feature_range sets the target interval (default is (0, 1)).
    # Per-column formula: x' = (x - min) / (max - min), mapped into the range.
    transfer = MinMaxScaler(feature_range=[2, 3])
    print('data_new:\n', transfer.fit_transform(data))
    return None


if __name__ == '__main__':
    minmax_demo()
data_new:
[[2.44832535 2.39805139 2.56233353]
[2.15873259 2.34195467 2.98724416]
[2.28542943 2.06892523 2.47449629]
...
[2.29115949 2.50910294 2.51079493]
[2.52711097 2.43665451 2.4290048 ]
[2.47940793 2.3768091 2.78571804]]
Process finished with exit code 0
3.2 标准化
from sklearn.preprocessing import StandardScaler
import pandas as pd
def stand_demo():
    """
    Standardization demo: per column, x' = (x - mean) / std.

    More robust than min-max scaling — a few outliers barely move the
    column mean and standard deviation.
    :return: None (results are printed)
    """
    # First three columns only; the last column is not a feature
    data = pd.read_csv('dating.txt').iloc[:, :3]
    transfer = StandardScaler()
    result = transfer.fit_transform(data)
    print('data_new:\n', result)
    return None


if __name__ == '__main__':
    stand_demo()
data_new:
[[ 0.33193158 0.41660188 0.24523407]
[-0.87247784 0.13992897 1.69385734]
[-0.34554872 -1.20667094 -0.05422437]
...
[-0.32171752 0.96431572 0.06952649]
[ 0.65959911 0.60699509 -0.20931587]
[ 0.46120328 0.31183342 1.00680598]]
Process finished with exit code 0
4.特征降维
4.1 过滤方差特征
相关系数(皮尔逊相关系数)计算公式:r = Σ(x−x̄)(y−ȳ) / √( Σ(x−x̄)² · Σ(y−ȳ)² ),取值范围为 [-1, 1],绝对值越接近 1 相关性越强
通过scipy库中的pearsonr计算,返回的第一个值就是相关系数
from sklearn.feature_selection import VarianceThreshold
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
# 特征降维: 对象:二维数组
# 降低特征个数,得到一组不相关(相互独立)的主变量的过程
def variance_demo():
    """
    Drop low-variance features, then inspect pairwise correlations.

    :return: None (results are printed and a scatter plot is shown)
    """
    # 1. Load data and keep the feature columns (drop index, date, return)
    data = pd.read_csv('factor_returns.csv')
    print('data:\n', data)
    data = data.iloc[:, 1:-2]
    # 2. VarianceThreshold removes columns whose variance is <= threshold;
    #    default threshold 0 drops only constant columns.
    #    e.g. VarianceThreshold(threshold=10) would also drop variance <= 10.
    transfer = VarianceThreshold()
    # 3. Fit + transform
    data_new = transfer.fit_transform(data)
    print('data_new:\n', data_new, data_new.shape)
    # Pearson correlation for selected pairs; pearsonr returns (r, p-value),
    # the first element is the correlation coefficient.
    for col_a, col_b in (('pe_ratio', 'pb_ratio'), ('revenue', 'total_expense')):
        print('相关系数:\n', pearsonr(data[col_a], data[col_b]))
    # Visualize the strongly correlated pair as a scatter plot
    plt.figure(figsize=(20, 8), dpi=100)
    plt.scatter(data['revenue'], data['total_expense'])
    plt.show()
    return None


if __name__ == '__main__':
    variance_demo()
data:
index pe_ratio pb_ratio ... total_expense date return
0 000001.XSHE 5.9572 1.1818 ... 1.088254e+10 2012-01-31 0.027657
1 000002.XSHE 7.0289 1.5880 ... 2.378348e+10 2012-01-31 0.082352
2 000008.XSHE -262.7461 7.0003 ... 1.203008e+07 2012-01-31 0.099789
3 000060.XSHE 16.4760 3.7146 ... 7.935543e+09 2012-01-31 0.121595
4 000069.XSHE 12.5878 2.5616 ... 7.091398e+09 2012-01-31 -0.002681
... ... ... ... ... ... ... ...
2313 601888.XSHG 25.0848 4.2323 ... 1.041419e+10 2012-11-30 0.060727
2314 601901.XSHG 59.4849 1.6392 ... 1.089783e+09 2012-11-30 0.179148
2315 601933.XSHG 39.5523 4.0052 ... 1.749295e+10 2012-11-30 0.137134
2316 601958.XSHG 52.5408 2.4646 ... 6.009007e+09 2012-11-30 0.149167
2317 601989.XSHG 14.2203 1.4103 ... 4.132842e+10 2012-11-30 0.183629
[2318 rows x 12 columns]
data_new:
[[ 5.95720000e+00 1.18180000e+00 8.52525509e+10 ... 2.01000000e+00
2.07014010e+10 1.08825400e+10]
[ 7.02890000e+00 1.58800000e+00 8.41133582e+10 ... 3.26000000e-01
2.93083692e+10 2.37834769e+10]
[-2.62746100e+02 7.00030000e+00 5.17045520e+08 ... -6.00000000e-03
1.16798290e+07 1.20300800e+07]
...
[ 3.95523000e+01 4.00520000e+00 1.70243430e+10 ... 2.20000000e-01
1.78908166e+10 1.74929478e+10]
[ 5.25408000e+01 2.46460000e+00 3.28790988e+10 ... 1.21000000e-01
6.46539204e+09 6.00900728e+09]
[ 1.42203000e+01 1.41030000e+00 5.91108572e+10 ... 2.47000000e-01
4.50987171e+10 4.13284212e+10]] (2318, 9)
相关系数:
(-0.004389322779936271, 0.8327205496564927)
相关系数:
(0.9958450413136115, 0.0)
4.2 主成分分析
from sklearn.decomposition import PCA
def pca_demo():
    """
    PCA dimensionality reduction demo.

    n_components semantics:
      - float in (0, 1): keep that fraction of explained variance
      - int: reduce to that many components (dimensions)
    :return: None (results are printed)
    """
    data = [[2, 8, 4, 5], [6, 3, 0, 8], [5, 4, 9, 1]]
    # Reduce the 4 original features to 2 principal components
    transfer = PCA(n_components=2)
    print('data_new:\n', transfer.fit_transform(data))
    return None


if __name__ == '__main__':
    pca_demo()
data_new:
[[ 1.28620952e-15 3.82970843e+00]
[ 5.74456265e+00 -1.91485422e+00]
[-5.74456265e+00 -1.91485422e+00]]
Process finished with exit code 0
更多推荐
已为社区贡献6条内容
所有评论(0)