Binary text classification in a Jupyter notebook with jieba + word2vec + KNN, LR, NB, SVM, and XGBoost
Environment: Jupyter Notebook
Language: Python
First, import the required packages:
import numpy as np
import pandas as pd
import jieba,time
import gensim
from sklearn.preprocessing import StandardScaler
import seaborn as sns
Read the data file:
train_df = pd.read_csv('baoyu_train.csv', encoding='utf-8')
train_df = train_df.fillna('')  # the articles are text, so fill missing values with empty strings (a column mean is meaningless here)
train_df.head()
1. Word segmentation with jieba
Segment each article and print the elapsed time to monitor progress:
train_df.columns = ['分类', '文章']
# stopword_list = [k.strip() for k in open('stopwords.txt', encoding='utf-8').readlines() if k.strip() != '']
# The line above is not recommended: readlines() loads the whole file into memory at once,
# which is wasteful for large files. Iterating over the file object is better:
stopword_list = [k.strip() for k in open('stopwords.txt', encoding='utf-8') if k.strip() != '']
cutWords_list = []
i = 0
startTime = time.time()
for article in train_df['文章']:
    cutWords = [k for k in jieba.cut(article) if k not in stopword_list]
    i += 1
    if i % 1000 == 0:
        print('Segmented the first %d articles in %.2f seconds' % (i, time.time() - startTime))
    cutWords_list.append(cutWords)
Stopwords were already filtered in the loop above; now save the segmented word lists to disk so they can be reloaded later:
with open('cutWords_list.txt', 'w', encoding='utf-8') as file:
    for cutWords in cutWords_list:
        file.write(' '.join(cutWords) + '\n')
with open('cutWords_list.txt', encoding='utf-8') as file:
    cutWords_list = [k.split() for k in file]
2. Generating word vectors with Word2Vec
import warnings
warnings.filterwarnings('ignore')
from gensim.models import Word2Vec
word2vec_model = Word2Vec(cutWords_list, size=100, iter=10, min_count=0)
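Note that size and iter are the gensim 3.x parameter names; on gensim 4.x they were renamed, so the equivalent call would be:
word2vec_model = Word2Vec(cutWords_list, vector_size=100, epochs=10, min_count=0)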
Check the most similar words:
word2vec_model.wv.most_similar('救命')
Save the model:
word2vec_model.save( 'word2vec_model.w2v' )
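To reload the saved model in a later session:
from gensim.models import Word2Vec
word2vec_model = Word2Vec.load('word2vec_model.w2v')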
Build an article vector by averaging its word vectors, printing the elapsed time:
def getVector_v4(cutWords, word2vec_model):
    # Average the vectors of the words that appear in the model's vocabulary
    i = 0
    index2word_set = set(word2vec_model.wv.index2word)  # on gensim 4.x: word2vec_model.wv.index_to_key
    article_vector = np.zeros(word2vec_model.wv.vector_size)
    for cutWord in cutWords:
        if cutWord in index2word_set:
            article_vector = np.add(article_vector, word2vec_model.wv[cutWord])
            i += 1
    return np.divide(article_vector, max(i, 1))  # max() guards against division by zero for empty articles
startTime = time.time()
vector_list = []
i = 0
for cutWords in cutWords_list[:7484]:  # 7484 is the number of training articles
    i += 1
    if i % 1000 == 0:
        print('Vectorized the first %d articles in %.2f seconds' % (i, time.time() - startTime))
    vector_list.append(getVector_v4(cutWords, word2vec_model))
X = np.array(vector_list)
print('Total time needed to build X: %.2f seconds' % (time.time() - startTime))
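StandardScaler is imported at the top but never used in the original post. If you want standardized features for the distance-based models below (KNN, SVM), a minimal sketch:
scaler = StandardScaler()
X = scaler.fit_transform(X)  # zero mean, unit variance per dimension
Note that any test-set vectors must then be passed through scaler.transform() with this same fitted scaler.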
Check that the lengths of X and y match:
from sklearn.preprocessing import LabelEncoder
train_df = pd.read_csv('baoyu_train.csv', encoding='utf-8')
train_df.columns = ['分类', '文章']
labelEncoder = LabelEncoder()
y = labelEncoder.fit_transform(train_df['分类'])
print(X[12])
print(X.shape)
print(y.shape)
3. Logistic regression (LR)
First split off a validation set:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.25, random_state=0)
logistic_model = LogisticRegression()
logistic_model.fit(train_X, train_y)
logistic_model.score(test_X, test_y)
Cross-validation:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
cv_split = ShuffleSplit(n_splits=10, train_size=0.7, test_size=0.3)
logistic_model = LogisticRegression()
score_ndarray = cross_val_score(logistic_model, X, y, cv=cv_split)
print(score_ndarray)
print(score_ndarray.mean())
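seaborn is imported at the top but otherwise unused; one optional way to put it to work is to visualize the ten fold scores:
import matplotlib.pyplot as plt
sns.barplot(x=list(range(10)), y=score_ndarray)  # one bar per cross-validation fold
plt.xlabel('fold')
plt.ylabel('accuracy')
plt.show()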
Save the model:
import joblib
joblib.dump(logistic_model, 'logistic.model')
# load the model
logistic_model = joblib.load('logistic.model')
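The helper getVectorMatrix called below is never defined in the original post. A minimal sketch, assuming it mirrors the training preprocessing above (jieba segmentation, stopword filtering, then averaging word vectors with getVector_v4):
def getVectorMatrix(article_series):
    # Segment each article, drop stopwords, and average its word vectors
    matrix_list = []
    for article in article_series:
        cutWords = [k for k in jieba.cut(article) if k not in stopword_list]
        matrix_list.append(getVector_v4(cutWords, word2vec_model))
    return np.array(matrix_list)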
Check the classification report on the test set (precision, recall, and F1 per class):
from sklearn.metrics import classification_report
test_df = pd.read_csv('baoyu_test.csv', encoding='utf-8')
test_df.columns = ['分类', '文章']
test_label = labelEncoder.transform(test_df['分类'])
y_pred = logistic_model.predict(getVectorMatrix(test_df['文章']))
print(labelEncoder.inverse_transform(np.arange(2)))  # show which original label each encoded class maps to
print(classification_report(test_label, y_pred))
4. K-nearest neighbors (KNN)
from sklearn.neighbors import KNeighborsClassifier
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=0)
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)  # fit on the training set
knn_model.predict(X_test)  # predict on the test set
knn_model.score(X_test, y_test)  # accuracy on the test set
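n_neighbors=5 is just the sklearn default; a quick optional scan of a few values, reusing the cross_val_score and cv_split defined in section 3:
for k in (3, 5, 7, 9):
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k), X, y, cv=cv_split)
    print('k=%d: mean accuracy %.4f' % (k, scores.mean()))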
import joblib
joblib.dump(knn_model, 'knn.model')
# load the model
knn_model = joblib.load('knn.model')
from sklearn.metrics import classification_report
test_df = pd.read_csv('baoyu_test.csv', encoding='utf-8')
test_df.columns = ['分类', '文章']
test_label = labelEncoder.transform(test_df['分类'])
y_pred = knn_model.predict(getVectorMatrix(test_df['文章']))
print(labelEncoder.inverse_transform(np.arange(2)))
print(classification_report(test_label, y_pred))
5. Naive Bayes (NB)
from sklearn.naive_bayes import GaussianNB
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=0)
nb_model = GaussianNB()
nb_model.fit(X_train,y_train)
nb_model.score(X_test,y_test)
import joblib
joblib.dump(nb_model, 'nb.model')
# load the model
nb_model = joblib.load('nb.model')
from sklearn.metrics import classification_report
test_df = pd.read_csv('baoyu_test.csv', encoding='utf-8')
test_df.columns = ['分类', '文章']
test_label = labelEncoder.transform(test_df['分类'])
y_pred = nb_model.predict(getVectorMatrix(test_df['文章']))
print(labelEncoder.inverse_transform(np.arange(2)))
print(classification_report(test_label, y_pred))
6. Support vector machine (SVM)
from sklearn.svm import SVC
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=0)
svm_model = SVC(kernel='rbf', verbose=True, probability=True)
svm_model.fit(X_train,y_train)
svm_model.score(X_test,y_test)
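Note: probability=True makes SVC run an extra internal cross-validation to calibrate predict_proba (which the ROC curves in section 8 rely on), so training is noticeably slower than with the default probability=False.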
import joblib
joblib.dump(svm_model, 'svm.model')
# load the model
svm_model = joblib.load('svm.model')
from sklearn.metrics import classification_report
test_df = pd.read_csv('baoyu_test.csv', encoding='utf-8')
test_df.columns = ['分类', '文章']
test_label = labelEncoder.transform(test_df['分类'])
y_pred = svm_model.predict(getVectorMatrix(test_df['文章']))
print(labelEncoder.inverse_transform(np.arange(2)))
print(classification_report(test_label, y_pred))
7. Extreme gradient boosting (XGBoost)
from xgboost import XGBClassifier
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=0)
xgb_model = XGBClassifier()
xgb_model.fit(X_train,y_train)
xgb_model.score(X_test,y_test)
import joblib
joblib.dump(xgb_model, 'xgb.model')
# load the model
xgb_model = joblib.load('xgb.model')
from sklearn.metrics import classification_report
test_df = pd.read_csv('baoyu_test.csv', encoding='utf-8')
test_df.columns = ['分类', '文章']
test_label = labelEncoder.transform(test_df['分类'])
y_pred = xgb_model.predict(getVectorMatrix(test_df['文章']))
print(labelEncoder.inverse_transform(np.arange(2)))
print(classification_report(test_label, y_pred))
8. Plotting ROC curves
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
plt.figure()
# Add each model you want on the ROC plot to this list; all curves are drawn in one figure
models = [
    {'label': 'LR', 'model': LogisticRegression()},
    {'label': 'KNN', 'model': KNeighborsClassifier()},
    {'label': 'NB', 'model': GaussianNB()},
    {'label': 'SVM', 'model': SVC(probability=True)},
    {'label': 'XGBoost', 'model': XGBClassifier()},
]
# Loop over the models: fit each one and draw its ROC curve
for m in models:
    model = m['model']
    model.fit(X_train, y_train)
    # Compute false positive rate and true positive rate from predicted probabilities
    fpr, tpr, thresholds = metrics.roc_curve(y_test, model.predict_proba(X_test)[:, 1])
    # AUC should also be computed from probabilities, not hard class predictions;
    # use a name other than auc so we don't shadow the auc imported from sklearn.metrics
    auc_score = metrics.roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    plt.plot(fpr, tpr, label='%s ROC (area = %0.2f)' % (m['label'], auc_score))
# Custom settings for the plot
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show() # Display
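To also keep a copy of the figure on disk, call plt.savefig('roc_curves.png', dpi=150) before plt.show() (the filename here is just an example).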