运行环境:jupyter notebook
语言:python
首先导入一些包

import numpy as np
import pandas as pd
import jieba,time
import gensim
from sklearn.preprocessing import StandardScaler
import seaborn as sns

读取数据文件:

# Load the training data. pd.read_csv manages the file handle itself —
# the original open(...) was never closed (leaked handle).
train_df = pd.read_csv('baoyu_train.csv', encoding='utf-8')
# Fill missing values with per-column means, numeric columns only:
# calling .mean() on the raw frame raises on text columns in modern pandas,
# and fillna() ignores non-numeric columns either way.
train_df = train_df.fillna(train_df.mean(numeric_only=True))
train_df.head()

一、jieba分词

分词并打印分词时间以便观察进程

train_df.columns = ['分类', '文章']

# Read the stop-word file line by line (readlines() would pull the whole
# file into memory at once) and close the handle via `with` — the original
# leaked it. Store the words in a set: membership tests inside the
# per-token loop below are O(1) instead of O(n) for a list.
with open('stopwords.txt', encoding='utf-8') as stopword_file:
    stopword_set = {line.strip() for line in stopword_file if line.strip()}

cutWords_list = []

# Segment every article with jieba, dropping stop words, and report
# progress every 1000 articles so the (slow) run can be monitored.
startTime = time.time()
for i, article in enumerate(train_df['文章'], start=1):
    cutWords = [token for token in jieba.cut(article) if token not in stopword_set]
    if i % 1000 == 0:
        print('前%d篇文章分词共花费%.2f秒' % (i, time.time() - startTime))
    cutWords_list.append(cutWords)

保存分词结果到文件,并重新读取(停用词已在上一步去除):

# Persist the tokenised articles — one article per line, tokens separated
# by spaces — then read them back, so later runs can skip re-segmentation.
with open('cutWords_list.txt', 'w', encoding='utf-8') as out_file:
    out_file.writelines(' '.join(words) + '\n' for words in cutWords_list)
with open('cutWords_list.txt', encoding='utf-8') as in_file:
    cutWords_list = [line.split() for line in in_file]

二、Word2vec生成词向量

import warnings
warnings.filterwarnings('ignore')  # silence gensim's deprecation chatter
from gensim.models import Word2Vec

# Train a 100-dimensional Word2Vec model over the tokenised articles.
# gensim >= 4.0 renamed size -> vector_size and iter -> epochs; try the
# current keywords first and fall back to the legacy ones so the cell
# runs under either major version.
try:
    word2vec_model = Word2Vec(cutWords_list, vector_size=100, epochs=10, min_count=0)
except TypeError:
    word2vec_model = Word2Vec(cutWords_list, size=100, iter=10, min_count=0)

查看相近词:

word2vec_model.wv.most_similar('救命')

保存模型:

word2vec_model.save( 'word2vec_model.w2v' )

生成词向量并打印时间:

def getVector_v4(cutWords, word2vec_model):
    """Return the mean word vector of the tokens in *cutWords*.

    Tokens absent from the model's vocabulary are skipped. When no token
    is in the vocabulary (e.g. an empty article), a zero vector is
    returned — the original divided by zero and produced NaNs.

    NOTE(review): ``wv.index2word`` and ``layer1_size`` are gensim < 4
    attributes (gensim 4 renamed them to ``wv.index_to_key`` /
    ``wv.vector_size``) — confirm the installed gensim version.
    """
    index2word_set = set(word2vec_model.wv.index2word)
    article_vector = np.zeros(word2vec_model.layer1_size)
    matched = 0
    for cutWord in cutWords:
        if cutWord in index2word_set:
            article_vector = np.add(article_vector, word2vec_model.wv[cutWord])
            matched += 1
    if matched == 0:
        # No in-vocabulary token: avoid 0/0 -> NaN, return the zero vector.
        return article_vector
    return np.divide(article_vector, matched)
 
# Turn each tokenised article into its mean word vector, reporting progress
# every 1000 articles, then stack the vectors into the feature matrix X.
startTime = time.time()
vector_list = []
for i, cutWords in enumerate(cutWords_list[:7484], start=1):
    if i % 1000 == 0:
        print('前%d篇文章形成词向量花费%.2f秒' %(i, time.time()-startTime))
    vector_list.append(getVector_v4(cutWords, word2vec_model))

X = np.array(vector_list)
print('Total Time You Need To Get X:%.2f秒' % (time.time() - startTime) )

看一下X和y长度是否匹配:

from sklearn.preprocessing import LabelEncoder

# Re-read the raw labels; pd.read_csv manages the file handle itself
# (the original open(...) was never closed).
train_df = pd.read_csv('baoyu_train.csv', encoding='utf-8')
train_df.columns = ['分类', '文章']
# Map the Chinese category names to integer class ids for sklearn.
labelEncoder = LabelEncoder()
y = labelEncoder.fit_transform(train_df['分类'])
# Spot-check a feature row and confirm X and y have matching lengths.
print(X[12])
print(X.shape)
print(y.shape)

三、逻辑回归(LR)

首先划分训练集验证集:

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Hold out 25% of the data for validation and fit a logistic-regression
# baseline; .score() reports mean accuracy on the held-out split.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.25, random_state=0)
logistic_model = LogisticRegression()
logistic_model.fit(train_X, train_y)
logistic_model.score(test_X, test_y)

交叉验证:

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

# 10 random 70/30 shuffle splits give a more stable accuracy estimate
# than the single hold-out above.
cv_split = ShuffleSplit(n_splits=10, train_size=0.7, test_size=0.3)
logistic_model = LogisticRegression()
score_ndarray = cross_val_score(logistic_model, X, y, cv=cv_split)
print(score_ndarray)
print(score_ndarray.mean())

保存模型:

import joblib

# Serialise the fitted classifier to disk, then reload it to verify the
# model round-trips cleanly.
joblib.dump(logistic_model, 'logistic.model')
logistic_model = joblib.load('logistic.model')

查看混淆矩阵结果:

from sklearn.metrics import classification_report

# Evaluate on the held-out test file. Fixed: the original had a duplicated
# "test_df = test_df = ..." assignment and a leaked open() handle.
test_df = pd.read_csv('baoyu_test.csv', encoding='utf-8')
test_df.columns = ['分类', '文章']
test_label = labelEncoder.transform(test_df['分类'])
# getVectorMatrix is not defined anywhere in this file — presumably it maps
# raw articles to the same mean-word-vector features as training;
# TODO(review): confirm it exists in the notebook environment.
y_pred = logistic_model.predict(getVectorMatrix(test_df['文章']))
# inverse_transform expects a 1-D array of encoded labels, not a 2-D list.
print(labelEncoder.inverse_transform(list(range(2))))
print(classification_report(test_label, y_pred))

四、K近邻(KNN)

from sklearn.neighbors import KNeighborsClassifier

# Fit a 5-nearest-neighbour classifier on the same 75/25 split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)    # fit on the training split only
knn_model.predict(X_test)          # predictions on the held-out split
knn_model.score(X_test, y_test)    # mean accuracy on the held-out split

import joblib
joblib.dump(knn_model, 'knn.model')
knn_model = joblib.load('knn.model')  # reload to verify round-tripping

from sklearn.metrics import classification_report
# Fixed: the original had a duplicated "test_df = test_df = ..." assignment
# and a leaked open() handle.
test_df = pd.read_csv('baoyu_test.csv', encoding='utf-8')
test_df.columns = ['分类', '文章']
test_label = labelEncoder.transform(test_df['分类'])
# getVectorMatrix is not defined anywhere in this file — TODO(review):
# confirm it exists in the notebook environment.
y_pred = knn_model.predict(getVectorMatrix(test_df['文章']))
# inverse_transform expects a 1-D array of encoded labels, not a 2-D list.
print(labelEncoder.inverse_transform(list(range(2))))
print(classification_report(test_label, y_pred))

五、朴素贝叶斯(NB)

from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian naive-Bayes classifier on the same 75/25 split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
nb_model.score(X_test, y_test)  # mean accuracy on the held-out split

import joblib
joblib.dump(nb_model, 'nb.model')
nb_model = joblib.load('nb.model')  # reload to verify round-tripping

from sklearn.metrics import classification_report
# Fixed: the original had a duplicated "test_df = test_df = ..." assignment
# and a leaked open() handle.
test_df = pd.read_csv('baoyu_test.csv', encoding='utf-8')
test_df.columns = ['分类', '文章']
test_label = labelEncoder.transform(test_df['分类'])
# getVectorMatrix is not defined anywhere in this file — TODO(review):
# confirm it exists in the notebook environment.
y_pred = nb_model.predict(getVectorMatrix(test_df['文章']))
# inverse_transform expects a 1-D array of encoded labels, not a 2-D list.
print(labelEncoder.inverse_transform(list(range(2))))
print(classification_report(test_label, y_pred))

六、SVM支持向量机

from sklearn.svm import SVC

# Fit an RBF-kernel SVM; probability=True enables predict_proba for the
# ROC plot later (at the cost of extra internal cross-validation).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
svm_model = SVC(kernel='rbf', verbose=True, probability=True)
svm_model.fit(X_train, y_train)
svm_model.score(X_test, y_test)  # mean accuracy on the held-out split

import joblib
joblib.dump(svm_model, 'svm.model')
svm_model = joblib.load('svm.model')  # reload to verify round-tripping

from sklearn.metrics import classification_report
# Fixed: the original had a duplicated "test_df = test_df = ..." assignment
# and a leaked open() handle.
test_df = pd.read_csv('baoyu_test.csv', encoding='utf-8')
test_df.columns = ['分类', '文章']
test_label = labelEncoder.transform(test_df['分类'])
# getVectorMatrix is not defined anywhere in this file — TODO(review):
# confirm it exists in the notebook environment.
y_pred = svm_model.predict(getVectorMatrix(test_df['文章']))
# inverse_transform expects a 1-D array of encoded labels, not a 2-D list.
print(labelEncoder.inverse_transform(list(range(2))))
print(classification_report(test_label, y_pred))

七、极端梯度下降(XGBoost)

from xgboost import XGBClassifier

# Fit a gradient-boosted-trees classifier with default hyperparameters.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)
xgb_model.score(X_test, y_test)  # mean accuracy on the held-out split

import joblib
joblib.dump(xgb_model, 'xgb.model')
xgb_model = joblib.load('xgb.model')  # reload to verify round-tripping

from sklearn.metrics import classification_report
# Fixed: the original had a duplicated "test_df = test_df = ..." assignment
# and a leaked open() handle.
test_df = pd.read_csv('baoyu_test.csv', encoding='utf-8')
test_df.columns = ['分类', '文章']
test_label = labelEncoder.transform(test_df['分类'])
# getVectorMatrix is not defined anywhere in this file — TODO(review):
# confirm it exists in the notebook environment.
y_pred = xgb_model.predict(getVectorMatrix(test_df['文章']))
# inverse_transform expects a 1-D array of encoded labels, not a 2-D list.
print(labelEncoder.inverse_transform(list(range(2))))
print(classification_report(test_label, y_pred))

八、ROC曲线绘制

from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

plt.figure()

# One entry per model; adding an entry overlays another ROC curve on the
# same axes.
models = [
    {'label': 'LR', 'model': LogisticRegression()},
    {'label': 'KNN', 'model': KNeighborsClassifier()},
    {'label': 'NB', 'model': GaussianNB()},
    {'label': 'SVM', 'model': SVC(probability=True)},
    {'label': 'XGBoost', 'model': XGBClassifier()},
]

for m in models:
    model = m['model']
    model.fit(X_train, y_train)
    # Use the positive-class probability for BOTH the curve and the AUC.
    # The original computed the curve from probabilities but the AUC from
    # hard predict() labels (inconsistent, underestimates AUC) and bound
    # the result to `auc`, shadowing the imported auc() function.
    y_score = model.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score)
    roc_auc = metrics.auc(fpr, tpr)
    plt.plot(fpr, tpr, label='%s ROC (area = %0.2f)' % (m['label'], roc_auc))

# Chance diagonal and standard ROC axes/labels.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()
Logo

CSDN联合极客时间,共同打造面向开发者的精品内容学习社区,助力成长!

更多推荐