Four ways to implement the logistic regression algorithm in code
Logistic regression is, at its core, an algorithm for binary classification: https://zhuanlan.zhihu.com/p/46591702
For the mathematical background and how logistic regression is used to solve classification problems, see this video: https://www.bilibili.com/video/BV1yJ411d7xo?p=1
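Before the full examples below, here is a minimal sketch of the core idea (my own illustration, not taken from the linked material): logistic regression passes a linear score w·x + b through the sigmoid function to get a probability, then thresholds it at 0.5:

import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

w, b = np.array([1.5, -2.0]), 0.3   # example weights and bias (made up for illustration)
x = np.array([0.8, 0.1])            # one sample with two features
p = sigmoid(w @ x + b)              # estimated probability of class 1
print(p, int(p >= 0.5))             # ~0.786 -> predicted class 1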
Dataset: iris.data
5.1,3.5,1.4,0.2,Iris-setosa
4.9,3.0,1.4,0.2,Iris-setosa
4.7,3.2,1.3,0.2,Iris-setosa
4.6,3.1,1.5,0.2,Iris-setosa
5.0,3.6,1.4,0.2,Iris-setosa
5.4,3.9,1.7,0.4,Iris-setosa
4.6,3.4,1.4,0.3,Iris-setosa
5.0,3.4,1.5,0.2,Iris-setosa
4.4,2.9,1.4,0.2,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa
5.4,3.7,1.5,0.2,Iris-setosa
4.8,3.4,1.6,0.2,Iris-setosa
4.8,3.0,1.4,0.1,Iris-setosa
4.3,3.0,1.1,0.1,Iris-setosa
5.8,4.0,1.2,0.2,Iris-setosa
5.7,4.4,1.5,0.4,Iris-setosa
5.4,3.9,1.3,0.4,Iris-setosa
5.1,3.5,1.4,0.3,Iris-setosa
5.7,3.8,1.7,0.3,Iris-setosa
5.1,3.8,1.5,0.3,Iris-setosa
5.4,3.4,1.7,0.2,Iris-setosa
5.1,3.7,1.5,0.4,Iris-setosa
4.6,3.6,1.0,0.2,Iris-setosa
5.1,3.3,1.7,0.5,Iris-setosa
4.8,3.4,1.9,0.2,Iris-setosa
5.0,3.0,1.6,0.2,Iris-setosa
5.0,3.4,1.6,0.4,Iris-setosa
5.2,3.5,1.5,0.2,Iris-setosa
5.2,3.4,1.4,0.2,Iris-setosa
4.7,3.2,1.6,0.2,Iris-setosa
4.8,3.1,1.6,0.2,Iris-setosa
5.4,3.4,1.5,0.4,Iris-setosa
5.2,4.1,1.5,0.1,Iris-setosa
5.5,4.2,1.4,0.2,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa
5.0,3.2,1.2,0.2,Iris-setosa
5.5,3.5,1.3,0.2,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa
4.4,3.0,1.3,0.2,Iris-setosa
5.1,3.4,1.5,0.2,Iris-setosa
5.0,3.5,1.3,0.3,Iris-setosa
4.5,2.3,1.3,0.3,Iris-setosa
4.4,3.2,1.3,0.2,Iris-setosa
5.0,3.5,1.6,0.6,Iris-setosa
5.1,3.8,1.9,0.4,Iris-setosa
4.8,3.0,1.4,0.3,Iris-setosa
5.1,3.8,1.6,0.2,Iris-setosa
4.6,3.2,1.4,0.2,Iris-setosa
5.3,3.7,1.5,0.2,Iris-setosa
5.0,3.3,1.4,0.2,Iris-setosa
7.0,3.2,4.7,1.4,Iris-versicolor
6.4,3.2,4.5,1.5,Iris-versicolor
6.9,3.1,4.9,1.5,Iris-versicolor
5.5,2.3,4.0,1.3,Iris-versicolor
6.5,2.8,4.6,1.5,Iris-versicolor
5.7,2.8,4.5,1.3,Iris-versicolor
6.3,3.3,4.7,1.6,Iris-versicolor
4.9,2.4,3.3,1.0,Iris-versicolor
6.6,2.9,4.6,1.3,Iris-versicolor
5.2,2.7,3.9,1.4,Iris-versicolor
5.0,2.0,3.5,1.0,Iris-versicolor
5.9,3.0,4.2,1.5,Iris-versicolor
6.0,2.2,4.0,1.0,Iris-versicolor
6.1,2.9,4.7,1.4,Iris-versicolor
5.6,2.9,3.6,1.3,Iris-versicolor
6.7,3.1,4.4,1.4,Iris-versicolor
5.6,3.0,4.5,1.5,Iris-versicolor
5.8,2.7,4.1,1.0,Iris-versicolor
6.2,2.2,4.5,1.5,Iris-versicolor
5.6,2.5,3.9,1.1,Iris-versicolor
5.9,3.2,4.8,1.8,Iris-versicolor
6.1,2.8,4.0,1.3,Iris-versicolor
6.3,2.5,4.9,1.5,Iris-versicolor
6.1,2.8,4.7,1.2,Iris-versicolor
6.4,2.9,4.3,1.3,Iris-versicolor
6.6,3.0,4.4,1.4,Iris-versicolor
6.8,2.8,4.8,1.4,Iris-versicolor
6.7,3.0,5.0,1.7,Iris-versicolor
6.0,2.9,4.5,1.5,Iris-versicolor
5.7,2.6,3.5,1.0,Iris-versicolor
5.5,2.4,3.8,1.1,Iris-versicolor
5.5,2.4,3.7,1.0,Iris-versicolor
5.8,2.7,3.9,1.2,Iris-versicolor
6.0,2.7,5.1,1.6,Iris-versicolor
5.4,3.0,4.5,1.5,Iris-versicolor
6.0,3.4,4.5,1.6,Iris-versicolor
6.7,3.1,4.7,1.5,Iris-versicolor
6.3,2.3,4.4,1.3,Iris-versicolor
5.6,3.0,4.1,1.3,Iris-versicolor
5.5,2.5,4.0,1.3,Iris-versicolor
5.5,2.6,4.4,1.2,Iris-versicolor
6.1,3.0,4.6,1.4,Iris-versicolor
5.8,2.6,4.0,1.2,Iris-versicolor
5.0,2.3,3.3,1.0,Iris-versicolor
5.6,2.7,4.2,1.3,Iris-versicolor
5.7,3.0,4.2,1.2,Iris-versicolor
5.7,2.9,4.2,1.3,Iris-versicolor
6.2,2.9,4.3,1.3,Iris-versicolor
5.1,2.5,3.0,1.1,Iris-versicolor
5.7,2.8,4.1,1.3,Iris-versicolor
6.3,3.3,6.0,2.5,Iris-virginica
5.8,2.7,5.1,1.9,Iris-virginica
7.1,3.0,5.9,2.1,Iris-virginica
6.3,2.9,5.6,1.8,Iris-virginica
6.5,3.0,5.8,2.2,Iris-virginica
7.6,3.0,6.6,2.1,Iris-virginica
4.9,2.5,4.5,1.7,Iris-virginica
7.3,2.9,6.3,1.8,Iris-virginica
6.7,2.5,5.8,1.8,Iris-virginica
7.2,3.6,6.1,2.5,Iris-virginica
6.5,3.2,5.1,2.0,Iris-virginica
6.4,2.7,5.3,1.9,Iris-virginica
6.8,3.0,5.5,2.1,Iris-virginica
5.7,2.5,5.0,2.0,Iris-virginica
5.8,2.8,5.1,2.4,Iris-virginica
6.4,3.2,5.3,2.3,Iris-virginica
6.5,3.0,5.5,1.8,Iris-virginica
7.7,3.8,6.7,2.2,Iris-virginica
7.7,2.6,6.9,2.3,Iris-virginica
6.0,2.2,5.0,1.5,Iris-virginica
6.9,3.2,5.7,2.3,Iris-virginica
5.6,2.8,4.9,2.0,Iris-virginica
7.7,2.8,6.7,2.0,Iris-virginica
6.3,2.7,4.9,1.8,Iris-virginica
6.7,3.3,5.7,2.1,Iris-virginica
7.2,3.2,6.0,1.8,Iris-virginica
6.2,2.8,4.8,1.8,Iris-virginica
6.1,3.0,4.9,1.8,Iris-virginica
6.4,2.8,5.6,2.1,Iris-virginica
7.2,3.0,5.8,1.6,Iris-virginica
7.4,2.8,6.1,1.9,Iris-virginica
7.9,3.8,6.4,2.0,Iris-virginica
6.4,2.8,5.6,2.2,Iris-virginica
6.3,2.8,5.1,1.5,Iris-virginica
6.1,2.6,5.6,1.4,Iris-virginica
7.7,3.0,6.1,2.3,Iris-virginica
6.3,3.4,5.6,2.4,Iris-virginica
6.4,3.1,5.5,1.8,Iris-virginica
6.0,3.0,4.8,1.8,Iris-virginica
6.9,3.1,5.4,2.1,Iris-virginica
6.7,3.1,5.6,2.4,Iris-virginica
6.9,3.1,5.1,2.3,Iris-virginica
5.8,2.7,5.1,1.9,Iris-virginica
6.8,3.2,5.9,2.3,Iris-virginica
6.7,3.3,5.7,2.5,Iris-virginica
6.7,3.0,5.2,2.3,Iris-virginica
6.3,2.5,5.0,1.9,Iris-virginica
6.5,3.0,5.2,2.0,Iris-virginica
6.2,3.4,5.4,2.3,Iris-virginica
5.9,3.0,5.1,1.8,Iris-virginica
Method 1:
#!/usr/bin/python
# -*- coding:utf-8 -*-
#Iris flower classification
#1. Import the packages:
import numpy as np  # numerical computing library
from sklearn.linear_model import LogisticRegression  # logistic regression implementation
import matplotlib.pyplot as plt  # plotting library
import matplotlib as mpl  # imported for its colormap utilities
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn import datasets  # datasets bundled with sklearn
# Map the class-name bytes in the data file to integer labels (for np.loadtxt's converters)
def iris_type(s):
    it = {b'Iris-setosa': 0, b'Iris-versicolor': 1, b'Iris-virginica': 2}
    return it[s]
#2. Preprocess the data to obtain x and y
if __name__ == "__main__":
    path = u'iris.data'  # path to the data file
    # Load comma-separated floats; column 4 is converted with the iris_type function
    #data = np.loadtxt(path, dtype=float, delimiter=',', converters={4: iris_type})
    # print (data)
    # Split columns 0-3 into x and column 4 into y
    #x, y = np.split(data, (4,), axis=1)  # axis=1 splits column-wise; the default, axis=0, splits row-wise
    # For visualization, keep only the first two features
    #x = x[:, :2]
    # Use sklearn's built-in copy of the dataset instead
    iris = datasets.load_iris()
    x = iris.data[:, :2]  # we only take the first two features
    y = iris.target
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=666)
    #3. Build the model with a Pipeline
    # StandardScaler removes the mean and scales to unit variance; it works per feature
    # dimension (each column is standardized independently), not per sample.
    # PolynomialFeatures(degree=1) constructs polynomial feature combinations. Given two
    # features a and b, the degree-2 expansion is (1, a, b, a^2, ab, b^2). It has three parameters:
    # 1. degree: the degree of the polynomial
    # 2. interaction_only: default False; if True, terms combining a feature with itself are
    #    dropped, so the degree-2 expansion above would contain neither a^2 nor b^2.
    # 3. include_bias: default True; if True, the constant 1 column above is included.
    # LogisticRegression() builds the logistic regression model
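    # For illustration only (this snippet is my addition, not part of the original script):
    # a quick way to see what PolynomialFeatures produces for two features a=2, b=3:
    #   from sklearn.preprocessing import PolynomialFeatures
    #   PolynomialFeatures(degree=2).fit_transform([[2, 3]])
    #   # -> [[1., 2., 3., 4., 6., 9.]]  i.e. the columns 1, a, b, a^2, ab, b^2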
    lr = Pipeline([('sc', StandardScaler()),
                   ('clf', LogisticRegression(multi_class="multinomial", solver="newton-cg"))])
    lr.fit(X_train, y_train)  # fit on the training split; y_train is already 1-D
    #4. Prepare the plotting grid
    N, M = 500, 500  # number of sample points along each axis
    x1_min, x1_max = x[:, 0].min(), x[:, 0].max()  # range of column 0
    x2_min, x2_max = x[:, 1].min(), x[:, 1].max()  # range of column 1
    t1 = np.linspace(x1_min, x1_max, N)
    t2 = np.linspace(x2_min, x2_max, M)
    x1, x2 = np.meshgrid(t1, t2)  # build the grid of sample points
    x_test = np.stack((x1.flat, x2.flat), axis=1)  # grid points to classify
    #5. Plot
    cm_light = mpl.colors.ListedColormap(['#77E0A0', '#FF8080', '#A0A0FF'])
    cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])
    y_hat = lr.predict(x_test)  # predictions on the grid
    y_hat = y_hat.reshape(x1.shape)  # reshape to match the grid
    # print(y_hat)
    plt.pcolormesh(x1, x2, y_hat, shading='auto', cmap=cm_light)  # predicted regions as the background
    plt.scatter(x[:, 0], x[:, 1], c=y.ravel(), edgecolors='k', s=50, cmap=cm_dark)  # the samples
    plt.xlabel('sepal length')   # the first two iris features are sepal length/width, not petal
    plt.ylabel('sepal width')
    plt.xlim(x1_min, x1_max)
    plt.ylim(x2_min, x2_max)
    plt.grid()
    plt.savefig('2.png')
    plt.show()
    #6. Predictions on the full dataset (note: this includes the samples the model was trained on)
    y_hat = lr.predict(x)  # predicted labels
    y = y.ravel()  # flatten to 1-D
    print(y)
    #y = y.reshape(-1)  # an equivalent way to flatten
    #print(y)
    result = y_hat == y  # compare predictions with the true labels
    print(y_hat)
    print(result)
    acc = np.mean(result)  # fraction of correct predictions
    print('Accuracy: %.2f%%' % (100 * acc))
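Note that the accuracy above is measured on the full dataset, most of which the model saw during training. A minimal sketch of evaluating on the held-out split instead, reusing the lr, X_test, and y_test names already defined above (paste it at the end of the block):

    print('Test accuracy: %.2f%%' % (100 * lr.score(X_test, y_test)))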
Method 2: the official scikit-learn example, https://scikit-learn.org/stable/auto_examples/linear_model/plot_iris_logistic.html#sphx-glr-auto-examples-linear-model-plot-iris-logistic-py
print(__doc__)
# Code source: Gaël Varoquaux
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2] # we only take the first two features.
Y = iris.target
# Create an instance of Logistic Regression Classifier and fit the data.
logreg = LogisticRegression(C=1e5)
logreg.fit(X, Y)
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
h = .02 # step size in the mesh
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = logreg.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1, figsize=(4, 3))
plt.pcolormesh(xx, yy, Z, shading='auto', cmap=plt.cm.Paired)  # shading='auto' added so newer matplotlib accepts equal-sized grids
# Plot also the training points
plt.scatter(X[:, 0], X[:, 1], c=Y, edgecolors='k', cmap=plt.cm.Paired)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks(())
plt.yticks(())
plt.show()
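As an aside, scikit-learn 1.1+ ships a helper that replaces the manual meshgrid/pcolormesh steps above. A minimal sketch, assuming a recent sklearn version (my addition, not part of the official example):

from sklearn.inspection import DecisionBoundaryDisplay
DecisionBoundaryDisplay.from_estimator(logreg, X, response_method="predict",
                                       cmap=plt.cm.Paired, alpha=0.8)
plt.scatter(X[:, 0], X[:, 1], c=Y, edgecolors='k', cmap=plt.cm.Paired)
plt.show()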
Method 3:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
def plot_decision_boundary(model, axis):
    # Sample the rectangle [axis[0], axis[1]] x [axis[2], axis[3]] at ~100 points per unit
    x0, x1 = np.meshgrid(
        np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
        np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1),
    )
    X_new = np.c_[x0.ravel(), x1.ravel()]
    y_predict = model.predict(X_new)
    zz = y_predict.reshape(x0.shape)
    from matplotlib.colors import ListedColormap
    custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
    plt.contourf(x0, x1, zz, cmap=custom_cmap)  # contourf takes no linewidth argument, so it was removed
iris = datasets.load_iris()
X = iris.data[:,:2]  # use only the first two features for this analysis
print(X)
y = iris.target
# random_state fixes the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)  # randomly split into a training set (X_train, y_train) and a test set (X_test, y_test)
log_reg2 = LogisticRegression()  # create the logistic regression model
log_reg2.fit(X_train, y_train)  # fit it to the training data
print('Accuracy with two features:', log_reg2.score(X_test, y_test))  # accuracy on the held-out test set
plot_decision_boundary(log_reg2, axis=[4, 8.5, 1.5, 4.5])
plt.scatter(X[y==0,0], X[y==0,1])
plt.scatter(X[y==1,0], X[y==1,1])
plt.scatter(X[y==2,0], X[y==2,1])
plt.show()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
print('Accuracy with all features:', log_reg.score(X_test, y_test))
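Logistic regression also exposes class probabilities rather than only hard labels. A minimal sketch reusing the log_reg model fitted just above (my addition, not part of the original):

print(log_reg.predict_proba(X_test[:3]))  # per-class probabilities; each row sums to 1
print(log_reg.predict(X_test[:3]))        # the corresponding hard predictions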
Method 4: a hand-written logistic regression algorithm
logistic_regression_binary.csv
-0.017612 14.053064 0
-1.395634 4.662541 1
-0.752157 6.538620 0
-1.322371 7.152853 0
0.423363 11.054677 0
0.406704 7.067335 1
0.667394 12.741452 0
-2.460150 6.866805 1
0.569411 9.548755 0
-0.026632 10.427743 0
0.850433 6.920334 1
1.347183 13.175500 0
1.176813 3.167020 1
-1.781871 9.097953 0
-0.566606 5.749003 1
0.931635 1.589505 1
-0.024205 6.151823 1
-0.036453 2.690988 1
-0.196949 0.444165 1
1.014459 5.754399 1
1.985298 3.230619 1
-1.693453 -0.557540 1
-0.576525 11.778922 0
-0.346811 -1.678730 1
-2.124484 2.672471 1
1.217916 9.597015 0
-0.733928 9.098687 0
-3.642001 -1.618087 1
0.315985 3.523953 1
1.416614 9.619232 0
-0.386323 3.989286 1
0.556921 8.294984 1
1.224863 11.587360 0
-1.347803 -2.406051 1
1.196604 4.951851 1
0.275221 9.543647 0
0.470575 9.332488 0
-1.889567 9.542662 0
-1.527893 12.150579 0
-1.185247 11.309318 0
-0.445678 3.297303 1
1.042222 6.105155 1
-0.618787 10.320986 0
1.152083 0.548467 1
0.828534 2.676045 1
-1.237728 10.549033 0
-0.683565 -2.166125 1
0.229456 5.921938 1
-0.959885 11.555336 0
0.492911 10.993324 0
0.184992 8.721488 0
-0.355715 10.325976 0
-0.397822 8.058397 0
0.824839 13.730343 0
1.507278 5.027866 1
0.099671 6.835839 1
-0.344008 10.717485 0
1.785928 7.718645 1
-0.918801 11.560217 0
-0.364009 4.747300 1
-0.841722 4.119083 1
0.490426 1.960539 1
-0.007194 9.075792 0
0.356107 12.447863 0
0.342578 12.281162 0
-0.810823 -1.466018 1
2.530777 6.476801 1
1.296683 11.607559 0
0.475487 12.040035 0
-0.783277 11.009725 0
0.074798 11.023650 0
-1.337472 0.468339 1
-0.102781 13.763651 0
-0.147324 2.874846 1
0.518389 9.887035 0
1.015399 7.571882 0
-1.658086 -0.027255 1
1.319944 2.171228 1
2.056216 5.019981 1
-0.851633 4.375691 1
-1.510047 6.061992 0
-1.076637 -3.181888 1
1.821096 10.283990 0
3.010150 8.401766 1
-1.099458 1.688274 1
-0.834872 -1.733869 1
-0.846637 3.849075 1
1.400102 12.628781 0
1.752842 5.468166 1
0.078557 0.059736 1
0.089392 -0.715300 1
1.825662 12.693808 0
0.197445 9.744638 0
0.126117 0.922311 1
-0.679797 1.220530 1
0.677983 2.556666 1
0.761349 10.693862 0
-2.168791 0.143632 1
1.388610 9.341997 0
0.317029 14.739025 0
# A hand-written logistic regression algorithm for binary classification.
# Unlike many copies of this code circulating online, this version runs as-is
# and is commented throughout.
import numpy as np  # numerical computing library
import matplotlib.pyplot as plt  # plotting library
# Sigmoid function: the logistic function at the heart of logistic regression.
# Its output lies in (0, 1) and is read as the probability that the sample is class 1:
# predict 1 when the output is >= 0.5, and 0 when it is < 0.5.
def sigmoid(x):
    return 1/(1+np.exp(-x))
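# Caveat (my note, not in the original): for large negative x, np.exp(-x) can
# overflow float64; in practice one can clip the input, e.g. np.clip(x, -500, 500),
# or use scipy.special.expit, which computes the same function stably.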
# Cross-entropy loss for a single sample;
# hx is the probability estimate produced by the sigmoid,
# y is the true label (0 or 1)
def cost(hx, y):
    return -y * np.log(hx) - (1-y) * np.log(1-hx)
# One step of batch gradient descent
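# Derivation note (my addition): for the cross-entropy loss above, the gradient with
# respect to the parameters is sum_i (sigmoid(w . x_i) - y_i) * x_i, which is exactly
# what the loop below accumulates before taking one descent step.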
def gradient(current_para, x, y, learning_rate):
    m = len(y)
    matrix_gradient = np.zeros(len(x[0]))
    for i in range(m):
        current_x = x[i]
        current_y = y[i]
        current_x = np.asarray(current_x)
        matrix_gradient += (sigmoid(np.dot(current_para, current_x)) - current_y) * current_x
    new_para = current_para - learning_rate * matrix_gradient
    return new_para
# Estimate the error rate over the dataset
def error(para, x, y):
    total = len(y)
    error_num = 0
    for i in range(total):
        current_x = x[i]
        current_y = y[i]
        hx = sigmoid(np.dot(para, current_x))  # predicted probability of class 1
        # Count an error when the loss exceeds 0.5, i.e. when the probability
        # assigned to the true class is below e^-0.5 ~ 0.61
        if cost(hx, current_y) > 0.5:
            error_num += 1
    return error_num/total
# Training loop
def train(initial_para, x, y, learning_rate, num_iter):
    dataMat = np.asarray(x)
    labelMat = np.asarray(y)
    para = initial_para
    for i in range(num_iter+1):
        para = gradient(para, dataMat, labelMat, learning_rate)  # one gradient-descent step
        if i % 100 == 0:
            err = error(para, dataMat, labelMat)
            print("iter:" + str(i) + " ; error:" + str(err))
    return para
# Load the dataset
def load_dataset():
    dataMat = []
    labelMat = []
    with open("logistic_regression_binary.csv", "r") as file_object:
        lines = file_object.readlines()
        for line in lines:
            line_array = line.strip().split()  # fields are whitespace-separated despite the .csv name
            # feature row: a constant 1.0 for the bias term, then the two features
            dataMat.append([1.0, float(line_array[0]), float(line_array[1])])
            # label
            labelMat.append(int(line_array[2]))
    return dataMat, labelMat
# Plot the samples and the learned decision boundary
def plotBestFit(wei, data, label):
    if type(wei).__name__ == 'ndarray':
        weights = wei
    else:
        weights = wei.getA()  # convert an np.matrix result to ndarray
    fig = plt.figure(0)
    ax = fig.add_subplot(111)
    xxx = np.arange(-3, 3, 0.1)
    # Decision boundary: w0 + w1*x + w2*y = 0, so y = -(w0 + w1*x) / w2
    yyy = - weights[0]/weights[2] - weights[1]/weights[2]*xxx
    ax.plot(xxx, yyy)
    cord1 = []
    cord0 = []
    for i in range(len(label)):
        if label[i] == 1:
            cord1.append(data[i][1:3])
        else:
            cord0.append(data[i][1:3])
    cord1 = np.array(cord1)
    cord0 = np.array(cord0)
    ax.scatter(cord1[:, 0], cord1[:, 1], c='red')
    ax.scatter(cord0[:, 0], cord0[:, 1], c='green')
    plt.show()
def logistic_regression():
    x, y = load_dataset()
    n = len(x[0])
    initial_para = np.ones(n)
    learning_rate = 0.001
    num_iter = 1000
    print("Initial parameters:")
    print(initial_para)
    para = train(initial_para, x, y, learning_rate, num_iter)
    print("Final trained parameters:")
    print(para)
    plotBestFit(para, x, y)
logistic_regression()
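As a side note, the per-sample loop in gradient() above can be collapsed into one matrix expression. A minimal vectorized sketch (my addition, equivalent to the loop but not part of the original code):

def gradient_vectorized(current_para, x, y, learning_rate):
    # Batch gradient in a single matrix operation: X^T (sigmoid(X w) - y)
    x = np.asarray(x)
    y = np.asarray(y)
    grad = x.T @ (sigmoid(x @ current_para) - y)
    return current_para - learning_rate * grad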
An example of product classification:
Logistic regression outperforms Naive Bayes for product classification. Article: https://mp.weixin.qq.com/s/GUX0BNub7PnAe4AQlvOWNA
1. Product classification with the Naive Bayes algorithm:
https://blog.csdn.net/qq_37797234/article/details/102875919
2. Automatically classifying crawled JD.com products by their titles: text classification based on logistic regression
https://www.cnblogs.com/lovema1210/p/12510750.html
3. Otto product classification (part 2): logistic regression prediction and hyperparameter tuning
https://blog.csdn.net/weixin_38664232/article/details/86916914