
English word vectors: a skip-gram implementation of word2vec (training word vectors without any framework)
Corpus used in the code:
Link: https://pan.baidu.com/s/1nu_TqPtstB8brL2jJ-WtAg
Extraction code: nwdo
A previous post covered how to train Chinese word vectors; this one does the same for English. English is much simpler than Chinese and has no encoding issues, so the code is given directly below. For how to run it, see the earlier post:
https://blog.csdn.net/Richard_Kim/article/details/120992735
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author: by ucas
# datetime:2021/10/21 11:24:08
# software:PyCharm
"""
1. Download a dataset or find an experimental corpus, and preprocess the data
2. Convert the raw vocabulary into dictionary mappings: word2index, index2word, and word -> one-hot vector
3. Build a sliding-window scanner for the skip-gram model
4. Build the skip-gram model
5. Train the model
6. Visualize the results
"""
import re
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from tqdm import tqdm, trange
import nltk
nltk.download('punkt')
# Change this to the path of your own corpus
with open("data/enDemoTest2", errors='ignore') as f:
    Data = f.read()
print(f)
print(Data[0:10])
print("Data length=", len(Data))
# Convert all letters to lower case
Data = Data.lower()
# Remove all standalone numbers
Data = re.sub(r"\b\d+\b", '', Data)
# Remove special characters, keeping only letters, digits and spaces
Data = re.sub(r'[^A-Za-z0-9 ]+', '', Data)
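# A quick illustration of what the three cleaning steps above do (the sentence is
# hypothetical, not from the corpus): "In 2021, NLP rocks!" becomes "in 2021, nlp rocks!"
# after lower-casing, "in , nlp rocks!" after dropping the standalone number, and roughly
# "in  nlp rocks" once the remaining punctuation is stripped.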
"""
word_tokenize() example:
text = nltk.word_tokenize("And now for something completely different")
print(text)
['And', 'now', 'for', 'something', 'completely', 'different']
"""
words = nltk.word_tokenize(Data)
print("words=", words)
# Build the word2index and index2word dictionaries
word2index = {}
index2word = {}
# Counter used to assign indices
count = 0
# Iterate over all words to build the word-to-index dictionary
for word in words:
    # Each word gets a unique index, assigned only on its first occurrence
    if word not in word2index.keys():
        word2index[word] = count
        count = count + 1
# Invert the word-to-index table to build the index-to-word dictionary
for word in word2index.keys():
    index2word[word2index[word]] = word
print("word2index len", len(word2index))
print("index2word len", len(index2word))
print("word2index=", word2index)
print("index2word=", index2word)
# Return a list of (center word, context word) pairs
def getWindows(words, window_size):
    Dataset = []
    for i in range(len(words)):
        # Take the window_size words before and after the center word
        for j in range(i - window_size, i + window_size + 1, 1):
            # print("j=", j)
            if j < 0 or j > (len(words) - 1) or j == i:
                continue
            Dataset.append((words[i], words[j]))
    return Dataset
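# A small worked example (hypothetical input, not from the corpus): with window_size=1,
# getWindows(['i', 'like', 'deep', 'learning'], 1) yields
# [('i', 'like'), ('like', 'i'), ('like', 'deep'), ('deep', 'like'),
#  ('deep', 'learning'), ('learning', 'deep')]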
# Generate the one-hot vector for a word
def getOneHotwordvec(word, word2index):
    """
    :param word: an input word, e.g. "study"
    :param word2index: the word-to-index dictionary
    :return: the one-hot vector of the input word
    """
    onehotWordvec = np.zeros(shape=(len(word2index), 1))  # as many rows as words in the vocabulary, one column
    onehotWordvec[word2index[word]][0] = 1  # set the entry at the word's index to 1
    return onehotWordvec
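# For illustration (hypothetical 5-word vocabulary): if word2index['study'] == 2,
# getOneHotwordvec('study', word2index) returns the (5, 1) column vector
# [[0.], [0.], [1.], [0.], [0.]], with a single 1 at row 2.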
# Test one-hot vector generation
word = 'study'
# print(getOneHotwordvec(word, word2index))
# print("get word context dataset = ", getWindows(words, 3)[5])
# Build the training set
def getTrain(words, window_size, word2index):
    X_train, y_train = [], []
    Dataset = getWindows(words, window_size)
    # print("Dataset=", Dataset)
    batch_size = 100
    # Take at most the first ~100,000 (center, context) pairs, in batches of batch_size
    for i in trange(1, 100000, batch_size):
        # Each (center word, context word) pair in the current batch
        for centre_word, context_word in tqdm(Dataset[i: i + batch_size - 1]):
            # One-hot vector for the center word (x) and for the context word (y)
            X_train.append(getOneHotwordvec(centre_word, word2index))
            y_train.append(getOneHotwordvec(context_word, word2index))
    # print("X_train", X_train)
    # print("y_train", y_train)
    return X_train, y_train
X_train, y_train = getTrain(words, 3, word2index)
print("X_train len=", len(X_train))
print("y_train len=", len(y_train))
X_train = np.array(X_train)
print("Finished converting x to an array")
y_train = np.array(y_train)
print("Finished converting y to an array")
print("X_train array shape=", X_train.shape)
print("y_train array shape=", y_train.shape)
# Each one-hot vector has shape (V, 1), so np.array(X_train) has shape (number of pairs, V, 1).
# The model below expects a (V, number of pairs) matrix with one one-hot column per training
# pair, so drop the trailing axis and transpose; deriving the sizes from the array itself keeps
# this working for any corpus.
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1]).T
print("X_train.shape after reshape=", X_train.shape)
print("X_train=", X_train)
y_train = y_train.reshape(y_train.shape[0], y_train.shape[1]).T
print("y_train.shape after reshape=", y_train.shape)
print("y_train=", y_train)
# Initialize the weight matrices. There are two of them, V x D and D x V; see the original
# word2vec paper for why the model is defined this way.
def weightInit(dimension, pDimension):
    # First-layer weight matrix
    W1 = np.random.randn(pDimension, dimension)
    # First-layer bias
    b1 = np.random.randn(pDimension, 1)
    # Second-layer weight matrix; its shape is the transpose of the first one's
    W2 = np.random.randn(dimension, pDimension)
    # Second-layer bias
    b2 = np.random.randn(dimension, 1)
    return W1, b1, W2, b2
# ReLU activation function
def relu(z):
    return np.maximum(0, z)
# Softmax function
def softmax(z):
    # Subtract the column-wise maximum before exponentiating so large logits do not overflow
    ex = np.exp(z - np.max(z, axis=0, keepdims=True))
    return ex / np.sum(ex, axis=0)
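# For example, softmax(np.array([[1.0], [2.0], [3.0]])) is roughly
# [[0.090], [0.245], [0.665]]: each column becomes a probability distribution that sums to 1.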
# Forward pass
def forward(x, W1, b1, W2, b2):
    # First layer: weight matrix times the input x, plus the bias
    Z1 = np.dot(W1, x) + b1
    # Apply the relu activation
    Z1 = relu(Z1)
    # Second layer: the previous layer's output times the second weight matrix, plus the second bias
    Z2 = np.dot(W2, Z1) + b2
    # Pass the second layer's output through softmax to get probabilities
    ypred = softmax(Z2)
    return Z1, Z2, ypred
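# Shape walkthrough (V = vocabulary size, D = pDimension = 300, m = number of training pairs):
#   x is (V, m), W1 is (D, V)  ->  Z1 = relu(W1 x + b1) is (D, m)
#   W2 is (V, D)               ->  Z2 = W2 Z1 + b2 is (V, m)
#   ypred = softmax(Z2) is (V, m), with every column summing to 1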
# Loss: average cross-entropy over the m training pairs
def errorCalculation(y, ypred, m):
    error = -(np.sum(np.multiply(y, np.log(ypred)))) / m
    return error
# Backward pass
def backProp(W1, b1, W2, b2, Z1, Z2, y, ypred, x):
    dW1 = np.dot(relu(np.dot(W2.T, ypred - y)), x.T)
    db1 = relu(np.dot(W2.T, ypred - y))
    dW2 = np.dot(ypred - y, Z1.T)
    db2 = ypred - y
    return dW1, db1, dW2, db2
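# Note on the gradients above: with a softmax output and cross-entropy loss, the gradient at
# the second layer is simply dZ2 = ypred - y, which gives dW2 = dZ2 . Z1^T and db2 = dZ2.
# Propagating back to the first layer would usually multiply W2^T . dZ2 by the relu derivative
# (a 0/1 mask on Z1); the code above uses the simpler variant of applying relu() to W2^T . dZ2.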
# Model training
def model(x, y, epoches=10, learning_rate=0.00001):
    # Number of rows and columns of x
    dimension = x.shape[0]
    m = x.shape[1]
    # Dimension of the generated word vectors: 300 means each word is represented by 300 numbers
    pDimension = 300
    W1, b1, W2, b2 = weightInit(dimension, pDimension)
    error = []
    for i in tqdm(range(epoches)):
        Z1, Z2, ypred = forward(x, W1, b1, W2, b2)
        error.append(errorCalculation(y, ypred, m))
        dW1, db1, dW2, db2 = backProp(W1, b1, W2, b2, Z1, Z2, y, ypred, x)
        # Update the weights and biases
        W1 = W1 - learning_rate * dW1
        b1 = b1 - learning_rate * db1
        W2 = W2 - learning_rate * dW2
        b2 = b2 - learning_rate * db2
    print("error=", error)
    return ypred, error, W1, W2
# Pass in the training set, the number of epochs, and the learning rate
ypred, error, W1, W2 = model(X_train, y_train, 10, 0.00001)
# Average the center-word matrix and the context-word matrix to get the word-vector matrix
W = np.add(W1, W2.T) / 2
# Build the word-embedding dictionary, in the format {word1: vector1, word2: vector2, ...}
word2vec = {}
for word in word2index.keys():
    # The column at a word's index in the word-vector matrix is that word's vector
    word2vec[word] = W[:, word2index[word]]
print("word2vec=", word2vec)
"""
Reduce the dimensionality further. For example, "food" converted to a word vector looks like:
'food': array([-2.17007321e-01, -5.89359459e-01, 1.02294753e+00, 1.90721062e+00,
-1.81680167e-01, 6.68201929e-01, 9.83180179e-01, -1.81787765e-01,
7.74756085e-01, 1.98217257e-01, -2.89414012e-01, 5.14236711e-01,
...... about 70 more lines omitted
2.64303636e-01, -4.24010150e-01, 2.40863888e-01, -5.55736922e-01,
7.84205414e-01, -6.04794130e-01, 2.40553756e-01, -1.19349800e-01,
5.47898499e-02, 5.63425593e-01, 9.52913677e-02, -1.09280454e+00]),
The dimensionality is still very high, so reduce it to 2 so the words can be plotted as points
in a 2-D plane. For how the dimensionality reduction (PCA) works, see the Zhihu answer below:
https://zhuanlan.zhihu.com/p/77151308
"""
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(W.T)
# After dimensionality reduction, build another embedding dictionary, in the format
# {word1: (dim1, dim2), word2: (dim1, dim2), ...}
word2ReduceDimensionVec = {}
for word in word2index.keys():
    word2ReduceDimensionVec[word] = principalComponents[word2index[word], :]
# Write the generated dictionary to a file
with open("en_wordvec2.txt", 'w') as f:
    for key in word2index.keys():
        f.write('\n')
        f.writelines('"' + str(key) + '":' + str(word2vec[key]))
        f.write('\n')
# Visualize the word vectors
plt.figure(figsize=(20, 20))
# Only plot 1000 words; plotting more than that makes the figure unreadable
count = 0
for word, wordvec in word2ReduceDimensionVec.items():
    if count < 1000:
        plt.scatter(wordvec[0], wordvec[1])
        plt.annotate(word, (wordvec[0], wordvec[1]))
    count += 1
plt.show()
Final result
Generated file: en_wordvec2.txt
QQ:530193235