# NNLM的PyTorch实现

### A Neural Probabilistic Language Model

• C(i)：单词w对应的词向量，其中i为词w在整个词汇表中的索引
• C：词向量矩阵，大小为$|V| \times m$
• |V|：词汇表的大小，即语料库中去重后的单词个数
• m：词向量的维度，一般是50到200
• H：隐藏层的weight
• d：隐藏层的bias
• U：输出层的weight
• b：输出层的bias
• W：输入层到输出层的weight
• h：隐藏层神经元个数

1. 首先将输入的n-1个单词索引转为词向量，然后将这n-1个词向量进行concat，形成一个长度为$(n-1) \times m$的向量，用X表示
2. 将X送入隐藏层进行计算，$hidden_{out} = \tanh(d + X \cdot H)$
3. 输出层共有|V|个节点，每个节点$y_i$（经softmax归一化后）表示预测下一个单词为i的概率，y的计算公式为$y = b + X \cdot W + hidden_{out} \cdot U$

### 代码实现（PyTorch）

# code by Tae Hwan Jung @graykode, modify by wmathor
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data

# Legacy tensor-type alias; NNLM casts its randomly initialised parameters with .type(dtype).
dtype = torch.FloatTensor
sentences = ["i like dog", "i love coffee", "i hate milk"]

# All tokens of the corpus in reading order:
# ['i', 'like', 'dog', 'i', 'love', 'coffee', 'i', 'hate', 'milk']
word_list = " ".join(sentences).split()
# De-duplicate while keeping first-occurrence order. A plain set() would give a
# different word order (and therefore different indices) from run to run because
# string hashing is randomised per process.
word_list = list(dict.fromkeys(word_list))  # ['i', 'like', 'dog', 'love', 'coffee', 'hate', 'milk']
word_dict = {w: i for i, w in enumerate(word_list)}  # {'i':0, 'like':1, 'dog':2, 'love':3, 'coffee':4, 'hate':5, 'milk':6}
number_dict = {i: w for i, w in enumerate(word_list)}  # {0:'i', 1:'like', 2:'dog', 3:'love', 4:'coffee', 5:'hate', 6:'milk'}
n_class = len(word_dict)  # vocabulary size |V|; 7 for this corpus

# NNLM (Neural Network Language Model) hyper-parameters
n_step = len(sentences[0].split()) - 1  # n-1 in the paper: number of context words, taken from the first sentence (2 here)
n_hidden = 2  # h in the paper: hidden layer width
m = 2  # m in the paper: word embedding dimension

def make_batch(sentences, vocab=None):
    """Turn whitespace-tokenised sentences into (context, target) index lists.

    For every sentence, all words except the last become the input context and
    the last word becomes the prediction target.

    Args:
        sentences: Iterable of strings; each is split on whitespace.
        vocab: Optional word -> index mapping. When omitted, the module-level
            ``word_dict`` is used, which is the original behaviour.

    Returns:
        A tuple ``(input_batch, target_batch)``: a list of per-sentence context
        index lists and a list of target indices, in sentence order.

    Raises:
        KeyError: If a word is missing from the vocabulary.
        IndexError: If a sentence contains no words.
    """
    if vocab is None:
        vocab = word_dict

    input_batch = []
    target_batch = []

    for sen in sentences:
        words = sen.split()
        # e.g. [0, 1], [0, 3], [0, 5] for the sample corpus
        context_ids = [vocab[w] for w in words[:-1]]
        # e.g. 2, 4, 6 for the sample corpus
        target_id = vocab[words[-1]]

        input_batch.append(context_ids)
        target_batch.append(target_id)

    return input_batch, target_batch

# Convert the index lists from make_batch into int64 tensors: nn.Embedding takes
# integer indices as input and CrossEntropyLoss takes class indices as targets.
# torch.tensor(..., dtype=torch.long) replaces the legacy torch.LongTensor(data) constructor.
input_batch, target_batch = make_batch(sentences)
input_batch = torch.tensor(input_batch, dtype=torch.long)  # [[0, 1], [0, 3], [0, 5]] for the sample corpus
target_batch = torch.tensor(target_batch, dtype=torch.long)  # [2, 4, 6] for the sample corpus

# `Data` is the torch.utils.data module. Pair every context with its target and
# serve them in shuffled mini-batches of up to 16 samples.
dataset = Data.TensorDataset(input_batch, target_batch)
loader = Data.DataLoader(dataset=dataset, batch_size=16, shuffle=True)
class NNLM(nn.Module):
    """Feed-forward neural probabilistic language model.

    Computes ``y = b + X.W + tanh(d + X.H).U`` where ``X`` is the concatenation
    of the ``n_step`` context-word embeddings. All sizes are read from the
    module-level ``n_class``, ``m``, ``n_step`` and ``n_hidden``.
    """

    def __init__(self):
        super().__init__()
        context_width = n_step * m  # length of the concatenated context vector
        # C: embedding table with one m-dimensional row per vocabulary word.
        self.C = nn.Embedding(n_class, m)
        # H, d: hidden layer weight and bias.
        self.H = nn.Parameter(torch.randn(context_width, n_hidden).type(dtype))
        # W: direct connection from the concatenated embeddings to the output layer.
        self.W = nn.Parameter(torch.randn(context_width, n_class).type(dtype))
        self.d = nn.Parameter(torch.randn(n_hidden).type(dtype))
        # U, b: hidden-to-output weight and output bias.
        self.U = nn.Parameter(torch.randn(n_hidden, n_class).type(dtype))
        self.b = nn.Parameter(torch.randn(n_class).type(dtype))

    def forward(self, X):
        """Return unnormalised next-word scores.

        Args:
            X: Word-index tensor of shape [batch_size, n_step].

        Returns:
            Tensor of shape [batch_size, n_class].
        """
        embedded = self.C(X)  # [batch_size, n_step, m]
        flat = embedded.view(-1, n_step * m)  # [batch_size, n_step * m]
        hidden_out = torch.tanh(self.d + torch.mm(flat, self.H))  # [batch_size, n_hidden]
        direct_scores = torch.mm(flat, self.W)  # [batch_size, n_class]
        hidden_scores = torch.mm(hidden_out, self.U)  # [batch_size, n_class]
        return self.b + direct_scores + hidden_scores

# Build the network, the cross-entropy objective, and an Adam optimiser over
# every parameter registered on the model.
model = NNLM()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

nn.Parameter()的作用是将该参数添加进模型中，使其能够通过model.parameters()被找到、管理并更新。更具体地说：

1. nn.Parameter()与nn.Module一起使用时会有一些特殊的属性，其会被自动加到 Module 的parameters()迭代器中
2. 使用很简单：torch.nn.Parameter(data, requires_grad=True)，其中data为tensor

# Training
for epoch in range(5000):
    # Draw mini-batches from the DataLoader; this is what binds batch_x / batch_y.
    for batch_x, batch_y in loader:
        # backward() adds into .grad, so clear the previous step's gradients first.
        optimizer.zero_grad()
        output = model(batch_x)

        # output : [batch_size, n_class], batch_y : [batch_size] (LongTensor, not one-hot)
        loss = criterion(output, batch_y)
        if (epoch + 1) % 1000 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

        loss.backward()
        optimizer.step()

# Predict
# No autograd graph is needed for inference. max over dim 1 returns
# (values, indices); the indices are the predicted word ids.
with torch.no_grad():
    predict = model(input_batch).max(1, keepdim=True)[1]  # [batch_size, 1]

# Test
# squeeze(1) removes only the keepdim axis, so a batch holding a single
# sentence still yields an iterable 1-D tensor instead of a 0-d scalar.
print([sen.split()[:n_step] for sen in sentences], '->', [number_dict[n.item()] for n in predict.squeeze(1)])

# code by Tae Hwan Jung @graykode, modify by wmathor
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data

# Legacy tensor-type alias; NNLM casts its randomly initialised parameters with .type(dtype).
dtype = torch.FloatTensor

sentences = ["i like dog", "i love coffee", "i hate milk"]

# All tokens of the corpus in reading order:
# ['i', 'like', 'dog', 'i', 'love', 'coffee', 'i', 'hate', 'milk']
word_list = " ".join(sentences).split()
# De-duplicate while keeping first-occurrence order. A plain set() would give a
# different word order (and therefore different indices) from run to run because
# string hashing is randomised per process.
word_list = list(dict.fromkeys(word_list))  # ['i', 'like', 'dog', 'love', 'coffee', 'hate', 'milk']
word_dict = {w: i for i, w in enumerate(word_list)}  # {'i':0, 'like':1, 'dog':2, 'love':3, 'coffee':4, 'hate':5, 'milk':6}
number_dict = {i: w for i, w in enumerate(word_list)}  # {0:'i', 1:'like', 2:'dog', 3:'love', 4:'coffee', 5:'hate', 6:'milk'}
n_class = len(word_dict)  # vocabulary size |V|; 7 for this corpus

# NNLM (Neural Network Language Model) hyper-parameters
n_step = len(sentences[0].split()) - 1  # n-1 in the paper: number of context words, taken from the first sentence (2 here)
n_hidden = 2  # h in the paper: hidden layer width
m = 2  # m in the paper: word embedding dimension

def make_batch(sentences, vocab=None):
    """Turn whitespace-tokenised sentences into (context, target) index lists.

    For every sentence, all words except the last become the input context and
    the last word becomes the prediction target.

    Args:
        sentences: Iterable of strings; each is split on whitespace.
        vocab: Optional word -> index mapping. When omitted, the module-level
            ``word_dict`` is used, which is the original behaviour.

    Returns:
        A tuple ``(input_batch, target_batch)``: a list of per-sentence context
        index lists and a list of target indices, in sentence order.

    Raises:
        KeyError: If a word is missing from the vocabulary.
        IndexError: If a sentence contains no words.
    """
    if vocab is None:
        vocab = word_dict

    input_batch = []
    target_batch = []

    for sen in sentences:
        words = sen.split()
        # e.g. [0, 1], [0, 3], [0, 5] for the sample corpus
        context_ids = [vocab[w] for w in words[:-1]]
        # e.g. 2, 4, 6 for the sample corpus
        target_id = vocab[words[-1]]

        input_batch.append(context_ids)
        target_batch.append(target_id)

    return input_batch, target_batch

# Convert the index lists from make_batch into int64 tensors: nn.Embedding takes
# integer indices as input and CrossEntropyLoss takes class indices as targets.
# torch.tensor(..., dtype=torch.long) replaces the legacy torch.LongTensor(data) constructor.
input_batch, target_batch = make_batch(sentences)
input_batch = torch.tensor(input_batch, dtype=torch.long)  # [[0, 1], [0, 3], [0, 5]] for the sample corpus
target_batch = torch.tensor(target_batch, dtype=torch.long)  # [2, 4, 6] for the sample corpus

# Pair every context with its target so they can be batched together.
dataset = Data.TensorDataset(input_batch, target_batch)

class NNLM(nn.Module):
    """Feed-forward neural probabilistic language model.

    Computes ``y = b + X.W + tanh(d + X.H).U`` where ``X`` is the concatenation
    of the ``n_step`` context-word embeddings. All sizes are read from the
    module-level ``n_class``, ``m``, ``n_step`` and ``n_hidden``.
    """

    def __init__(self):
        super().__init__()
        context_width = n_step * m  # length of the concatenated context vector
        # C: embedding table with one m-dimensional row per vocabulary word.
        self.C = nn.Embedding(n_class, m)
        # H, d: hidden layer weight and bias.
        self.H = nn.Parameter(torch.randn(context_width, n_hidden).type(dtype))
        # W: direct connection from the concatenated embeddings to the output layer.
        self.W = nn.Parameter(torch.randn(context_width, n_class).type(dtype))
        self.d = nn.Parameter(torch.randn(n_hidden).type(dtype))
        # U, b: hidden-to-output weight and output bias.
        self.U = nn.Parameter(torch.randn(n_hidden, n_class).type(dtype))
        self.b = nn.Parameter(torch.randn(n_class).type(dtype))

    def forward(self, X):
        """Return unnormalised next-word scores.

        Args:
            X: Word-index tensor of shape [batch_size, n_step].

        Returns:
            Tensor of shape [batch_size, n_class].
        """
        embedded = self.C(X)  # [batch_size, n_step, m]
        flat = embedded.view(-1, n_step * m)  # [batch_size, n_step * m]
        hidden_out = torch.tanh(self.d + torch.mm(flat, self.H))  # [batch_size, n_hidden]
        direct_scores = torch.mm(flat, self.W)  # [batch_size, n_class]
        hidden_scores = torch.mm(hidden_out, self.U)  # [batch_size, n_class]
        return self.b + direct_scores + hidden_scores

model = NNLM()
criterion = nn.CrossEntropyLoss()
# This listing never created an optimiser or a DataLoader although the loop
# below needs both, so they are built here.
optimizer = optim.Adam(model.parameters(), lr=1e-3)
loader = Data.DataLoader(dataset=dataset, batch_size=16, shuffle=True)

# Training
for epoch in range(5000):
    # Draw mini-batches from the DataLoader; this is what binds batch_x / batch_y.
    for batch_x, batch_y in loader:
        # backward() adds into .grad, so clear the previous step's gradients first.
        optimizer.zero_grad()
        output = model(batch_x)

        # output : [batch_size, n_class], batch_y : [batch_size] (LongTensor, not one-hot)
        loss = criterion(output, batch_y)
        if (epoch + 1) % 1000 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

        loss.backward()
        optimizer.step()

# Predict
# No autograd graph is needed for inference. max over dim 1 returns
# (values, indices); the indices are the predicted word ids.
with torch.no_grad():
    predict = model(input_batch).max(1, keepdim=True)[1]  # [batch_size, 1]

# Test
# squeeze(1) removes only the keepdim axis, so a batch holding a single
# sentence still yields an iterable 1-D tensor instead of a 0-d scalar.
print([sen.split()[:n_step] for sen in sentences], '->', [number_dict[n.item()] for n in predict.squeeze(1)])

### 参考文献

A Neural Probabilistic Language Model 论文阅读及实战

NLP-tutorial

0 条评论

• ### Seq2Seq的PyTorch实现

本文介绍一下如何使用 PyTorch 复现 Seq2Seq，实现简单的机器翻译应用，请先简单阅读论文Learning Phrase Representation...

• ### TextRNN的PyTorch实现

参考这篇论文Finding Structure in Time(1990)，如果你对RNN有一定的了解，实际上不用看，仔细看我代码如何实现即可。如果你对RNN不...

• ### BiLSTM的PyTorch应用

本文介绍一下如何使用BiLSTM（基于PyTorch）解决一个实际问题，实现给定一个长句子预测下一个单词

• ### 车载系统概要学习

In-Vehicle Infotainment 简称 IVI，车载娱乐信息系统，是集成于汽车中控台的一 台智能多媒体设备，俗称汽车导航。如今，汽车导航产品越来越...

• ### java 常规类型消息的格式化 原

0.slf4j有一个common logger没有的功能，字符串中的{}会被替换，如下：

• ### 解析|小程序电商和传统电商

从最开始小程序上线，到各种功能的赋能。经过长达一年的用户沉淀以及生态发展，电商小程序的变现能力开始备受关注。

• ### AndroidQ分区存储权限变更及适配的实现

在Android Q中引入了分区储存功能，在外部存储设备中为每个应用提供了一个“隔离存储沙盒”。其他应用无法直接访问应用的沙盒文件。由于文件是应用的私有文件，不...