# RNN在自然语言处理中的应用及其PyTorch实现

（1）The cat likes playing ball. （2）The kitty likes playing wool. （3）The dog likes playing ball. （4）The boy likes playing ball.

PyTorch 中的词嵌入通过 nn.Embedding(m, n) 模块来实现，其中 m 表示词表中单词的总数，n 表示词嵌入的维度。下面举一个例子：

# Minimal nn.Embedding demo: map a word to its integer index, then look up
# the vector stored for that index.
word_to_ix = {'hello': 0, 'world': 1}
# One row per vocabulary entry (derived from the dict so the table size and
# the vocabulary cannot drift apart); each row is a 5-dimensional vector.
embeds = nn.Embedding(len(word_to_ix), 5)
# The embedding layer is indexed with integer (Long) indices.
hello_idx = torch.LongTensor([word_to_ix['hello']])
hello_idx = Variable(hello_idx)
hello_embed = embeds(hello_idx)
print(hello_embed)

N Gram 模型

# Number of preceding words used as context when predicting the next word.
CONTEXT_SIZE = 2
# Dimensionality of each word-embedding vector.
EMBEDDING_DIM = 10
# We will use Shakespeare Sonnet 2
# The text is split on whitespace, so punctuation stays attached to words.
test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()

CONTEXT_SIZE 表示用前面多少个单词来预测当前单词，这里设置为 2，即用某个单词前面的两个单词来预测它；EMBEDDING_DIM 表示词嵌入的维数。

# Build the training pairs: each item is ((word_i, word_i+1), word_i+2),
# i.e. two consecutive context words and the word that follows them.
# Zipping the sentence with itself shifted by one and by two positions
# yields exactly len(test_sentence) - 2 items (none if it is too short).
trigram = [
    ((first, second), target)
    for first, second, target in zip(
        test_sentence, test_sentence[1:], test_sentence[2:]
    )
]

vocb = set(test_sentence)  # a set drops the repeated words
# Assign indices over the sorted vocabulary: iteration order of a set of
# strings can differ between interpreter runs, which would silently change
# the word <-> index mapping (and invalidate any saved embedding weights).
word_to_idx = {word: i for i, word in enumerate(sorted(vocb))}
# Inverse mapping, used to turn a predicted index back into a word.
idx_to_word = {i: word for word, i in word_to_idx.items()}

class NgramModel(nn.Module):
    """Feed-forward n-gram language model over word embeddings.

    The context word indices are embedded, the embeddings are flattened
    into one row vector, and two linear layers (with a ReLU in between)
    produce log-probabilities over the vocabulary.

    Args:
        vocb_size: number of distinct words (rows of the embedding table
            and size of the output layer).
        context_size: number of context words fed to ``forward``.
        n_dim: dimensionality of each word embedding.
    """

    def __init__(self, vocb_size, context_size, n_dim):
        super(NgramModel, self).__init__()
        self.n_word = vocb_size
        self.embedding = nn.Embedding(self.n_word, n_dim)
        # The flattened context has context_size * n_dim features.
        self.linear1 = nn.Linear(context_size*n_dim, 128)
        self.linear2 = nn.Linear(128, self.n_word)

    def forward(self, x):
        """Return log-probabilities of the next word.

        Args:
            x: integer tensor holding the ``context_size`` word indices of
                a single example.

        Returns:
            Tensor of shape ``(1, vocb_size)`` with log-probabilities.
        """
        emb = self.embedding(x)
        # Concatenate all context embeddings into a single row; this is
        # why the model handles exactly one example per call.
        emb = emb.view(1, -1)
        out = self.linear1(emb)
        out = F.relu(out)
        out = self.linear2(out)
        # Normalise over the vocabulary axis explicitly; relying on the
        # implicit dimension is deprecated.
        log_prob = F.log_softmax(out, dim=1)
        return log_prob

# Sanity-check the trained model on one training example.
word, label = trigram[3]
# Convert the two context words into their vocabulary indices.
word = Variable(torch.LongTensor([word_to_idx[i] for i in word]))
out = ngrammodel(word)
# Index of the highest log-probability along the vocabulary axis.
_, predict_label = torch.max(out, 1)
# Flatten before indexing so this works whether torch.max keeps the reduced
# dimension (shape (1, 1)) or drops it (shape (1,)); int() yields a plain
# Python int usable as a dict key.
predict_word = idx_to_word[int(predict_label.data.view(-1)[0])]
print('real word is {}, predict word is {}'.format(label, predict_word))

• 基本原理

• 字符增强

# Toy part-of-speech corpus: each entry pairs a tokenised sentence with one
# tag per token (DET = determiner, NN = noun, V = verb).
training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"]),
]

# Build the word and tag vocabularies: every new word / tag gets the next
# free integer index, in order of first appearance in training_data.
word_to_idx = {}
tag_to_idx = {}
for context, tag in training_data:
    for word in context:
        # setdefault leaves existing entries alone; len() is evaluated
        # before insertion, so new entries get consecutive indices.
        word_to_idx.setdefault(word, len(word_to_idx))
    for label in tag:
        tag_to_idx.setdefault(label, len(tag_to_idx))

# Character vocabulary: lowercase ASCII letters only, indexed 0..25.
alphabet = 'abcdefghijklmnopqrstuvwxyz'
character_to_idx = {letter: i for i, letter in enumerate(alphabet)}

class CharLSTM(nn.Module):
    """Character-level LSTM that summarises a word as its final hidden state.

    Args:
        n_char: size of the character vocabulary.
        char_dim: dimensionality of each character embedding.
        char_hidden: hidden size of the LSTM (size of the returned feature).
    """

    def __init__(self, n_char, char_dim, char_hidden):
        super(CharLSTM, self).__init__()
        self.char_embedding = nn.Embedding(n_char, char_dim)
        self.char_lstm = nn.LSTM(char_dim, char_hidden, batch_first=True)

    def forward(self, x):
        """Return the LSTM's final hidden state for a batch of words.

        Args:
            x: integer tensor of character indices, batch dimension first.

        Returns:
            The final hidden state ``h_n`` of the LSTM; for this
            single-layer, unidirectional LSTM its shape is
            ``(1, batch, char_hidden)``.
        """
        x = self.char_embedding(x)
        # nn.LSTM returns (output, (h_n, c_n)); only h_n is needed here.
        _, (h_n, _c_n) = self.char_lstm(x)
        return h_n

class LSTMTagger(nn.Module):
    """Sequence tagger combining word embeddings with character features.

    Each word is represented by its word embedding concatenated with the
    final hidden state of a character-level LSTM run over its letters; a
    word-level LSTM followed by a linear layer then scores every tag.

    Args:
        n_word: size of the word vocabulary.
        n_char: size of the character vocabulary.
        char_dim: dimensionality of each character embedding.
        n_dim: dimensionality of each word embedding.
        char_hidden: hidden size of the character LSTM.
        n_hidden: hidden size of the word-level LSTM.
        n_tag: number of output tags.
    """

    def __init__(self, n_word, n_char, char_dim, n_dim, char_hidden,
                 n_hidden, n_tag):
        super(LSTMTagger, self).__init__()
        self.word_embedding = nn.Embedding(n_word, n_dim)
        self.char_lstm = CharLSTM(n_char, char_dim, char_hidden)
        # Each time step sees the word embedding plus the character feature.
        self.lstm = nn.LSTM(n_dim+char_hidden, n_hidden, batch_first=True)
        self.linear1 = nn.Linear(n_hidden, n_tag)

    def forward(self, x, word_data):
        """Return per-word tag log-probabilities for one sentence.

        Args:
            x: integer tensor with one word index per word of the sentence.
            word_data: the same sentence as an iterable of word strings,
                used to build the character-level features.

        Returns:
            Tensor of shape ``(number of words, n_tag)`` holding
            log-probabilities over the tags.

        Raises:
            KeyError: if a (lower-cased) character of a word is missing
                from ``character_to_idx``.
        """
        char_feats = []
        for each in word_data:
            letter_ids = [character_to_idx[letter.lower()] for letter in each]
            # Add a batch dimension of 1 for the character LSTM.
            letter_ids = torch.LongTensor(letter_ids).unsqueeze(0)
            # Follow the device of the word indices instead of forcing
            # CUDA, so the model also runs on CPU-only machines.
            if x.is_cuda:
                letter_ids = letter_ids.cuda(x.get_device())
            tempchar = self.char_lstm(Variable(letter_ids))
            # (1, 1, char_hidden) -> (1, char_hidden). The feature is kept
            # in the autograd graph (no .data / .cpu() round trip) so that
            # the character LSTM actually receives gradients.
            char_feats.append(tempchar.squeeze(0))
        # Stack the per-word features: (number of words, char_hidden).
        char = torch.cat(char_feats, 0)
        x = self.word_embedding(x)
        # Concatenate word and character features along the feature axis.
        x = torch.cat((x, char), 1)
        # The word-level LSTM is batch_first: add a batch dimension of 1,
        # then remove it again after the pass.
        x = x.unsqueeze(0)
        x, _ = self.lstm(x)
        x = x.squeeze(0)
        x = self.linear1(x)
        # Normalise over the tag axis explicitly; relying on the implicit
        # dimension is deprecated.
        y = F.log_softmax(x, dim=1)
        return y

952 篇文章111 人订阅

0 条评论

## 相关文章

6145

3967

2135

### LSTM生成尼采风格文章

github地址 使用循环神经网络生成序列文本数据。循环神经网络可以用来生成音乐、图像作品、语音、对话系统对话等等。

2444

3656

913

3127

971