I'm creating a captcha image recognition system. It first uses ResNet to extract features from the image, then uses an LSTM to recognize the words and letters in the image. A fully connected layer is supposed to connect the two. I've never designed an LSTM model before and am very new to machine learning, so I'm pretty confused and overwhelmed by this.
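For reference, the feature-extraction step I have in mind is roughly this (just a sketch, assuming a recent torchvision and a ResNet-18 with its classification head removed; the sizes are made up):

import torch
from torch import nn
from torchvision import models

# Assumed setup: a pretrained ResNet-18 with its final fc layer dropped,
# so it outputs one 512-dim feature vector per image.
resnet = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
backbone = nn.Sequential(*list(resnet.children())[:-1])
with torch.no_grad():
    feats = backbone(torch.randn(2, 3, 224, 224)).flatten(1)  # shape: (2, 512)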
I'm confused enough that I'm not even totally sure what questions I should be asking. But a couple of things stand out to me:

Am I using the LSTM cells correctly in the forward loop?

In general, any suggestions for directions to look into would be very much appreciated.
Here's what I have so far:
class LSTM(nn.Module):
    def __init__(self, cnn_dim, hidden_size, vocab_size, num_layers=1):
        super(LSTM, self).__init__()
        self.cnn_dim = cnn_dim  # i think this is the input size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size  # i think this should be the output size

        # Building your LSTM cell
        self.lstm_cell = nn.LSTMCell(input_size=self.vocab_size, hidden_size=hidden_size)

        '''Connect CNN model to LSTM model'''
        # output fully connected layer
        # CNN does not necessarily need the FC layers; in this example it is just extracting the features, which get sent to the LSTM that does the actual processing of the features
        self.fc_in = nn.Linear(cnn_dim, vocab_size)  # this takes the features from the CNN  # cnn_dim = 512, hidden_size = 128
        self.fc_out = nn.Linear(hidden_size, vocab_size)  # this is the looper in the LSTM  # I think this is correct?

        # embedding layer
        self.embed = nn.Embedding(num_embeddings=self.vocab_size, embedding_dim=self.vocab_size)

        # activations
        self.softmax = nn.Softmax(dim=1)

    def forward(self, features, captions):
        # features: extracted features from ResNet
        # captions: labels of the images
        batch_size = features.size(0)
        cnn_dim = features.size(1)

        hidden_state = torch.zeros((batch_size, self.hidden_size)).cuda()  # initialize hidden state with zeros
        cell_state = torch.zeros((batch_size, self.hidden_size)).cuda()  # initialize cell state with zeros
        outputs = torch.empty((batch_size, captions.size(1), self.vocab_size)).cuda()
        captions_embed = self.embed(captions)

        '''Design LSTM model for captcha image recognition'''
        # Pass the caption word by word for each time step
        # It receives an input (x), makes an output (y), and receives this output as an input again recurrently
        '''Defined hidden state, cell state, outputs, embedded captions'''
        # can be designed to be word by word or character by character
        for t in range(captions).size(1):
            # for the first time step the input is the feature vector
            if t == 0:
                # probably have to get the output from the ResNet layer
                # use the LSTM cells in here i presume
                x = self.fc_in(features)
                hidden_state, cell_state = self.lstm_cell(x[t], (hidden_state, cell_state))
                x = self.fc_out(hidden_state)
                outputs.append(hidden_state)
            # for the 2nd+ time steps
            else:
                hidden_state, cell_state = self.lstm_cell(x[t], (hidden_state, cell_state))
                x = self.fc_out(hidden_state)
                outputs.append(hidden_state)

        # build the output tensor
        outputs = torch.stack(outputs, dim=0)
        return outputs
Answered on 2022-06-11 02:09:23
nn.Embedding() is usually used to turn a sparse one-hot vector into a dense vector (e.g. mapping 'a' to something like [0.1, 0.2, ...]) so that it is practical to compute with. I don't understand why you are trying to embed the captions, which look like the ground truth. If you want to compute a loss with them, try nn.CTCLoss().
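For example, here is a minimal sketch of what nn.Embedding() does and of the inputs nn.CTCLoss() expects (all sizes here are made up; index 0 is assumed to be the CTC blank):

import torch
from torch import nn

# nn.Embedding: integer symbol indices -> dense vectors
embed = nn.Embedding(num_embeddings=10, embedding_dim=8)  # 10 symbols -> 8-dim dense vectors
dense = embed(torch.tensor([[1, 4, 2]]))                  # (1, 3) indices -> (1, 3, 8) tensor

# nn.CTCLoss: align unsegmented per-step predictions with a label sequence
T, N, C = 12, 2, 11  # time steps, batch size, classes (10 symbols + CTC blank at index 0)
log_probs = torch.randn(T, N, C).log_softmax(2)          # per-step log-probabilities from the model
targets = torch.randint(1, C, (N, 5), dtype=torch.long)  # ground-truth symbol indices (no blanks)
input_lengths = torch.full((N,), T, dtype=torch.long)    # length of each prediction sequence
target_lengths = torch.full((N,), 5, dtype=torch.long)   # length of each label sequence

ctc = nn.CTCLoss(blank=0)
loss = ctc(log_probs, targets, input_lengths, target_lengths)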
If you are going to feed a string into an LSTM, it is recommended to first embed the characters of the string with nn.Embedding(), which makes them dense and practical to compute with. But if the input of the LSTM is something extracted by a CNN (or another module), it is in my opinion already dense and computationally practical, and there is no need to project it with fc_in.

I would also recommend nn.LSTM() over nn.LSTMCell(), since the latter computes only a single time step and you have to write the loop yourself. There were some bugs in your code, and I fixed them:
import torch
from torch import nn


class LSTM(nn.Module):
    def __init__(self, cnn_dim, hidden_size, vocab_size, num_layers=1):
        super(LSTM, self).__init__()
        self.cnn_dim = cnn_dim  # i think this is the input size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size  # i think this should be the output size

        # Building your LSTM cell
        self.lstm_cell = nn.LSTMCell(input_size=self.vocab_size, hidden_size=hidden_size)

        '''Connect CNN model to LSTM model'''
        # output fully connected layer
        # CNN does not necessarily need the FC layers; in this example it is just extracting the features, which get sent to the LSTM that does the actual processing of the features
        self.fc_in = nn.Linear(cnn_dim, vocab_size)  # this takes the features from the CNN  # cnn_dim = 512, hidden_size = 128
        self.fc_out = nn.Linear(hidden_size, vocab_size)  # this is the looper in the LSTM  # I think this is correct?

        # embedding layer
        self.embed = nn.Embedding(num_embeddings=self.vocab_size, embedding_dim=self.vocab_size)

        # activations
        self.softmax = nn.Softmax(dim=1)

    def forward(self, features, captions):
        # features: extracted features from ResNet
        # captions: labels of the images
        batch_size = features.size(0)
        cnn_dim = features.size(1)

        hidden_state = torch.zeros((batch_size, self.hidden_size)).cuda()  # initialize hidden state with zeros
        cell_state = torch.zeros((batch_size, self.hidden_size)).cuda()  # initialize cell state with zeros
        # outputs = torch.empty((batch_size, captions.size(1), self.vocab_size)).cuda()
        outputs = torch.Tensor([]).cuda()
        captions_embed = self.embed(captions)

        '''Design LSTM model for captcha image recognition'''
        # Pass the caption word by word for each time step
        # It receives an input (x), makes an output (y), and receives this output as an input again recurrently
        '''Defined hidden state, cell state, outputs, embedded captions'''
        # can be designed to be word by word or character by character
        # for t in range(captions).size(1):
        for t in range(captions.size(1)):
            # for the first time step the input is the feature vector
            if t == 0:
                x = self.fc_in(features)
                # hidden_state, cell_state = self.lstm_cell(x[t], (hidden_state, cell_state))
                hidden_state, cell_state = self.lstm_cell(x, (hidden_state, cell_state))
                x = self.fc_out(hidden_state)
                # outputs.append(hidden_state)
                outputs = torch.cat([outputs, hidden_state])
            # for the 2nd+ time steps
            else:
                # hidden_state, cell_state = self.lstm_cell(x[t], (hidden_state, cell_state))
                hidden_state, cell_state = self.lstm_cell(x, (hidden_state, cell_state))
                x = self.fc_out(hidden_state)
                # outputs.append(hidden_state)
                outputs = torch.cat([outputs, hidden_state])

        # build the output tensor
        # outputs = torch.stack(outputs, dim=0)
        return outputs


m = LSTM(16, 32, 10)
m = m.cuda()

features = torch.randn((2, 16))
features = features.cuda()

captions = torch.randn((2, 10))
captions = torch.clip(captions, 0, 9)
captions = captions.long()
captions = captions.cuda()

m(features, captions)
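As a side note, here is a rough sketch of what the same model could look like with nn.LSTM() instead of the manual nn.LSTMCell() loop (class name and sizes are just for illustration):

import torch
from torch import nn

class CaptchaLSTM(nn.Module):
    def __init__(self, cnn_dim=16, hidden_size=32, vocab_size=10):
        super().__init__()
        # nn.LSTM runs the whole sequence in one call; batch_first=True
        # means inputs are shaped (batch, seq_len, input_size)
        self.lstm = nn.LSTM(input_size=cnn_dim, hidden_size=hidden_size, batch_first=True)
        self.fc_out = nn.Linear(hidden_size, vocab_size)

    def forward(self, features, seq_len):
        # features: (batch, cnn_dim) from the CNN; feed it at every time step
        x = features.unsqueeze(1).repeat(1, seq_len, 1)  # (batch, seq_len, cnn_dim)
        out, _ = self.lstm(x)                            # (batch, seq_len, hidden_size)
        return self.fc_out(out)                          # (batch, seq_len, vocab_size)

m = CaptchaLSTM()
logits = m(torch.randn(2, 16), seq_len=5)  # -> torch.Size([2, 5, 10])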
This paper may be helpful to you: https://arxiv.org/abs/1904.01906
https://stackoverflow.com/questions/72569340