I am trying to convert the new translation model developed by Facebook (Meta), No Language Left Behind (NLLB), into an AWS Neuron model that can be used for AWS SageMaker inference on Inferentia chips. However, I cannot figure out how to trace the model without errors. This post shows exactly what I want to do, and it works for the AWS developers. For clarity, I am copying the code here as well:
import copy
import itertools
from typing import List, Optional, Tuple

import torch
import torch.nn.functional as F
import torch_neuron
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, M2M100Config
from transformers.generation_utils import GenerationMixin
def _convert_past_list_to_tuple(past_key_values):
    """
    In the Bart model, the type of past_key_values is tuple(tuple(torch.FloatTensor)), which is not
    TorchScript-compatible. To support this, we have to convert it during the export process.
    This function converts past values from a list to tuple(tuple(torch.FloatTensor)) for
    the inner decoder.

    According to the definition of past_key_values, each inner tuple(torch.FloatTensor) has 4 tensors,
    so we convert every 4 elements in the list into a tuple(torch.FloatTensor).
    """
    count_of_each_inner_tuple = 4
    results = ()
    temp_result = ()
    count_n = len(past_key_values) // count_of_each_inner_tuple
    for idx in range(count_n):
        real_idx = idx * count_of_each_inner_tuple
        temp_result = tuple(past_key_values[real_idx : real_idx + count_of_each_inner_tuple])
        results += (temp_result,)
    return results
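
# Illustrative sanity check (the shapes are my own assumption, not from the original
# recipe): a 2-layer decoder yields a flat list of 8 cache tensors, which this helper
# regroups into 2 inner tuples of 4 tensors each.
#   flat = [torch.zeros(1, 16, 8, 64) for _ in range(8)]
#   grouped = _convert_past_list_to_tuple(flat)
#   assert len(grouped) == 2 and len(grouped[0]) == 4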
class EncoderForONNX(torch.nn.Module):
    def __init__(self, encoder):
        super().__init__()
        self.encoder = encoder

    def forward(self, input_ids, attention_mask):
        return self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )
class DecoderForONNX(torch.nn.Module):
    def __init__(self, decoder):
        super().__init__()
        self.decoder = decoder

    def forward(self, input_ids, encoder_state, attention_mask, past=None):
        all_results = None
        if past is not None:
            all_results = _convert_past_list_to_tuple(past)
            input_ids = input_ids[:, -1:]
        last_hidden_state, past_key_values = self.decoder(
            input_ids=input_ids,
            encoder_hidden_states=encoder_state,
            encoder_attention_mask=attention_mask,
            past_key_values=all_results,
            return_dict=False,
        )
        past_values = []
        for past in past_key_values:
            past_values = past_values + list(past)
        return last_hidden_state, past_values
def _create_traced_encoder(encoder, input_ids, attention_mask):
    encoder_c = copy.deepcopy(encoder)
    print("shapes", input_ids.shape, attention_mask.shape)
    encoder_for_onnx = EncoderForONNX(encoder_c)
    compiler_args = ['--fp32-cast', 'matmult', '--fast-math', 'no-fast-relayout']
    inputs = (
        input_ids,
        attention_mask,
    )
    return torch_neuron.trace(encoder_for_onnx, inputs, compiler_args=compiler_args)
def _create_traced_decoder(decoder, input_ids, encoder_state, attention_mask, past=None):
    decoder_c = copy.deepcopy(decoder)
    print(input_ids.shape, encoder_state.shape, attention_mask.shape)
    decoder_for_onnx = DecoderForONNX(decoder_c)
    past_values = list(itertools.chain.from_iterable(past or ()))
    compiler_args = ['--fp32-cast', 'matmult', '--fast-math', 'no-fast-relayout']
    print(past_values)
    # Trace this twice so we get 2 different decoders for further work.
    if past_values:
        inputs = (
            input_ids,
            encoder_state,
            attention_mask,
            past_values,
        )
    else:
        inputs = (
            input_ids,
            encoder_state,
            attention_mask,
        )
    return torch_neuron.trace(decoder_for_onnx, inputs, compiler_args=compiler_args)
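
# Editorial note, not from the original recipe: traced graphs are specialized to the
# arity and shapes of their example inputs, which is why two traces are needed.
# `decoder_no_past` takes (input_ids, encoder_state, attention_mask) and runs the full
# prefix; `decoder_with_past` additionally takes the flattened KV-cache list, and
# DecoderForONNX.forward then feeds only the last generated token to the decoder.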
class M2M100ConfigTS(M2M100Config, torch.nn.Module):
    """
    M2M100ConfigTS is a TorchScript-compatible transformers.models.m2m_100.configuration_m2m_100.M2M100Config.
    TorchScript only supports subclasses of torch.nn.Module.
    """

    def __init__(self, config):
        M2M100Config.__init__(self, config)
        torch.nn.Module.__init__(self)
class MinLengthLogitsProcessorTS(torch.nn.Module):
    r"""
    :class:`transformers.LogitsProcessor` enforcing a min-length by setting the EOS probability to 0.

    Args:
        min_length (:obj:`int`):
            The minimum length below which the score of :obj:`eos_token_id` is set to :obj:`-float("Inf")`.
        eos_token_id (:obj:`int`):
            The id of the `end-of-sequence` token.
    """

    def __init__(self, min_length: int, eos_token_id: int):
        super().__init__()
        if not isinstance(min_length, int) or min_length < 0:
            raise ValueError(f"`min_length` has to be a non-negative integer, but is {min_length}")
        if not isinstance(eos_token_id, int) or eos_token_id < 0:
            raise ValueError(f"`eos_token_id` has to be a non-negative integer, but is {eos_token_id}")
        self.min_length = min_length
        self.eos_token_id = eos_token_id

    def forward(self, input_ids, scores) -> torch.Tensor:
        cur_len = input_ids.shape[-1]
        if cur_len < self.min_length:
            scores[:, self.eos_token_id] = -float("inf")
        return scores
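
# Illustrative usage (the vocab size and token ids are placeholders I picked, not
# values from the model): with min_length=2 and a length-1 prefix, EOS gets masked.
#   proc = MinLengthLogitsProcessorTS(min_length=2, eos_token_id=2)
#   scores = proc(torch.tensor([[0]]), torch.randn(1, 1000))
#   assert scores[0, 2] == float("-inf")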
class NLLBGenerator(torch.nn.Module, GenerationMixin):
    def __init__(self, model):
        super().__init__()
        self.config = M2M100ConfigTS(model.config)
        self.config.force_bos_token_to_be_generated = False
        self._trace_modules(model)
        self.logits_processor = MinLengthLogitsProcessorTS(self.config.min_length, self.config.eos_token_id)
        self.final_logits_weight = model.model.shared.weight
        self.final_logits_bias = model.final_logits_bias
        self.decoder_layers = model.config.decoder_layers
        self.d_model = model.config.d_model

    def _trace_modules(self, model):
        # input_ids = torch.tensor(
        #     [
        #         [
        #             19,669,18,420,8,664,57,42,8,664,21,3028,195,4445,331,1293,34,21,10,6174,1100,6,69,104,42,32,2621,1638,144,4,6174,558,108,4419,1091,28,4,1668,9,1509,1621,279,35,867,2734,85,11,2216,2734,85,203,2244,7,6,15,8102,7,57,8629,5,
        #             model.config.eos_token_id,
        #         ]
        #     ],
        #     device=model.device,
        #     dtype=torch.long,
        # )
        # attention_mask = torch.tensor(
        #     [[True] * input_ids.shape[-1]],
        #     device=model.device,
        #     dtype=torch.bool,
        # )
        pegasus_text = "PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires."
        model_name = "sshleifer/distilbart-cnn-12-6"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        inputs = tokenizer(pegasus_text, return_tensors="pt", max_length=32, truncation=True, padding='max_length')
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]
        self.encoder = _create_traced_encoder(model.get_encoder(), input_ids, attention_mask)
        encoder_outputs = model.get_encoder()(input_ids, attention_mask=attention_mask, return_dict=True)
        decoder = model.model.decoder
        decoder_outputs = decoder(input_ids, attention_mask, encoder_outputs["last_hidden_state"], None, None, None)
        # print(decoder_outputs[1])
        # print(decoder_outputs[1].shape)
        self.decoder_no_past = _create_traced_decoder(
            model.model.decoder, input_ids, encoder_outputs["last_hidden_state"], attention_mask
        )
        self.decoder_with_past = _create_traced_decoder(
            model.model.decoder, input_ids, encoder_outputs["last_hidden_state"], attention_mask, decoder_outputs[1]
        )
    def _encoder_forward(self, input_ids, attention_mask):
        return self.encoder(input_ids, attention_mask)[0]

    @staticmethod
    def _init_sequence_length_for_generation(
        input_ids: torch.LongTensor, max_length: int
    ) -> Tuple[torch.Tensor, torch.Tensor, int]:
        unfinished_sequences = torch.zeros(input_ids.shape[0], dtype=torch.long, device=input_ids.device) + 1
        sequence_lengths = torch.zeros(input_ids.shape[0], dtype=torch.long, device=input_ids.device) + max_length
        cur_len = input_ids.shape[-1]
        return sequence_lengths, unfinished_sequences, cur_len

    def _decoder_forward(self, input_ids, encoder_output, attention_mask, past: List[torch.Tensor]):
        # Use a different traced decoder depending on whether past values exist.
        if past is None or len(past) == 0:
            decoder_output, past = self.decoder_no_past(
                input_ids=input_ids, encoder_state=encoder_output, attention_mask=attention_mask
            )
        else:
            decoder_output, past = self.decoder_with_past(
                input_ids=input_ids, encoder_state=encoder_output, attention_mask=attention_mask, past=past
            )
        lm_logits = F.linear(decoder_output, self.final_logits_weight, bias=self.final_logits_bias)
        return lm_logits, past
    def greedy_search(
        self, input_ids, encoder_output, attention_mask, max_length, pad_token_id: int, eos_token_id: int
    ):
        # init sequence length tensors
        sequence_lengths, unfinished_sequences, cur_len = self._init_sequence_length_for_generation(
            input_ids, max_length
        )
        past: List[torch.Tensor] = []
        while cur_len < max_length:
            logits, past = self._decoder_forward(input_ids, encoder_output, attention_mask, past)
            next_token_logits = logits[:, -1, :]
            # pre-process distribution
            scores = self.logits_processor(input_ids, next_token_logits)
            # argmax
            next_tokens = torch.argmax(scores, dim=-1)
            # replace next_tokens with pad tokens for already-finished sequences
            if eos_token_id is not None:
                assert pad_token_id is not None, "If eos_token_id is defined, make sure that pad_token_id is defined."
                next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
            # add token and increase length by one
            input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
            # update sequence length
            if eos_token_id is not None:
                sequence_lengths, unfinished_sequences = self._update_seq_length_for_generation(
                    sequence_lengths, unfinished_sequences, cur_len, next_tokens == eos_token_id
                )
            # stop when there is a </s> in each sentence, or if we exceed the maximum length
            if unfinished_sequences.max() == 0:
                break
            # increase cur_len
            cur_len = cur_len + 1
        return input_ids
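
    # Editorial addition: `_update_seq_length_for_generation` is called in
    # greedy_search above but was missing from the posted snippet. This sketch
    # restores it from the Hugging Face generation utilities that the original
    # Bart-to-ONNX recipe is based on (an assumption, not the poster's code).
    @staticmethod
    def _update_seq_length_for_generation(
        sequence_lengths, unfinished_sequences, cur_len: int, is_eos_in_next_token
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        # sequences that were unfinished and just emitted EOS finish at cur_len + 1
        is_sent_unfinished = unfinished_sequences.mul(is_eos_in_next_token.long()).bool()
        sequence_lengths = sequence_lengths.masked_fill(is_sent_unfinished, cur_len + 1)
        # mark sequences that emitted EOS as finished
        unfinished_sequences = unfinished_sequences.mul((~is_eos_in_next_token).long())
        return sequence_lengths, unfinished_sequences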
    def _prepare_decoder_input_ids_for_generation(
        self,
        input_ids: torch.LongTensor,
        decoder_start_token_id,
        bos_token_id: Optional[int] = None,
    ) -> torch.LongTensor:
        decoder_input_ids = (
            torch.ones((input_ids.shape[0], 1), dtype=input_ids.dtype, device=input_ids.device)
            * decoder_start_token_id
        )
        return decoder_input_ids

    def forward(self, input_ids, attention_mask, max_length, decoder_start_token_id):
        pad_token_id = self.config.pad_token_id
        bos_token_id = self.config.bos_token_id
        eos_token_id = self.config.eos_token_id
        # special case if pad_token_id is not defined
        if pad_token_id is None and eos_token_id is not None:
            # set `pad_token_id` to `eos_token_id` for open-ended generation
            pad_token_id = eos_token_id
        encoder_output = self._encoder_forward(input_ids, attention_mask)
        input_ids = self._prepare_decoder_input_ids_for_generation(
            input_ids,
            decoder_start_token_id=decoder_start_token_id,
            bos_token_id=bos_token_id,
        )
        return self.greedy_search(
            input_ids,
            encoder_output,
            attention_mask,
            max_length=max_length,
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id,
        )
# Load the pretrained NLLB checkpoint and trace it for Neuron.
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-3.3B")
neuron_model = NLLBGenerator(model)
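
For reference, once the trace succeeds the traced generator would be called along these lines (a sketch only; the sample text, language codes, max_length, and the convention of passing the target-language token as decoder_start_token_id are illustrative assumptions, not part of the original code):

tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-3.3B", src_lang="eng_Latn")
batch = tokenizer("The weather is nice today.", return_tensors="pt", max_length=32, truncation=True, padding="max_length")
output_ids = neuron_model(
    batch["input_ids"],
    batch["attention_mask"],
    max_length=64,
    decoder_start_token_id=tokenizer.lang_code_to_id["fra_Latn"],  # assumed target-language convention
)
print(tokenizer.batch_decode(output_ids, skip_special_tokens=True))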
The error I am currently getting is:
/home/ubuntu/anaconda3/envs/aws_neuron_pytorch_p37/lib/python3.7/site-packages/transformers/models/m2m_100/modeling_m2m_100.py:326: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
INFO:Neuron:There are 1 ops of 1 different types in the TorchScript that are not compiled by neuron-cc: aten::embedding, (For more information see https://github.com/aws/aws-neuron-sdk/blob/master/release-notes/neuron-cc-ops/neuron-cc-ops-pytorch.md)
INFO:Neuron:Number of arithmetic operators (pre-compilation) before = 1479, fused = 1456, percent fused = 98.44%
INFO:Neuron:Number of neuron graph operations 3581 did not match traced graph 3283 - using heuristic matching of hierarchical information
WARNING:Neuron:torch.neuron.trace failed on _NeuronGraph$1631; falling back to native python function call
ERROR:Neuron:Error parsing message with type 'tensorflow.GraphDef'
Traceback (most recent call last):
File "/home/ubuntu/anaconda3/envs/aws_neuron_pytorch_p37/lib/python3.7/site-packages/torch_neuron/convert.py", line 382, in op_converter
item, inputs, compiler_workdir=sg_workdir, **kwargs)
File "/home/ubuntu/anaconda3/envs/aws_neuron_pytorch_p37/lib/python3.7/site-packages/torch_neuron/decorators.py", line 82, in trace
graph_def = graph.as_graph_def()
File "/home/ubuntu/anaconda3/envs/aws_neuron_pytorch_p37/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py", line 3238, in as_graph_def
result, _ = self._as_graph_def(from_version, add_shapes)
File "/home/ubuntu/anaconda3/envs/aws_neuron_pytorch_p37/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py", line 3166, in _as_graph_def
graph.ParseFromString(compat.as_bytes(data))
google.protobuf.message.DecodeError: Error parsing message with type 'tensorflow.GraphDef'
INFO:Neuron:Number of arithmetic operators (post-compilation) before = 1479, compiled = 0, percent compiled = 0.0%
INFO:Neuron:The neuron partitioner created 1 sub-graphs
INFO:Neuron:Neuron successfully compiled 0 sub-graphs, Total fused subgraphs = 1, Percent of model sub-graphs successfully compiled = 0.0%
INFO:Neuron:Compiled these operators (and operator counts) to Neuron:
INFO:Neuron:Not compiled operators (and operator counts) to Neuron:
INFO:Neuron: => aten::Int: 414 [supported]
INFO:Neuron: => aten::add: 75 [supported]
INFO:Neuron: => aten::bmm: 48 [supported]
INFO:Neuron: => aten::contiguous: 72 [supported]
INFO:Neuron: => aten::cumsum: 1 [supported]
INFO:Neuron: => aten::detach: 1 [supported]
INFO:Neuron: => aten::dropout: 97 [supported]
INFO:Neuron: => aten::embedding: 1 [not supported]
INFO:Neuron: => aten::expand: 1 [supported]
INFO:Neuron: => aten::index_select: 1 [supported]
INFO:Neuron: => aten::layer_norm: 49 [supported]
INFO:Neuron: => aten::linear: 144 [supported]
INFO:Neuron: => aten::masked_fill: 1 [supported]
INFO:Neuron: => aten::mul: 74 [supported]
INFO:Neuron: => aten::ne: 1 [supported]
INFO:Neuron: => aten::relu: 24 [supported]
INFO:Neuron: => aten::reshape: 24 [supported]
INFO:Neuron: => aten::rsub: 1 [supported]
INFO:Neuron: => aten::size: 77 [supported]
INFO:Neuron: => aten::slice: 2 [supported]
INFO:Neuron: => aten::softmax: 24 [supported]
INFO:Neuron: => aten::to: 5 [supported]
INFO:Neuron: => aten::transpose: 120 [supported]
INFO:Neuron: => aten::type_as: 1 [supported]
INFO:Neuron: => aten::unsqueeze: 2 [supported]
INFO:Neuron: => aten::view: 219 [supported]
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
/tmp/ipykernel_4519/3952284984.py in <module>
314
315
--> 316 neuron_model = NLLBGenerator(model)
/tmp/ipykernel_4519/3952284984.py in __init__(self, model)
154 self.config = M2M100ConfigTS(model.config)
155 self.config.force_bos_token_to_be_generated = False
--> 156 self._trace_modules(model)
157 self.logits_processor = MinLengthLogitsProcessorTS(self.config.min_length, self.config.eos_token_id)
158 self.final_logits_weight = model.model.shared.weight
/tmp/ipykernel_4519/3952284984.py in _trace_modules(self, model)
185 attention_mask = inputs["attention_mask"]
186
--> 187 self.encoder = _create_traced_encoder(model.get_encoder(), input_ids, attention_mask)
188 encoder_outputs = model.get_encoder()(input_ids, attention_mask=attention_mask, return_dict=True)
189 decoder = model.model.decoder
/tmp/ipykernel_4519/3952284984.py in _create_traced_encoder(encoder, input_ids, attention_mask)
80 )
81
---> 82 return torch_neuron.trace(encoder_for_onnx, inputs,compiler_args=compiler_args)
83
84
~/anaconda3/envs/aws_neuron_pytorch_p37/lib/python3.7/site-packages/torch_neuron/convert.py in trace(func, example_inputs, fallback, op_whitelist, minimum_segment_size, subgraph_builder_function, subgraph_inputs_pruning, skip_compiler, debug_must_trace, allow_no_ops_on_neuron, compiler_workdir, dynamic_batch_size, compiler_timeout, _neuron_trace, compiler_args, optimizations, verbose, **kwargs)
182 logger.debug("skip_inference_context - trace with fallback at {}".format(get_file_and_line()))
183 neuron_graph = cu.compile_fused_operators(neuron_graph, **compile_kwargs)
--> 184 cu.stats_post_compiler(neuron_graph)
185
186 # Wrap the compiled version of the model in a script module. Note that this is
~/anaconda3/envs/aws_neuron_pytorch_p37/lib/python3.7/site-packages/torch_neuron/convert.py in stats_post_compiler(self, neuron_graph)
491 if succesful_compilations == 0 and not self.allow_no_ops_on_neuron:
492 raise RuntimeError(
--> 493 "No operations were successfully partitioned and compiled to neuron for this model - aborting trace!")
494
495 if percent_operations_compiled < 50.0:
RuntimeError: No operations were successfully partitioned and compiled to neuron for this model - aborting trace!
Any help would be greatly appreciated.
Answered on 2022-10-03 20:15:39
A response to your question has been posted on the original GitHub issue: https://github.com/aws-neuron/aws-neuron-sdk/issues/420#issuecomment-1220885577
-Taylor
https://stackoverflow.com/questions/73462205