
Converting facebook/nllb-200-3.3B to AWS Neuron

Stack Overflow user
Asked on 2022-08-23 16:29:58
1 answer · 186 views · 0 following · 0 votes

I am trying to convert the new translation model developed by Facebook (Meta), No Language Left Behind (NLLB), to an AWS Neuron model that can be used with AWS SageMaker inference on Inferentia chips. However, I cannot figure out how to trace the model without errors. This post shows exactly what I want to do, and it worked for the AWS developers. For clarity, I am copying the code here as well:

import copy
import itertools
from typing import List, Optional, Tuple

import torch
import torch.nn.functional as F

from transformers import M2M100Config
from transformers.generation_utils import GenerationMixin


def _convert_past_list_to_tuple(past_key_values):
    """
    In Bart model, the type of past_key_values is tuple(tuple(torch.FloatTensor)) which is not
    TorchScript-compatible. To support this, we have to convert it during the export process.
    This function will convert past values from a list to tuple(tuple(torch.FloatTensor)) for
    the inner decoder.

    According to the definition of past_key_values, each inner tuple(torch.FloatTensor) has 4 tensors,
    so we convert every 4 elements in the list as a tuple(torch.FloatTensor).
    """
    count_of_each_inner_tuple = 4
    results = ()
    temp_result = ()
    count_n = len(past_key_values) // count_of_each_inner_tuple
    for idx in range(count_n):
        real_idx = idx * count_of_each_inner_tuple
        temp_result = tuple(past_key_values[real_idx : real_idx + count_of_each_inner_tuple])
        results += (temp_result,)

    return results
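
# Illustration (added for clarity; not in the original post): for a 2-layer decoder
# the flat list holds 8 tensors [k0, v0, ck0, cv0, k1, v1, ck1, cv1], which the
# helper above regroups as ((k0, v0, ck0, cv0), (k1, v1, ck1, cv1)),
# i.e. the tuple(tuple(torch.FloatTensor)) layout the decoder expects.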


class EncoderForONNX(torch.nn.Module):
    def __init__(self, encoder):
        super().__init__()
        self.encoder = encoder

    def forward(self, input_ids, attention_mask):
        return self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )


class DecoderForONNX(torch.nn.Module):
    def __init__(self, decoder):
        super().__init__()
        self.decoder = decoder

    def forward(self, input_ids, encoder_state, attention_mask, past=None):
        all_results = None
        if past is not None:
            all_results = _convert_past_list_to_tuple(past)
            input_ids = input_ids[:, -1:]

        last_hidden_state, past_key_values = self.decoder(
            input_ids=input_ids,
            encoder_hidden_states=encoder_state,
            encoder_attention_mask=attention_mask,
            past_key_values=all_results,
            return_dict=False,
        )

        past_values = []
        for past in past_key_values:
            past_values = past_values + list(past)
        return last_hidden_state, past_values


def _create_traced_encoder(encoder, input_ids, attention_mask):
    encoder_c = copy.deepcopy(encoder)
    print("shapes", input_ids.shape, attention_mask.shape)
    encoder_for_onnx = EncoderForONNX(encoder_c)
    compiler_args = ['--fp32-cast', 'matmult', '--fast-math', 'no-fast-relayout']
    inputs = (
        input_ids,
        attention_mask,
    )

    return torch_neuron.trace(encoder_for_onnx, inputs, compiler_args=compiler_args)


def _create_traced_decoder(decoder, input_ids, encoder_state, attention_mask, past=None):
    decoder_c = copy.deepcopy(decoder)
    print(input_ids.shape, encoder_state.shape, attention_mask.shape)
    decoder_for_onnx = DecoderForONNX(decoder_c)
    past_values = list(itertools.chain.from_iterable(past or ()))
    compiler_args = ['--fp32-cast', 'matmult', '--fast-math', 'no-fast-relayout']
    print(past_values)
    # This is called twice so we get 2 different decoders (with and without past) for further work.
    if past_values:
        inputs = (
            input_ids,
            encoder_state,
            attention_mask,
            past_values,
        )
    else:
        inputs = (
            input_ids,
            encoder_state,
            attention_mask,
        )
    return torch_neuron.trace(decoder_for_onnx, inputs, compiler_args=compiler_args)


class M2M100ConfigTS(M2M100Config, torch.nn.Module):
    """
    BartConfigTS is a TorchScript-compatible transformers.models.bart.configuration_bart.BartConfig.
    TorchScript only supports sub-classes of torch.nn.Module.
    """

    def __init__(self, config):
        M2M100Config.__init__(self, config)
        torch.nn.Module.__init__(self)


class MinLengthLogitsProcessorTS(torch.nn.Module):
    r"""
    :class:`transformers.LogitsProcessor` enforcing a min-length by setting EOS probability to 0.

    Args:
        min_length (:obj:`int`):
            The minimum length below which the score of :obj:`eos_token_id` is set to :obj:`-float("Inf")`.
        eos_token_id (:obj:`int`):
            The id of the `end-of-sequence` token.
    """

    def __init__(self, min_length: int, eos_token_id: int):
        super().__init__()

        if not isinstance(min_length, int) or min_length < 0:
            raise ValueError(f"`min_length` has to be a positive integer, but is {min_length}")

        if not isinstance(eos_token_id, int) or eos_token_id < 0:
            raise ValueError(f"`eos_token_id` has to be a positive integer, but is {eos_token_id}")

        self.min_length = min_length
        self.eos_token_id = eos_token_id

    def forward(self, input_ids, scores) -> torch.Tensor:
        cur_len = input_ids.shape[-1]
        if cur_len < self.min_length:
            scores[:, self.eos_token_id] = -float("inf")
        return scores


class NLLBGenerator(torch.nn.Module, GenerationMixin):
    def __init__(self, model):
        super().__init__()
        self.config = M2M100ConfigTS(model.config)
        self.config.force_bos_token_to_be_generated = False
        self._trace_modules(model)
        self.logits_processor = MinLengthLogitsProcessorTS(self.config.min_length, self.config.eos_token_id)
        self.final_logits_weight = model.model.shared.weight
        self.final_logits_bias = model.final_logits_bias
        self.decoder_layers = model.config.decoder_layers
        self.d_model = model.config.d_model

    def _trace_modules(self, model):
        # input_ids = torch.tensor(
        #     [
        #         [
        #             19,669,18,420,8,664,57,42,8,664,21,3028,195,4445,331,1293,34,21,10,6174,1100,6,69,104,42,32,2621,1638,144,4,6174,558,108,4419,1091,28,4,1668,9,1509,1621,279,35,867,2734,85,11,2216,2734,85,203,2244,7,6,15,8102,7,57,8629,5,
        #             model.config.eos_token_id,
        #         ]
        #     ],
        #     device=model.device,
        #     dtype=torch.long,
        # )
        # attention_mask = torch.tensor(
        #     [[True] * input_ids.shape[-1]],
        #     device=model.device,
        #     dtype=torch.bool,
        # )
        pegasus_text = "PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires."
        model_name = "sshleifer/distilbart-cnn-12-6"

        tokenizer = AutoTokenizer.from_pretrained(model_name)
        inputs = tokenizer(pegasus_text, return_tensors="pt", max_length=32, truncation=True, padding='max_length')
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]

        self.encoder = _create_traced_encoder(model.get_encoder(), input_ids, attention_mask)
        encoder_outputs = model.get_encoder()(input_ids, attention_mask=attention_mask, return_dict=True)
        decoder = model.model.decoder
        decoder_outputs = decoder(input_ids, attention_mask, encoder_outputs["last_hidden_state"], None, None, None)
        # print(decoder_outputs[1])
        # print(decoder_outputs[1].shape)
        self.decoder_no_past = _create_traced_decoder(
            model.model.decoder, input_ids, encoder_outputs["last_hidden_state"], attention_mask
        )
        self.decoder_with_past = _create_traced_decoder(
            model.model.decoder, input_ids, encoder_outputs["last_hidden_state"], attention_mask, decoder_outputs[1]
        )

    def _encoder_forward(self, input_ids, attention_mask):
        return self.encoder(input_ids, attention_mask)[0]

    @staticmethod
    def _init_sequence_length_for_generation(
        input_ids: torch.LongTensor, max_length: int
    ) -> Tuple[torch.Tensor, torch.Tensor, int]:
        unfinished_sequences = torch.zeros(input_ids.shape[0], dtype=torch.long, device=input_ids.device) + 1
        sequence_lengths = torch.zeros(input_ids.shape[0], dtype=torch.long, device=input_ids.device) + max_length

        cur_len = input_ids.shape[-1]
        return sequence_lengths, unfinished_sequences, cur_len

    def _decoder_forward(self, input_ids, encoder_output, attention_mask, past: List[torch.Tensor]):
        # Update here to use different decoder for different values of past.
        if past is None or len(past) == 0:
            decoder_output, past = self.decoder_no_past(
                input_ids=input_ids, encoder_state=encoder_output, attention_mask=attention_mask
            )
        else:
            decoder_output, past = self.decoder_with_past(
                input_ids=input_ids, encoder_state=encoder_output, attention_mask=attention_mask, past=past
            )

        lm_logits = F.linear(decoder_output, self.final_logits_weight, bias=self.final_logits_bias)

        return lm_logits, past

    def greedy_search(
        self, input_ids, encoder_output, attention_mask, max_length, pad_token_id: int, eos_token_id: int
    ):
        # init sequence length tensors
        sequence_lengths, unfinished_sequences, cur_len = self._init_sequence_length_for_generation(
            input_ids, max_length
        )

        past: List[torch.Tensor] = []
        while cur_len < max_length:

            logits, past = self._decoder_forward(input_ids, encoder_output, attention_mask, past)
            next_token_logits = logits[:, -1, :]

            # pre-process distribution
            scores = self.logits_processor(input_ids, next_token_logits)

            # argmax
            next_tokens = torch.argmax(scores, dim=-1)

            # transform next_tokens into tokens_to_add (keep pad_token_id for finished sequences)
            if eos_token_id is not None:
                assert pad_token_id is not None, "If eos_token_id is defined, make sure that pad_token_id is defined."
                next_tokens = next_tokens * unfinished_sequences + (pad_token_id) * (1 - unfinished_sequences)

            # add token and increase length by one
            input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)

            # update sequence length
            if eos_token_id is not None:
                sequence_lengths, unfinished_sequences = self._update_seq_length_for_generation(
                    sequence_lengths, unfinished_sequences, cur_len, next_tokens == eos_token_id
                )

            # stop when there is a </s> in each sentence, or if we exceed the maximum length
            if unfinished_sequences.max() == 0:
                break

            # increase cur_len
            cur_len = cur_len + 1

        return input_ids
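
    # Note: `_update_seq_length_for_generation` is called in `greedy_search` above
    # but is missing from the pasted snippet. A sketch of it, following the analogous
    # Bart-to-ONNX example in the transformers repository that this code is based on:
    @staticmethod
    def _update_seq_length_for_generation(
        sequence_lengths, unfinished_sequences, cur_len, is_eos_in_next_token
    ):
        # sequences that were unfinished and just emitted EOS are now finished
        is_sent_unfinished = unfinished_sequences.mul(is_eos_in_next_token.long()).bool()
        # record their final length and mark them as finished
        sequence_lengths = sequence_lengths.masked_fill(is_sent_unfinished, cur_len + 1)
        unfinished_sequences = unfinished_sequences.mul((~is_eos_in_next_token).long())
        return sequence_lengths, unfinished_sequences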

    def _prepare_decoder_input_ids_for_generation(
        self,
        input_ids: torch.LongTensor,
        decoder_start_token_id,
        bos_token_id: Optional[int] = None,
    ) -> torch.LongTensor:

        decoder_input_ids = (
            torch.ones((input_ids.shape[0], 1), dtype=input_ids.dtype, device=input_ids.device)
            * decoder_start_token_id
        )
        return decoder_input_ids

    def forward(self, input_ids, attention_mask, max_length, decoder_start_token_id):
        pad_token_id = self.config.pad_token_id
        bos_token_id = self.config.bos_token_id
        eos_token_id = self.config.eos_token_id

        # special case if pad_token_id is not defined
        if pad_token_id is None and eos_token_id is not None:
            # Setting `pad_token_id` to `eos_token_id`:{eos_token_id} for open-end generation.
            pad_token_id = eos_token_id

        encoder_output = self._encoder_forward(input_ids, attention_mask)

        input_ids = self._prepare_decoder_input_ids_for_generation(
            input_ids,
            decoder_start_token_id=decoder_start_token_id,
            bos_token_id=bos_token_id,
        )

        return self.greedy_search(
            input_ids,
            encoder_output,
            attention_mask,
            max_length=max_length,
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id,
        )
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
import torch_neuron

model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-3.3B")

neuron_model = NLLBGenerator(model)
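
For reference, once tracing succeeds, the wrapper defined above would be invoked roughly as follows. This is a sketch inferred from the `forward` signature in the code; the sample sentence and the length values are illustrative:

tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-3.3B")
batch = tokenizer("The weather is nice today.", return_tensors="pt",
                  max_length=32, truncation=True, padding="max_length")
output_ids = neuron_model(
    batch["input_ids"],
    batch["attention_mask"],
    max_length=64,
    decoder_start_token_id=model.config.decoder_start_token_id,
)
print(tokenizer.batch_decode(output_ids, skip_special_tokens=True))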

The error I am currently getting is:

/home/ubuntu/anaconda3/envs/aws_neuron_pytorch_p37/lib/python3.7/site-packages/transformers/models/m2m_100/modeling_m2m_100.py:326: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
INFO:Neuron:There are 1 ops of 1 different types in the TorchScript that are not compiled by neuron-cc: aten::embedding, (For more information see https://github.com/aws/aws-neuron-sdk/blob/master/release-notes/neuron-cc-ops/neuron-cc-ops-pytorch.md)
INFO:Neuron:Number of arithmetic operators (pre-compilation) before = 1479, fused = 1456, percent fused = 98.44%
INFO:Neuron:Number of neuron graph operations 3581 did not match traced graph 3283 - using heuristic matching of hierarchical information
WARNING:Neuron:torch.neuron.trace failed on _NeuronGraph$1631; falling back to native python function call
ERROR:Neuron:Error parsing message with type 'tensorflow.GraphDef'
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/aws_neuron_pytorch_p37/lib/python3.7/site-packages/torch_neuron/convert.py", line 382, in op_converter
    item, inputs, compiler_workdir=sg_workdir, **kwargs)
  File "/home/ubuntu/anaconda3/envs/aws_neuron_pytorch_p37/lib/python3.7/site-packages/torch_neuron/decorators.py", line 82, in trace
    graph_def = graph.as_graph_def()
  File "/home/ubuntu/anaconda3/envs/aws_neuron_pytorch_p37/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py", line 3238, in as_graph_def
    result, _ = self._as_graph_def(from_version, add_shapes)
  File "/home/ubuntu/anaconda3/envs/aws_neuron_pytorch_p37/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py", line 3166, in _as_graph_def
    graph.ParseFromString(compat.as_bytes(data))
google.protobuf.message.DecodeError: Error parsing message with type 'tensorflow.GraphDef'
INFO:Neuron:Number of arithmetic operators (post-compilation) before = 1479, compiled = 0, percent compiled = 0.0%
INFO:Neuron:The neuron partitioner created 1 sub-graphs
INFO:Neuron:Neuron successfully compiled 0 sub-graphs, Total fused subgraphs = 1, Percent of model sub-graphs successfully compiled = 0.0%
INFO:Neuron:Compiled these operators (and operator counts) to Neuron:
INFO:Neuron:Not compiled operators (and operator counts) to Neuron:
INFO:Neuron: => aten::Int: 414 [supported]
INFO:Neuron: => aten::add: 75 [supported]
INFO:Neuron: => aten::bmm: 48 [supported]
INFO:Neuron: => aten::contiguous: 72 [supported]
INFO:Neuron: => aten::cumsum: 1 [supported]
INFO:Neuron: => aten::detach: 1 [supported]
INFO:Neuron: => aten::dropout: 97 [supported]
INFO:Neuron: => aten::embedding: 1 [not supported]
INFO:Neuron: => aten::expand: 1 [supported]
INFO:Neuron: => aten::index_select: 1 [supported]
INFO:Neuron: => aten::layer_norm: 49 [supported]
INFO:Neuron: => aten::linear: 144 [supported]
INFO:Neuron: => aten::masked_fill: 1 [supported]
INFO:Neuron: => aten::mul: 74 [supported]
INFO:Neuron: => aten::ne: 1 [supported]
INFO:Neuron: => aten::relu: 24 [supported]
INFO:Neuron: => aten::reshape: 24 [supported]
INFO:Neuron: => aten::rsub: 1 [supported]
INFO:Neuron: => aten::size: 77 [supported]
INFO:Neuron: => aten::slice: 2 [supported]
INFO:Neuron: => aten::softmax: 24 [supported]
INFO:Neuron: => aten::to: 5 [supported]
INFO:Neuron: => aten::transpose: 120 [supported]
INFO:Neuron: => aten::type_as: 1 [supported]
INFO:Neuron: => aten::unsqueeze: 2 [supported]
INFO:Neuron: => aten::view: 219 [supported]
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
/tmp/ipykernel_4519/3952284984.py in <module>
    314 
    315 
--> 316 neuron_model = NLLBGenerator(model)

/tmp/ipykernel_4519/3952284984.py in __init__(self, model)
    154         self.config = M2M100ConfigTS(model.config)
    155         self.config.force_bos_token_to_be_generated = False
--> 156         self._trace_modules(model)
    157         self.logits_processor = MinLengthLogitsProcessorTS(self.config.min_length, self.config.eos_token_id)
    158         self.final_logits_weight = model.model.shared.weight

/tmp/ipykernel_4519/3952284984.py in _trace_modules(self, model)
    185         attention_mask = inputs["attention_mask"]
    186 
--> 187         self.encoder = _create_traced_encoder(model.get_encoder(), input_ids, attention_mask)
    188         encoder_outputs = model.get_encoder()(input_ids, attention_mask=attention_mask, return_dict=True)
    189         decoder = model.model.decoder

/tmp/ipykernel_4519/3952284984.py in _create_traced_encoder(encoder, input_ids, attention_mask)
     80         )
     81 
---> 82     return torch_neuron.trace(encoder_for_onnx, inputs,compiler_args=compiler_args)
     83 
     84 

~/anaconda3/envs/aws_neuron_pytorch_p37/lib/python3.7/site-packages/torch_neuron/convert.py in trace(func, example_inputs, fallback, op_whitelist, minimum_segment_size, subgraph_builder_function, subgraph_inputs_pruning, skip_compiler, debug_must_trace, allow_no_ops_on_neuron, compiler_workdir, dynamic_batch_size, compiler_timeout, _neuron_trace, compiler_args, optimizations, verbose, **kwargs)
    182         logger.debug("skip_inference_context - trace with fallback at {}".format(get_file_and_line()))
    183         neuron_graph = cu.compile_fused_operators(neuron_graph, **compile_kwargs)
--> 184     cu.stats_post_compiler(neuron_graph)
    185 
    186     # Wrap the compiled version of the model in a script module. Note that this is

~/anaconda3/envs/aws_neuron_pytorch_p37/lib/python3.7/site-packages/torch_neuron/convert.py in stats_post_compiler(self, neuron_graph)
    491         if succesful_compilations == 0 and not self.allow_no_ops_on_neuron:
    492             raise RuntimeError(
--> 493                 "No operations were successfully partitioned and compiled to neuron for this model - aborting trace!")
    494 
    495         if percent_operations_compiled < 50.0:

RuntimeError: No operations were successfully partitioned and compiled to neuron for this model - aborting trace!

Any help would be greatly appreciated.


1 Answer

Stack Overflow user

Answered on 2022-10-03 20:15:39

A response to your question has been posted on the original GitHub issue: https://github.com/aws-neuron/aws-neuron-sdk/issues/420#issuecomment-1220885577

-Taylor

0 votes
Original content provided by Stack Overflow.
Original link: https://stackoverflow.com/questions/73462205
