
Converting facebook/nllb-200-3.3B to AWS Neuron

Stack Overflow user
Asked on 2022-08-23 16:29:58
1 answer · 186 views · 0 following · 0 votes

I am trying to convert the new translation model developed by Facebook (Meta), No Language Left Behind (NLLB), to an AWS Neuron model that can be used with AWS SageMaker inference on Inferentia chips. However, I cannot figure out how to trace the model without errors. This post shows exactly what I want to do, and it worked for the AWS developers. For clarity, I am copying the code here as well:

import copy
import itertools
from typing import List, Optional, Tuple

import torch
import torch.nn.functional as F

from transformers import M2M100Config
from transformers.generation_utils import GenerationMixin


def _convert_past_list_to_tuple(past_key_values):
    """
    In Bart model, the type of past_key_values is tuple(tuple(torch.FloatTensor)) which is not
    TorchScript-compatible. To support this, we have to convert it during the export process.
    This function will convert past values from a list to tuple(tuple(torch.FloatTensor)) for
    the inner decoder.

    According to the definition of past_key_values, each inner tuple(torch.FloatTensor) has 4 tensors,
    so we convert every 4 elements in the list as a tuple(torch.FloatTensor).
    """
    count_of_each_inner_tuple = 4
    results = ()
    temp_result = ()
    count_n = len(past_key_values) // count_of_each_inner_tuple
    for idx in range(count_n):
        real_idx = idx * count_of_each_inner_tuple
        temp_result = tuple(past_key_values[real_idx : real_idx + count_of_each_inner_tuple])
        results += (temp_result,)

    return results
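
# Illustration (added for clarity; not in the original post): for a 2-layer decoder
# the flat list holds 8 tensors [k0, v0, ck0, cv0, k1, v1, ck1, cv1], which the
# helper above regroups as ((k0, v0, ck0, cv0), (k1, v1, ck1, cv1)),
# i.e. the tuple(tuple(torch.FloatTensor)) layout the decoder expects.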


class EncoderForONNX(torch.nn.Module):
    def __init__(self, encoder):
        super().__init__()
        self.encoder = encoder

    def forward(self, input_ids, attention_mask):
        return self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )


class DecoderForONNX(torch.nn.Module):
    def __init__(self, decoder):
        super().__init__()
        self.decoder = decoder

    def forward(self, input_ids, encoder_state, attention_mask, past=None):
        all_results = None
        if past is not None:
            all_results = _convert_past_list_to_tuple(past)
            input_ids = input_ids[:, -1:]

        last_hidden_state, past_key_values = self.decoder(
            input_ids=input_ids,
            encoder_hidden_states=encoder_state,
            encoder_attention_mask=attention_mask,
            past_key_values=all_results,
            return_dict=False,
        )

        past_values = []
        for past in past_key_values:
            past_values = past_values + list(past)
        return last_hidden_state, past_values


def _create_traced_encoder(encoder, input_ids, attention_mask):
    encoder_c = copy.deepcopy(encoder)
    print("shapes", input_ids.shape, attention_mask.shape)
    encoder_for_onnx = EncoderForONNX(encoder_c)
    compiler_args = ['--fp32-cast', 'matmult', '--fast-math', 'no-fast-relayout']
    inputs = (
        input_ids,
        attention_mask,
    )

    return torch_neuron.trace(encoder_for_onnx, inputs, compiler_args=compiler_args)


def _create_traced_decoder(decoder, input_ids, encoder_state, attention_mask, past=None):
    decoder_c = copy.deepcopy(decoder)
    print(input_ids.shape, encoder_state.shape, attention_mask.shape)
    decoder_for_onnx = DecoderForONNX(decoder_c)
    past_values = list(itertools.chain.from_iterable(past or ()))
    compiler_args = ['--fp32-cast', 'matmult', '--fast-math', 'no-fast-relayout']
    print(past_values)
    # This is called twice so we get 2 different decoders (with and without past) for further work.
    if past_values:
        inputs = (
            input_ids,
            encoder_state,
            attention_mask,
            past_values,
        )
    else:
        inputs = (
            input_ids,
            encoder_state,
            attention_mask,
        )
    return torch_neuron.trace(decoder_for_onnx, inputs, compiler_args=compiler_args)


class M2M100ConfigTS(M2M100Config, torch.nn.Module):
    """
    BartConfigTS is a TorchScript-compatible transformers.models.bart.configuration_bart.BartConfig.
    TorchScript only supports sub-classes of torch.nn.Module.
    """

    def __init__(self, config):
        M2M100Config.__init__(self, config)
        torch.nn.Module.__init__(self)


class MinLengthLogitsProcessorTS(torch.nn.Module):
    r"""
    :class:`transformers.LogitsProcessor` enforcing a min-length by setting EOS probability to 0.

    Args:
        min_length (:obj:`int`):
            The minimum length below which the score of :obj:`eos_token_id` is set to :obj:`-float("Inf")`.
        eos_token_id (:obj:`int`):
            The id of the `end-of-sequence` token.
    """

    def __init__(self, min_length: int, eos_token_id: int):
        super().__init__()

        if not isinstance(min_length, int) or min_length < 0:
            raise ValueError(f"`min_length` has to be a positive integer, but is {min_length}")

        if not isinstance(eos_token_id, int) or eos_token_id < 0:
            raise ValueError(f"`eos_token_id` has to be a positive integer, but is {eos_token_id}")

        self.min_length = min_length
        self.eos_token_id = eos_token_id

    def forward(self, input_ids, scores) -> torch.Tensor:
        cur_len = input_ids.shape[-1]
        if cur_len < self.min_length:
            scores[:, self.eos_token_id] = -float("inf")
        return scores


class NLLBGenerator(torch.nn.Module, GenerationMixin):
    def __init__(self, model):
        super().__init__()
        self.config = M2M100ConfigTS(model.config)
        self.config.force_bos_token_to_be_generated = False
        self._trace_modules(model)
        self.logits_processor = MinLengthLogitsProcessorTS(self.config.min_length, self.config.eos_token_id)
        self.final_logits_weight = model.model.shared.weight
        self.final_logits_bias = model.final_logits_bias
        self.decoder_layers = model.config.decoder_layers
        self.d_model = model.config.d_model

    def _trace_modules(self, model):
        # input_ids = torch.tensor(
        #     [
        #         [
        #             19,669,18,420,8,664,57,42,8,664,21,3028,195,4445,331,1293,34,21,10,6174,1100,6,69,104,42,32,2621,1638,144,4,6174,558,108,4419,1091,28,4,1668,9,1509,1621,279,35,867,2734,85,11,2216,2734,85,203,2244,7,6,15,8102,7,57,8629,5,
        #             model.config.eos_token_id,
        #         ]
        #     ],
        #     device=model.device,
        #     dtype=torch.long,
        # )
        # attention_mask = torch.tensor(
        #     [[True] * input_ids.shape[-1]],
        #     device=model.device,
        #     dtype=torch.bool,
        # )
        pegasus_text = "PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires."
        model_name = "sshleifer/distilbart-cnn-12-6"

        tokenizer = AutoTokenizer.from_pretrained(model_name)
        inputs = tokenizer(pegasus_text, return_tensors="pt", max_length=32, truncation=True, padding='max_length')
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]

        self.encoder = _create_traced_encoder(model.get_encoder(), input_ids, attention_mask)
        encoder_outputs = model.get_encoder()(input_ids, attention_mask=attention_mask, return_dict=True)
        decoder = model.model.decoder
        decoder_outputs = decoder(input_ids, attention_mask, encoder_outputs["last_hidden_state"], None, None, None)
        # print(decoder_outputs[1])
        # print(decoder_outputs[1].shape)
        self.decoder_no_past = _create_traced_decoder(
            model.model.decoder, input_ids, encoder_outputs["last_hidden_state"], attention_mask
        )
        self.decoder_with_past = _create_traced_decoder(
            model.model.decoder, input_ids, encoder_outputs["last_hidden_state"], attention_mask, decoder_outputs[1]
        )

    def _encoder_forward(self, input_ids, attention_mask):
        return self.encoder(input_ids, attention_mask)[0]

    @staticmethod
    def _init_sequence_length_for_generation(
        input_ids: torch.LongTensor, max_length: int
    ) -> Tuple[torch.Tensor, torch.Tensor, int]:
        unfinished_sequences = torch.zeros(input_ids.shape[0], dtype=torch.long, device=input_ids.device) + 1
        sequence_lengths = torch.zeros(input_ids.shape[0], dtype=torch.long, device=input_ids.device) + max_length

        cur_len = input_ids.shape[-1]
        return sequence_lengths, unfinished_sequences, cur_len

    def _decoder_forward(self, input_ids, encoder_output, attention_mask, past: List[torch.Tensor]):
        # Update here to use different decoder for different values of past.
        if past is None or len(past) == 0:
            decoder_output, past = self.decoder_no_past(
                input_ids=input_ids, encoder_state=encoder_output, attention_mask=attention_mask
            )
        else:
            decoder_output, past = self.decoder_with_past(
                input_ids=input_ids, encoder_state=encoder_output, attention_mask=attention_mask, past=past
            )

        lm_logits = F.linear(decoder_output, self.final_logits_weight, bias=self.final_logits_bias)

        return lm_logits, past

    def greedy_search(
        self, input_ids, encoder_output, attention_mask, max_length, pad_token_id: int, eos_token_id: int
    ):
        # init sequence length tensors
        sequence_lengths, unfinished_sequences, cur_len = self._init_sequence_length_for_generation(
            input_ids, max_length
        )

        past: List[torch.Tensor] = []
        while cur_len < max_length:

            logits, past = self._decoder_forward(input_ids, encoder_output, attention_mask, past)
            next_token_logits = logits[:, -1, :]

            # pre-process distribution
            scores = self.logits_processor(input_ids, next_token_logits)

            # argmax
            next_tokens = torch.argmax(scores, dim=-1)

            # transform next_tokens into tokens_to_add (keep pad_token_id for finished sequences)
            if eos_token_id is not None:
                assert pad_token_id is not None, "If eos_token_id is defined, make sure that pad_token_id is defined."
                next_tokens = next_tokens * unfinished_sequences + (pad_token_id) * (1 - unfinished_sequences)

            # add token and increase length by one
            input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)

            # update sequence length
            if eos_token_id is not None:
                sequence_lengths, unfinished_sequences = self._update_seq_length_for_generation(
                    sequence_lengths, unfinished_sequences, cur_len, next_tokens == eos_token_id
                )

            # stop when there is a </s> in each sentence, or if we exceed the maximum length
            if unfinished_sequences.max() == 0:
                break

            # increase cur_len
            cur_len = cur_len + 1

        return input_ids
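
    # Note: `_update_seq_length_for_generation` is called in `greedy_search` above
    # but is missing from the pasted snippet. A sketch of it, following the analogous
    # Bart-to-ONNX example in the transformers repository that this code is based on:
    @staticmethod
    def _update_seq_length_for_generation(
        sequence_lengths, unfinished_sequences, cur_len, is_eos_in_next_token
    ):
        # sequences that were unfinished and just emitted EOS are now finished
        is_sent_unfinished = unfinished_sequences.mul(is_eos_in_next_token.long()).bool()
        # record their final length and mark them as finished
        sequence_lengths = sequence_lengths.masked_fill(is_sent_unfinished, cur_len + 1)
        unfinished_sequences = unfinished_sequences.mul((~is_eos_in_next_token).long())
        return sequence_lengths, unfinished_sequences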

    def _prepare_decoder_input_ids_for_generation(
        self,
        input_ids: torch.LongTensor,
        decoder_start_token_id,
        bos_token_id: Optional[int] = None,
    ) -> torch.LongTensor:

        decoder_input_ids = (
            torch.ones((input_ids.shape[0], 1), dtype=input_ids.dtype, device=input_ids.device)
            * decoder_start_token_id
        )
        return decoder_input_ids

    def forward(self, input_ids, attention_mask, max_length, decoder_start_token_id):
        pad_token_id = self.config.pad_token_id
        bos_token_id = self.config.bos_token_id
        eos_token_id = self.config.eos_token_id

        # special case if pad_token_id is not defined
        if pad_token_id is None and eos_token_id is not None:
            # Setting `pad_token_id` to `eos_token_id`:{eos_token_id} for open-end generation.
            pad_token_id = eos_token_id

        encoder_output = self._encoder_forward(input_ids, attention_mask)

        input_ids = self._prepare_decoder_input_ids_for_generation(
            input_ids,
            decoder_start_token_id=decoder_start_token_id,
            bos_token_id=bos_token_id,
        )

        return self.greedy_search(
            input_ids,
            encoder_output,
            attention_mask,
            max_length=max_length,
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id,
        )
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
import torch_neuron

model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-3.3B")

neuron_model = NLLBGenerator(model)
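
For reference, once tracing succeeds, the wrapper defined above would be invoked roughly as follows. This is a sketch inferred from the `forward` signature in the code; the sample sentence and the length values are illustrative:

tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-3.3B")
batch = tokenizer("The weather is nice today.", return_tensors="pt",
                  max_length=32, truncation=True, padding="max_length")
output_ids = neuron_model(
    batch["input_ids"],
    batch["attention_mask"],
    max_length=64,
    decoder_start_token_id=model.config.decoder_start_token_id,
)
print(tokenizer.batch_decode(output_ids, skip_special_tokens=True))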

The error I am currently getting is:

/home/ubuntu/anaconda3/envs/aws_neuron_pytorch_p37/lib/python3.7/site-packages/transformers/models/m2m_100/modeling_m2m_100.py:326: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
INFO:Neuron:There are 1 ops of 1 different types in the TorchScript that are not compiled by neuron-cc: aten::embedding, (For more information see https://github.com/aws/aws-neuron-sdk/blob/master/release-notes/neuron-cc-ops/neuron-cc-ops-pytorch.md)
INFO:Neuron:Number of arithmetic operators (pre-compilation) before = 1479, fused = 1456, percent fused = 98.44%
INFO:Neuron:Number of neuron graph operations 3581 did not match traced graph 3283 - using heuristic matching of hierarchical information
WARNING:Neuron:torch.neuron.trace failed on _NeuronGraph$1631; falling back to native python function call
ERROR:Neuron:Error parsing message with type 'tensorflow.GraphDef'
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/aws_neuron_pytorch_p37/lib/python3.7/site-packages/torch_neuron/convert.py", line 382, in op_converter
    item, inputs, compiler_workdir=sg_workdir, **kwargs)
  File "/home/ubuntu/anaconda3/envs/aws_neuron_pytorch_p37/lib/python3.7/site-packages/torch_neuron/decorators.py", line 82, in trace
    graph_def = graph.as_graph_def()
  File "/home/ubuntu/anaconda3/envs/aws_neuron_pytorch_p37/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py", line 3238, in as_graph_def
    result, _ = self._as_graph_def(from_version, add_shapes)
  File "/home/ubuntu/anaconda3/envs/aws_neuron_pytorch_p37/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py", line 3166, in _as_graph_def
    graph.ParseFromString(compat.as_bytes(data))
google.protobuf.message.DecodeError: Error parsing message with type 'tensorflow.GraphDef'
INFO:Neuron:Number of arithmetic operators (post-compilation) before = 1479, compiled = 0, percent compiled = 0.0%
INFO:Neuron:The neuron partitioner created 1 sub-graphs
INFO:Neuron:Neuron successfully compiled 0 sub-graphs, Total fused subgraphs = 1, Percent of model sub-graphs successfully compiled = 0.0%
INFO:Neuron:Compiled these operators (and operator counts) to Neuron:
INFO:Neuron:Not compiled operators (and operator counts) to Neuron:
INFO:Neuron: => aten::Int: 414 [supported]
INFO:Neuron: => aten::add: 75 [supported]
INFO:Neuron: => aten::bmm: 48 [supported]
INFO:Neuron: => aten::contiguous: 72 [supported]
INFO:Neuron: => aten::cumsum: 1 [supported]
INFO:Neuron: => aten::detach: 1 [supported]
INFO:Neuron: => aten::dropout: 97 [supported]
INFO:Neuron: => aten::embedding: 1 [not supported]
INFO:Neuron: => aten::expand: 1 [supported]
INFO:Neuron: => aten::index_select: 1 [supported]
INFO:Neuron: => aten::layer_norm: 49 [supported]
INFO:Neuron: => aten::linear: 144 [supported]
INFO:Neuron: => aten::masked_fill: 1 [supported]
INFO:Neuron: => aten::mul: 74 [supported]
INFO:Neuron: => aten::ne: 1 [supported]
INFO:Neuron: => aten::relu: 24 [supported]
INFO:Neuron: => aten::reshape: 24 [supported]
INFO:Neuron: => aten::rsub: 1 [supported]
INFO:Neuron: => aten::size: 77 [supported]
INFO:Neuron: => aten::slice: 2 [supported]
INFO:Neuron: => aten::softmax: 24 [supported]
INFO:Neuron: => aten::to: 5 [supported]
INFO:Neuron: => aten::transpose: 120 [supported]
INFO:Neuron: => aten::type_as: 1 [supported]
INFO:Neuron: => aten::unsqueeze: 2 [supported]
INFO:Neuron: => aten::view: 219 [supported]
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
/tmp/ipykernel_4519/3952284984.py in <module>
    314 
    315 
--> 316 neuron_model = NLLBGenerator(model)

/tmp/ipykernel_4519/3952284984.py in __init__(self, model)
    154         self.config = M2M100ConfigTS(model.config)
    155         self.config.force_bos_token_to_be_generated = False
--> 156         self._trace_modules(model)
    157         self.logits_processor = MinLengthLogitsProcessorTS(self.config.min_length, self.config.eos_token_id)
    158         self.final_logits_weight = model.model.shared.weight

/tmp/ipykernel_4519/3952284984.py in _trace_modules(self, model)
    185         attention_mask = inputs["attention_mask"]
    186 
--> 187         self.encoder = _create_traced_encoder(model.get_encoder(), input_ids, attention_mask)
    188         encoder_outputs = model.get_encoder()(input_ids, attention_mask=attention_mask, return_dict=True)
    189         decoder = model.model.decoder

/tmp/ipykernel_4519/3952284984.py in _create_traced_encoder(encoder, input_ids, attention_mask)
     80         )
     81 
---> 82     return torch_neuron.trace(encoder_for_onnx, inputs,compiler_args=compiler_args)
     83 
     84 

~/anaconda3/envs/aws_neuron_pytorch_p37/lib/python3.7/site-packages/torch_neuron/convert.py in trace(func, example_inputs, fallback, op_whitelist, minimum_segment_size, subgraph_builder_function, subgraph_inputs_pruning, skip_compiler, debug_must_trace, allow_no_ops_on_neuron, compiler_workdir, dynamic_batch_size, compiler_timeout, _neuron_trace, compiler_args, optimizations, verbose, **kwargs)
    182         logger.debug("skip_inference_context - trace with fallback at {}".format(get_file_and_line()))
    183         neuron_graph = cu.compile_fused_operators(neuron_graph, **compile_kwargs)
--> 184     cu.stats_post_compiler(neuron_graph)
    185 
    186     # Wrap the compiled version of the model in a script module. Note that this is

~/anaconda3/envs/aws_neuron_pytorch_p37/lib/python3.7/site-packages/torch_neuron/convert.py in stats_post_compiler(self, neuron_graph)
    491         if succesful_compilations == 0 and not self.allow_no_ops_on_neuron:
    492             raise RuntimeError(
--> 493                 "No operations were successfully partitioned and compiled to neuron for this model - aborting trace!")
    494 
    495         if percent_operations_compiled < 50.0:

RuntimeError: No operations were successfully partitioned and compiled to neuron for this model - aborting trace!

Any help would be greatly appreciated.


1 Answer

Stack Overflow user

Answered on 2022-10-03 20:15:39

A response to your question has been posted on the original GitHub issue: https://github.com/aws-neuron/aws-neuron-sdk/issues/420#issuecomment-1220885577

-Taylor

0 votes
Original content provided by Stack Overflow.
Original link: https://stackoverflow.com/questions/73462205
