
CAIL2021 Reading Comprehension Task: Data Preprocessing Module (Part 2)

Author: 西西嘛呦
Published: 2022-06-10

Code repository: https://github.com/china-ai-law-challenge/CAIL2021/

# /*
#  * @Author: Yue.Fan 
#  * @Date: 2022-03-23 11:35:28 
#  * @Last Modified by:   Yue.Fan 
#  * @Last Modified time: 2022-03-23 11:35:28 
#  */
import logging
from dataclasses import dataclass
from typing import List, Dict
import json
from tqdm import tqdm
from transformers import PreTrainedTokenizer, BasicTokenizer, BertTokenizer
from transformers.tokenization_utils import _is_whitespace, _is_punctuation, _is_control
import numpy as np
import torch
from torch.utils.data import Dataset, TensorDataset

YES_TOKEN = "[unused1]"
NO_TOKEN = "[unused2]"


class CAILExample:
    def __init__(self,
                 qas_id: str,
                 question_text: str,
                 context_text: str,
                 answer_texts: List[str],
                 answer_start_indexes: List[int],
                 is_impossible: bool,
                 is_yes_no: bool,
                 is_multi_span: bool,
                 answers: List,
                 case_id: str,
                 case_name: str):
        self.qas_id = qas_id  # each question has a unique id
        self.question_text = question_text  # question text
        self.context_text = context_text  # context text
        self.answer_texts = answer_texts  # list of answer texts
        self.answer_start_indexes = answer_start_indexes  # list of answer start positions
        self.is_impossible = is_impossible  # whether the question is unanswerable
        self.is_yes_no = is_yes_no  # whether it is a yes/no question
        self.is_multi_span = is_multi_span  # whether the answer consists of multiple spans
        self.answers = answers  # raw, unprocessed answer list
        self.case_id = case_id  # each context has a unique case id
        self.case_name = case_name  # case type

        self.doc_tokens = []
        self.char_to_word_offset = []

        raw_doc_tokens = customize_tokenizer(context_text, True)  # rough initial tokenization
        k = 0
        temp_word = ""
        # Some contexts contain spaces, newlines, etc.; feeding them to BERT as-is
        # would shift the answer offsets, hence char_to_word_offset. For example:
        """
        我\n\t爱北京\n\t天安门
        ['我', '爱', '北', '京', '天', '安', '门']
        [0, 0, 0, 1, 2, 3, 3, 3, 4, 5, 6]
        The two extra 0s between 0 and 1 indicate two whitespace characters between 我 and 爱.
        """
        for char in self.context_text:
            if _is_whitespace(char):
                self.char_to_word_offset.append(k - 1)
                continue
            else:
                temp_word += char
                self.char_to_word_offset.append(k)
            if temp_word.lower() == raw_doc_tokens[k]:
                self.doc_tokens.append(temp_word)
                temp_word = ""
                k += 1
        assert k == len(raw_doc_tokens)

        if answer_texts is not None:  # if for training
            start_positions = []
            end_positions = []

            if not is_impossible and not is_yes_no:
                for i in range(len(answer_texts)):
                    # continuing with the example above,
                    # "北京" starts at character position 4 in the original text
                    answer_offset = context_text.index(answer_texts[i])  # calling index() directly is questionable (it only finds the first occurrence)
                    # answer_offset = answer_start_indexes[i]
                    answer_length = len(answer_texts[i])
                    start_position = self.char_to_word_offset[answer_offset]  # position within doc_tokens
                    end_position = self.char_to_word_offset[answer_offset + answer_length - 1]
                    start_positions.append(start_position)  # actual start position
                    end_positions.append(end_position)  # actual end position
            else:
                start_positions.append(-1)  # no answer: set to -1
                end_positions.append(-1)  # no answer: set to -1
            self.start_positions = start_positions
            self.end_positions = end_positions

    def __repr__(self):
        string = ""
        for key, value in self.__dict__.items():
            string += f"{key}: {value}\n"
        # return f"<{self.__class__}>"
        return string


@dataclass
class CAILFeature:
    input_ids: List[int]
    attention_mask: List[int]
    token_type_ids: List[int]
    cls_index: int
    p_mask: List
    example_index: int
    unique_id: int
    paragraph_len: int
    token_is_max_context: object
    tokens: List
    token_to_orig_map: Dict
    start_positions: List[int]
    end_positions: List[int]
    is_impossible: bool


@dataclass
class CAILResult:
    unique_id: int
    start_logits: torch.Tensor
    end_logits: torch.Tensor


def read_examples(file: str, is_training: bool) -> List[CAILExample]:
    example_list = []
    with open(file, "r", encoding="utf-8") as file:
        original_data = json.load(file)["data"]

    for entry in tqdm(original_data):
        case_id = entry["caseid"]
        for paragraph in entry["paragraphs"]:
            context = paragraph["context"]
            case_name = paragraph["casename"]
            for qa in paragraph["qas"]:
                question = qa["question"]
                qas_id = qa["id"]
                answer_texts = None
                answer_starts = None
                is_impossible = None
                is_yes_no = None
                is_multi_span = None
                all_answers = None
                # CAIL2021 covers the following answer types: single-span, yes/no, and
                # unanswerable; compared with earlier editions it additionally introduces
                # the multi-span type, i.e. answers composed of several segments
                if is_training:
                    all_answers = qa["answers"]
                    # an empty all_answers list means there is no answer
                    if len(all_answers) == 0:
                        answer = []
                    else:
                        # otherwise take the first answer
                        answer = all_answers[0]
                    # a little difference between 19 and 21 data.
                    # if the answer is a dict, wrap it in a list
                    if type(answer) == dict:
                        answer = [answer]
                    # no answer: initialize the answer text as "" and the start position as -1
                    if len(answer) == 0:  # NO Answer
                        answer_texts = [""]
                        answer_starts = [-1]
                    else:
                        # otherwise collect the answers here
                        answer_texts = []
                        answer_starts = []
                        # a single-span answer yields one entry;
                        # otherwise iterate over all spans
                        for a in answer:
                            answer_texts.append(a["text"])
                            answer_starts.append(a["answer_start"])
                    # Judge YES or NO
                    # determine whether this is a yes/no question and set the flag
                    if len(answer_texts) == 1 and answer_starts[0] == -1 and (
                            answer_texts[0] == "YES" or answer_texts[0] == "NO"):
                        is_yes_no = True
                    else:
                        is_yes_no = False
                    # Judge Multi Span
                    # determine whether the answer consists of multiple spans
                    if len(answer_texts) > 1:
                        is_multi_span = True
                    else:
                        is_multi_span = False
                    # Judge No Answer
                    # mark unanswerable questions as follows
                    if len(answer_texts) == 1 and answer_texts[0] == "":
                        is_impossible = True
                    else:
                        is_impossible = False

                example = CAILExample(
                    qas_id=qas_id,
                    question_text=question,
                    context_text=context,
                    answer_texts=answer_texts,
                    answer_start_indexes=answer_starts,
                    is_impossible=is_impossible,
                    is_yes_no=is_yes_no,
                    is_multi_span=is_multi_span,
                    answers=all_answers,
                    case_id=case_id,
                    case_name=case_name
                )
                # Discard possible bad example
                if is_training and example.answer_start_indexes[0] >= 0:
                    for i in range(len(example.answer_texts)):
                        actual_text = "".join(
                            example.doc_tokens[example.start_positions[i]: (example.end_positions[i] + 1)])
                        cleaned_answer_text = "".join(whitespace_tokenize(example.answer_texts[i]))
                        if actual_text.find(cleaned_answer_text) == -1:
                            logging.info(f"Could not find answer: {actual_text} vs. {cleaned_answer_text}")
                            continue
                example_list.append(example)
    return example_list


def convert_examples_to_features(example_list: List[CAILExample], tokenizer: PreTrainedTokenizer, args,
                                 is_training: bool) -> List[CAILFeature]:
    # Validate there are no duplicate ids in example_list
    qas_id_set = set()
    for example in example_list:
        if example.qas_id in qas_id_set:
            raise Exception("Duplicate qas_id!")
        else:
            qas_id_set.add(example.qas_id)

    feature_list = []
    unique_id = 0
    example_index = 0
    i = 0
    for example in tqdm(example_list):
        i += 1
        # if i % 100 == 0:
        #     print(i)
        current_example_features = convert_single_example_to_features(example, tokenizer, args.max_seq_length,
                                                                      args.max_query_length, args.doc_stride,
                                                                      is_training)
        for feature in current_example_features:
            feature.example_index = example_index
            feature.unique_id = unique_id
            unique_id += 1
        example_index += 1
        feature_list.extend(current_example_features)

    return feature_list


def convert_single_example_to_features(example: CAILExample, tokenizer: PreTrainedTokenizer,
                                       max_seq_length, max_query_length, doc_stride, is_training) -> List[CAILFeature]:
    """
    Transfer original text to sequence which can be accepted by ELECTRA
    Format: [CLS] YES_TOKEN NO_TOKEN question [SEP] context [SEP]
    """
    features = []
    tok_to_orig_index = []
    orig_to_tok_index = []
    all_doc_tokens = []
    """
    ['我', '爱', '北', '京', '15826458891', '天', '安', '门']
    orig_to_tok_index:[0, 1, 2, 3, 4, 9, 10, 11]
    tok_to_orig_index:[0, 1, 2, 3, 4, 4, 4, 4, 4, 5, 6, 7]
    all_doc_tokens:['我', '爱', '北', '京', '158', '##26', '##45', '##88', '##91', '天', '安', '门']
    """
    for (i, token) in enumerate(example.doc_tokens):
        orig_to_tok_index.append(len(all_doc_tokens))
        sub_tokens = tokenizer.tokenize(token)  # further split each token into wordpieces
        for sub_token in sub_tokens:
            tok_to_orig_index.append(i)  # every sub_token maps back to the same original index i
            all_doc_tokens.append(sub_token)

    if is_training:
        if example.is_impossible or example.answer_start_indexes[0] == -1:
            start_positions = [-1]
            end_positions = [-1]
        else:
            start_positions = []
            end_positions = []
            # recalibrate the positions after wordpiece tokenization
            for i in range(len(example.start_positions)):
                start_position = orig_to_tok_index[example.start_positions[i]]
                if example.end_positions[i] < len(example.doc_tokens) - 1:
                    end_position = orig_to_tok_index[example.end_positions[i] + 1] - 1
                else:
                    end_position = len(all_doc_tokens) - 1
                (start_position, end_position) = _improve_answer_span(
                    all_doc_tokens, start_position, end_position, tokenizer, example.answer_texts[i]
                )
                start_positions.append(start_position)
                end_positions.append(end_position)
    else:
        start_positions = None
        end_positions = None

    query_tokens = tokenizer.tokenize(example.question_text)
    query_tokens = [YES_TOKEN, NO_TOKEN] + query_tokens  # prepend the yes/no tokens to the question
    truncated_query = tokenizer.encode(query_tokens, add_special_tokens=False, max_length=max_query_length,
                                       truncation=True)

    sequence_pair_added_tokens = tokenizer.num_special_tokens_to_add(pair=True)
    assert sequence_pair_added_tokens == 3

    added_tokens_num_before_second_sequence = tokenizer.num_special_tokens_to_add(pair=False)
    assert added_tokens_num_before_second_sequence == 2
    span_doc_tokens = all_doc_tokens
    spans = []

    # print("query_tokens:", query_tokens)
    # print("all_doc_tokens:", all_doc_tokens)

    # print("".join(all_doc_tokens))
    # print("start_positions:", start_positions)
    # print("end_positions:", end_positions)
    # sliding-window over the document tokens
    while len(spans) * doc_stride < len(all_doc_tokens):
        # print(max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,)
        # slide the window forward doc_stride tokens at a time
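        # With the slow tokenizer, the `stride` argument below is the number of overlapping
        # tokens carried over into `overflowing_tokens`; each window can therefore hold up to
        # max_seq_length - len(truncated_query) - 3 context tokens, and consecutive windows
        # start exactly doc_stride tokens apart (which is what the while condition assumes).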
        encoded_dict = tokenizer.encode_plus(
            truncated_query,
            span_doc_tokens,
            max_length=max_seq_length,
            return_overflowing_tokens=True,
            padding="max_length",
            stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
            truncation="only_second",
            return_token_type_ids=True
        )
        # print(span_doc_tokens)
        # print("stride:", max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens)
        # print(tokenizer.convert_ids_to_tokens(encoded_dict['input_ids']))
        # print(len(encoded_dict['input_ids']))
        # print(tokenizer.convert_ids_to_tokens(encoded_dict['overflowing_tokens']))
        # actual number of context tokens in this window
        paragraph_len = min(
            len(all_doc_tokens) - len(spans) * doc_stride,
            max_seq_length - len(truncated_query) - sequence_pair_added_tokens,
        )
        # input ids without [PAD]
        if tokenizer.pad_token_id in encoded_dict["input_ids"]:
            non_padded_ids = encoded_dict["input_ids"][: encoded_dict["input_ids"].index(tokenizer.pad_token_id)]
        else:
            non_padded_ids = encoded_dict["input_ids"]
        # convert the ids back to tokens
        tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)

        token_to_orig_map = {}
        token_to_orig_map[0] = -1
        token_to_orig_map[1] = -1
        token_to_orig_map[2] = -1

        token_is_max_context = {0: True, 1: True, 2: True}
        for i in range(paragraph_len):
            # index of the token within [CLS] query [SEP] context [SEP]
            index = len(truncated_query) + added_tokens_num_before_second_sequence + i
            # tok_to_orig_index holds each token's index within the original context;
            # len(spans) is the number of windows produced so far;
            # token_to_orig_map maps the input index back to the original token index
            token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i]
        # print(token_to_orig_map)
        encoded_dict["paragraph_len"] = paragraph_len
        encoded_dict["tokens"] = tokens
        encoded_dict["token_to_orig_map"] = token_to_orig_map
        encoded_dict["truncated_query_with_special_tokens_length"] = len(
            truncated_query) + added_tokens_num_before_second_sequence
        encoded_dict["token_is_max_context"] = token_is_max_context
        encoded_dict["start"] = len(spans) * doc_stride  # 文本的起始索引
        encoded_dict["length"] = paragraph_len

        # set the token_type_ids of the yes/no marker tokens to 1. Why? Perhaps so the
        # segment embedding treats them like context (answer-side) tokens.
        encoded_dict["token_type_ids"][1] = 1
        encoded_dict["token_type_ids"][2] = 1

        # print(encoded_dict["token_type_ids"])
        spans.append(encoded_dict)

        if "overflowing_tokens" not in encoded_dict or len(encoded_dict["overflowing_tokens"]) == 0:
            break
        else:
            span_doc_tokens = encoded_dict["overflowing_tokens"]

    for doc_span_index in range(len(spans)):
        for j in range(spans[doc_span_index]["paragraph_len"]):
            is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j)
            index = spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j
            spans[doc_span_index]["token_is_max_context"][index] = is_max_context

    for span in spans:
        cls_index = span["input_ids"].index(tokenizer.cls_token_id)

        # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer)
        # p_mask: question and [SEP] positions are set to 1, everything else to 0
        p_mask = np.array(span["token_type_ids"])
        p_mask = np.minimum(p_mask, 1)
        p_mask = 1 - p_mask
        p_mask[np.where(np.array(span["input_ids"]) == tokenizer.sep_token_id)[0]] = 1
        p_mask[cls_index] = 0
        p_mask[1] = 0
        p_mask[2] = 0


        current_start_positions = None
        current_end_positions = None
        span_is_impossible = None
        if is_training:
            current_start_positions = [0 for i in range(max_seq_length)]
            current_end_positions = [0 for i in range(max_seq_length)]
            doc_start = span["start"]
            doc_end = span["start"] + span["length"] - 1  # end index of this window within the document
            doc_offset = len(truncated_query) + added_tokens_num_before_second_sequence  # offset of the context within the input
            for i in range(len(start_positions)):
                start_position = start_positions[i]
                end_position = end_positions[i]
                # remap start_position and end_position into the current window
                if start_position >= doc_start and end_position <= doc_end:
                    span_is_impossible = False
                    current_start_positions[start_position - doc_start + doc_offset] = 1
                    current_end_positions[end_position - doc_start + doc_offset] = 1

            # print(current_start_positions)
            # print(current_end_positions)
            # handle yes/no questions: set index 1 (YES) or index 2 (NO) to 1
            if example.is_yes_no:
                assert len(example.answer_start_indexes) == 1
                assert 1 not in current_start_positions and 1 not in current_end_positions
                if example.answer_texts[0] == "YES" and example.answer_start_indexes[0] == -1:
                    current_start_positions[1] = 1
                    current_end_positions[1] = 1
                elif example.answer_texts[0] == "NO" and example.answer_start_indexes[0] == -1:
                    current_start_positions[2] = 1
                    current_end_positions[2] = 1
                else:
                    raise Exception("example构造出错,请检查")
                span_is_impossible = False

            # handle the no-answer case: set the [CLS] position (index 0) to 1
            if 1 not in current_start_positions:  # Current Feature does not contain answer span
                span_is_impossible = True
                current_start_positions[cls_index] = 1
                current_end_positions[cls_index] = 1
            assert span_is_impossible is not None
        features.append(
            CAILFeature(
                input_ids=span["input_ids"],
                attention_mask=span["attention_mask"],
                token_type_ids=span["token_type_ids"],
                cls_index=cls_index,
                p_mask=p_mask.tolist(),
                example_index=0,
                unique_id=0,
                paragraph_len=span["paragraph_len"],
                token_is_max_context=span["token_is_max_context"],
                tokens=span["tokens"],
                token_to_orig_map=span["token_to_orig_map"],
                start_positions=current_start_positions,
                end_positions=current_end_positions,
                is_impossible=span_is_impossible
            )
        )
    return features


def convert_features_to_dataset(features: List[CAILFeature], is_training: bool) -> Dataset:
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_masks = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
    all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
    all_example_indexes = torch.tensor([f.example_index for f in features], dtype=torch.long)
    all_feature_indexes = torch.arange(all_input_ids.size(0), dtype=torch.long)
    if is_training:
        all_is_impossible = torch.tensor([f.is_impossible for f in features], dtype=torch.float)
        all_start_labels = torch.tensor([f.start_positions for f in features], dtype=torch.float)
        all_end_labels = torch.tensor([f.end_positions for f in features], dtype=torch.float)
        dataset = TensorDataset(
            all_input_ids,
            all_attention_masks,
            all_token_type_ids,
            all_start_labels,
            all_end_labels,
            all_cls_index,
            all_p_mask,
            all_is_impossible,
            all_example_indexes,
            all_feature_indexes
        )
    else:
        dataset = TensorDataset(
            all_input_ids,
            all_attention_masks,
            all_token_type_ids,
            all_cls_index,
            all_p_mask,
            all_example_indexes,
            all_feature_indexes
        )
    return dataset


def _is_whitespace(c):
    if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
        return True
    return False


def _new_check_is_max_context(doc_spans, cur_span_index, position):
    """
    Check if this is the 'max context' doc span for the token.
    """
    # if len(doc_spans) == 1:
    # return True
    best_score = None
    best_span_index = None
    for (span_index, doc_span) in enumerate(doc_spans):
        end = doc_span["start"] + doc_span["length"] - 1
        if position < doc_span["start"]:
            continue
        if position > end:
            continue
        num_left_context = position - doc_span["start"]
        num_right_context = end - position
        score = min(num_left_context, num_right_context) + 0.01 * doc_span["length"]
        if best_score is None or score > best_score:
            best_score = score
            best_span_index = span_index

    return cur_span_index == best_span_index


def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text):
    """
    Returns tokenized answer spans that better match the annotated answer.
    """
    tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))

    for new_start in range(input_start, input_end + 1):
        for new_end in range(input_end, new_start - 1, -1):
            text_span = " ".join(doc_tokens[new_start: (new_end + 1)])
            if text_span == tok_answer_text:
                return (new_start, new_end)

    return (input_start, input_end)


def customize_tokenizer(text: str, do_lower_case=True) -> List[str]:
    temp_x = ""
    for char in text:
        # surround special characters with spaces
        if _is_chinese_char(ord(char)) or _is_punctuation(char) or _is_whitespace(char) or _is_control(char):
            temp_x += " " + char + " "
        else:
            temp_x += char
    # optionally lowercase English letters
    if do_lower_case:
        temp_x = temp_x.lower()

    return temp_x.split()  # split on whitespace


def _is_chinese_char(cp):
    """Checks whether CP is the codepoint of a CJK character."""
    # This defines a "chinese character" as anything in the CJK Unicode block:
    #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
    #
    # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
    # despite its name. The modern Korean Hangul alphabet is a different block,
    # as is Japanese Hiragana and Katakana. Those alphabets are used to write
    # space-separated words, so they are not treated specially and handled
    # like the all of the other languages.
    if (
            (cp >= 0x4E00 and cp <= 0x9FFF)
            or (cp >= 0x3400 and cp <= 0x4DBF)  #
            or (cp >= 0x20000 and cp <= 0x2A6DF)  #
            or (cp >= 0x2A700 and cp <= 0x2B73F)  #
            or (cp >= 0x2B740 and cp <= 0x2B81F)  #
            or (cp >= 0x2B820 and cp <= 0x2CEAF)  #
            or (cp >= 0xF900 and cp <= 0xFAFF)
            or (cp >= 0x2F800 and cp <= 0x2FA1F)  #
    ):  #
        return True

    return False


def whitespace_tokenize(text: str):
    if text is None:
        return []
    text = text.strip()
    tokens = text.split()
    return tokens


def write_example_orig_file(examples: List[CAILExample], file: str):
    """
    convert examples to original json file
    """
    data_list = []
    for example in examples:
        data = {
            "paragraphs": [
                {
                    "context": example.context_text,
                    "casename": example.case_name,
                    "qas": [
                        {
                            "question": example.question_text,
                            "answers": example.answers,
                            "id": example.qas_id,
                            "is_impossible": "true" if example.is_impossible else "false",
                        }
                    ]
                }
            ],
            "caseid": example.case_id
        }
        data_list.append(data)
    final_data = {
        "data": data_list,
        "version": "1.0"
    }
    with open(file, mode="w", encoding="utf-8") as file:
        file.write(json.dumps(final_data, ensure_ascii=False))


if __name__ == '__main__':
    data_file = 'data_sample/cail2021_mrc_small.json'
    examples = read_examples(data_file, is_training=True)
    tokenizer = BertTokenizer.from_pretrained('model_hub/chinese-bert-wwm-ext/')
    # example = examples[3]
    # print(example)
    # print(len(example.doc_tokens))
    # convert_single_example_to_features(
    #     example=example,
    #     tokenizer=tokenizer,
    #     max_seq_length=512,
    #     max_query_length=64,
    #     doc_stride=128,
    #     is_training=True
    # )
    class Args:
        max_seq_length = 512
        max_query_length = 64
        doc_stride = 128
    args = Args()
    feature_lists = convert_examples_to_features(
        examples,
        tokenizer,
        args,
        is_training=True,
    )
    print(feature_lists[0])
    datasets = convert_features_to_dataset(feature_lists, is_training=True)
    print(datasets[0])
    # for ex in examples:
    #     print(ex)
    # context_text = "我\n\t爱北京\n\t15826458891天安门"
    # all_doc_tokens = customize_tokenizer(context_text, True)
    #
    # k = 0
    # temp_word = ""
    # doc_tokens = []
    # char_to_word_offset = []
    # print(context_text)
    # print(all_doc_tokens)
    # for char in context_text:
    #     if _is_whitespace(char):
    #         char_to_word_offset.append(k - 1)
    #         continue
    #     else:
    #         temp_word += char
    #         char_to_word_offset.append(k)
    #     if temp_word.lower() == all_doc_tokens[k]:
    #         doc_tokens.append(temp_word)
    #         temp_word = ""
    #         k += 1
    # print(k)
    # print(doc_tokens)
    # print(char_to_word_offset)
    #
    # tok_to_orig_index = []
    # orig_to_tok_index = []
    # all_doc_tokens = []
    # for (i, token) in enumerate(doc_tokens):
    #     orig_to_tok_index.append(len(all_doc_tokens))
    #     sub_tokens = tokenizer.tokenize(token)
    #     for sub_token in sub_tokens:
    #         tok_to_orig_index.append(i)
    #         all_doc_tokens.append(sub_token)
    #
    # print(orig_to_tok_index)
    # print(tok_to_orig_index)
    # print(all_doc_tokens)

Note that, in total, the text goes through three realignment stages (a short sketch of the first two follows the list):

  • Stage 1: the text is first roughly tokenized; this step strips special symbols, whitespace, and the like, so the answer start positions have to be recalibrated.
  • Stage 2: the tokenizer then tokenizes each character (word); because WordPiece changes the sequence length, the answer positions have to be recalibrated again.
  • Stage 3: the question and the context are combined and a sliding window is applied, so the answer positions within each window have to be recalibrated once more.
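
Below is a minimal sketch of the first two stages (the third, sliding-window stage happens inside convert_single_example_to_features). It assumes the helpers customize_tokenizer and _is_whitespace from the module above are in scope and that a local chinese-bert-wwm-ext checkpoint exists at the path used in the __main__ block; the expected outputs are copied from the docstring example in convert_single_example_to_features.

from transformers import BertTokenizer

context = "我\n\t爱北京\n\t15826458891天安门"

# Stage 1: rough tokenization plus the char -> word offset map
raw_doc_tokens = customize_tokenizer(context, True)
doc_tokens, char_to_word_offset, temp_word, k = [], [], "", 0
for char in context:
    if _is_whitespace(char):
        char_to_word_offset.append(k - 1)
        continue
    temp_word += char
    char_to_word_offset.append(k)
    if temp_word.lower() == raw_doc_tokens[k]:
        doc_tokens.append(temp_word)
        temp_word = ""
        k += 1

# Stage 2: wordpiece tokenization plus the two index maps
tokenizer = BertTokenizer.from_pretrained("model_hub/chinese-bert-wwm-ext/")
orig_to_tok_index, tok_to_orig_index, all_doc_tokens = [], [], []
for i, token in enumerate(doc_tokens):
    orig_to_tok_index.append(len(all_doc_tokens))
    for sub_token in tokenizer.tokenize(token):
        tok_to_orig_index.append(i)
        all_doc_tokens.append(sub_token)

print(doc_tokens)         # ['我', '爱', '北', '京', '15826458891', '天', '安', '门']
print(orig_to_tok_index)  # [0, 1, 2, 3, 4, 9, 10, 11]
print(tok_to_orig_index)  # [0, 1, 2, 3, 4, 4, 4, 4, 4, 5, 6, 7]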
Printing the first converted feature (print(feature_lists[0]) in the __main__ block above) produces output like the following:
<input_ids: [101, 1, 2, 7342, 12124, 1762, 2398, 2128, 6568, 7372, 2128, 2551, 1146, 1062, 1385, 1905, 2832, 749, 784, 720, 924, 7372, 8043, 102, 9595, 119, 125, 1039, 132, 124, 119, 1161, 808, 7342, 12124, 510, 3342, 10871, 3118, 802, 2398, 2128, 6568, 7372, 2128, 2551, 1146, 1062, 1385, 2526, 2360, 6589, 11960, 8129, 1039, 132, 125, 119, 7342, 12124, 510, 3342, 10871, 2824, 2857, 3315, 3428, 4638, 6401, 6390, 6589, 4500, 752, 2141, 680, 4415, 4507, 131, 8138, 2399, 8110, 3299, 124, 3189, 117, 7342, 12124, 6206, 3724, 2398, 2128, 6568, 7372, 2128, 2551, 1146, 1062, 1385, 711, 1071, 1762, 704, 1744, 1093, 689, 7213, 6121, 5500, 819, 3300, 7361, 1062, 1385, 2128, 2551, 4689, 1146, 6121, 5852, 689, 6956, 113, 809, 678, 5042, 4917, 1093, 6121, 2128, 2551, 4689, 1146, 6121, 114, 4638, 8416, 9086, 1039, 6587, 3621, 2990, 897, 702, 782, 3867, 6589, 928, 6587, 924, 6395, 924, 7372, 117, 2398, 2128, 6568, 7372, 2128, 2551, 1146, 1062, 1385, 2970, 1358, 2832, 924, 2400, 5041, 1355, 924, 7372, 1296, 113, 924, 1296, 5356, 1384, 131, 8752, 9723, 9131, 8756, 9086, 11906, 9446, 8311, 8152, 114, 117, 924, 6589, 5373, 5287, 3175, 2466, 711, 3309, 5373, 117, 3680, 3309, 113, 3299, 114, 9649, 8158, 1039, 117, 924, 7372, 3309, 7313, 5632, 702, 782, 3867, 6589, 928, 6587, 1394, 1398, 7555, 678, 6587, 3621, 1355, 3123, 722, 3189, 6629, 5635, 3926, 985, 1059, 6956, 6587, 3621, 3315, 2622, 722, 3189, 3632, 6421, 924, 1296, 5276, 2137, 117, 7342, 12124, 2870, 3612, 818, 862, 671, 3309, 6587, 3621, 6809, 1168, 8188, 1921, 4638, 117, 6228, 711, 924, 7372, 752, 3125, 1355, 4495, 117, 2398, 2128, 6568, 7372, 2128, 2551, 1146, 1062, 1385, 898, 2945, 924, 7372, 1394, 1398, 5276, 2137, 2190, 1093, 6121, 2128, 2551, 4689, 1146, 6121, 6822, 6121, 6608, 985, 132, 794, 2398, 2128, 6568, 7372, 2128, 2551, 1146, 1062, 1385, 6608, 985, 2496, 3189, 6629, 2458, 1993, 6631, 6814, 8114, 1921, 117, 2832, 924, 782, 793, 3313, 1403, 924, 7372, 782, 2495, 6820, 1059, 6956, 6608, 985, 3621, 7555, 4638, 117, 1156, 6228, 711, 2832, 924, 782, 6824, 5276, 117, 2832, 924, 782, 7444, 809, 2213, 3612, 1059, 6956, 3621, 7555, 711, 1825, 3144, 117, 794, 924, 7372, 782, 6608, 985, 2496, 3189, 2458, 1993, 6369, 5050, 117, 2902, 3680, 3189, 1283, 1146, 722, 671, 3403, 1114, 117, 1403, 924, 7372, 782, 5373, 5287, 6824, 5276, 7032, 7342, 12124, 2832, 924, 1400, 117, 794, 1093, 6121, 2128, 2551, 4689, 1146, 6121, 5815, 2533, 2207, 7583, 3867, 6589, 6587, 3621, 1066, 8416, 9086, 1039, 117, 6587, 3621, 4500, 6854, 711, 3189, 2382, 4495, 3833, 3867, 6589, 117, 955, 3621, 3309, 7361, 711, 8216, 702, 3299, 117, 2902, 3299, 5023, 7583, 3315, 2622, 6820, 3621, 1400, 1728, 7342, 12124, 3313, 2130, 1059, 2252, 6121, 1394, 1398, 5276, 2137, 4638, 6820, 3621, 721, 1218, 117, 1093, 6121, 2128, 2551, 4689, 1146, 6121, 754, 8119, 2399, 123, 3299, 8132, 3189, 1403, 2398, 2128, 6568, 7372, 2128, 2551, 1146, 1062, 1385, 4509, 6435, 5164, 6608, 117, 2398, 2128, 6568, 7372, 2128, 2551, 1146, 102]
attention_mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
token_type_ids: [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
cls_index: 0
p_mask: [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
example_index: 4
unique_id: 7
paragraph_len: 487
* token_is_max_context: {0: True, 1: True, 2: True, 24: False, 25: False, 26: False, 27: False, 28: False, 29: False, 30: False, 31: False, 32: False, 33: False, 34: False, 35: False, 36: False, 37: False, 38: False, 39: False, 40: False, 41: False, 42: False, 43: False, 44: False, 45: False, 46: False, 47: False, 48: False, 49: False, 50: False, 51: False, 52: False, 53: False, 54: False, 55: False, 56: False, 57: False, 58: False, 59: False, 60: False, 61: False, 62: False, 63: False, 64: False, 65: False, 66: False, 67: False, 68: False, 69: False, 70: False, 71: False, 72: False, 73: False, 74: False, 75: False, 76: False, 77: False, 78: False, 79: False, 80: False, 81: False, 82: False, 83: False, 84: False, 85: False, 86: False, 87: False, 88: False, 89: False, 90: False, 91: False, 92: False, 93: False, 94: False, 95: False, 96: False, 97: False, 98: False, 99: False, 100: False, 101: False, 102: False, 103: False, 104: False, 105: False, 106: False, 107: False, 108: False, 109: False, 110: False, 111: False, 112: False, 113: False, 114: False, 115: False, 116: False, 117: False, 118: False, 119: False, 120: False, 121: False, 122: False, 123: False, 124: False, 125: False, 126: False, 127: False, 128: False, 129: False, 130: False, 131: False, 132: False, 133: False, 134: False, 135: False, 136: False, 137: False, 138: False, 139: False, 140: False, 141: False, 142: False, 143: False, 144: False, 145: False, 146: False, 147: False, 148: False, 149: False, 150: False, 151: False, 152: False, 153: False, 154: False, 155: False, 156: False, 157: False, 158: False, 159: False, 160: False, 161: False, 162: False, 163: False, 164: False, 165: False, 166: False, 167: False, 168: False, 169: False, 170: False, 171: False, 172: False, 173: False, 174: False, 175: False, 176: False, 177: False, 178: False, 179: False, 180: False, 181: False, 182: False, 183: False, 184: False, 185: False, 186: False, 187: False, 188: False, 189: False, 190: False, 191: False, 192: False, 193: False, 194: False, 195: False, 196: False, 197: False, 198: False, 199: False, 200: False, 201: False, 202: False, 203: False, 204: True, 205: True, 206: True, 207: True, 208: True, 209: True, 210: True, 211: True, 212: True, 213: True, 214: True, 215: True, 216: True, 217: True, 218: True, 219: True, 220: True, 221: True, 222: True, 223: True, 224: True, 225: True, 226: True, 227: True, 228: True, 229: True, 230: True, 231: True, 232: True, 233: True, 234: True, 235: True, 236: True, 237: True, 238: True, 239: True, 240: True, 241: True, 242: True, 243: True, 244: True, 245: True, 246: True, 247: True, 248: True, 249: True, 250: True, 251: True, 252: True, 253: True, 254: True, 255: True, 256: True, 257: True, 258: True, 259: True, 260: True, 261: True, 262: True, 263: True, 264: True, 265: True, 266: True, 267: True, 268: True, 269: True, 270: True, 271: True, 272: True, 273: True, 274: True, 275: True, 276: True, 277: True, 278: True, 279: True, 280: True, 281: True, 282: True, 283: True, 284: True, 285: True, 286: True, 287: True, 288: True, 289: True, 290: True, 291: True, 292: True, 293: True, 294: True, 295: True, 296: True, 297: True, 298: True, 299: True, 300: True, 301: True, 302: True, 303: True, 304: True, 305: True, 306: True, 307: True, 308: True, 309: True, 310: True, 311: True, 312: True, 313: True, 314: True, 315: True, 316: True, 317: True, 318: True, 319: True, 320: True, 321: True, 322: True, 323: True, 324: True, 325: True, 326: True, 327: True, 328: True, 329: True, 330: True, 331: True, 332: 
False, 333: False, 334: False, 335: False, 336: False, 337: False, 338: False, 339: False, 340: False, 341: False, 342: False, 343: False, 344: False, 345: False, 346: False, 347: False, 348: False, 349: False, 350: False, 351: False, 352: False, 353: False, 354: False, 355: False, 356: False, 357: False, 358: False, 359: False, 360: False, 361: False, 362: False, 363: False, 364: False, 365: False, 366: False, 367: False, 368: False, 369: False, 370: False, 371: False, 372: False, 373: False, 374: False, 375: False, 376: False, 377: False, 378: False, 379: False, 380: False, 381: False, 382: False, 383: False, 384: False, 385: False, 386: False, 387: False, 388: False, 389: False, 390: False, 391: False, 392: False, 393: False, 394: False, 395: False, 396: False, 397: False, 398: False, 399: False, 400: False, 401: False, 402: False, 403: False, 404: False, 405: False, 406: False, 407: False, 408: False, 409: False, 410: False, 411: False, 412: False, 413: False, 414: False, 415: False, 416: False, 417: False, 418: False, 419: False, 420: False, 421: False, 422: False, 423: False, 424: False, 425: False, 426: False, 427: False, 428: False, 429: False, 430: False, 431: False, 432: False, 433: False, 434: False, 435: False, 436: False, 437: False, 438: False, 439: False, 440: False, 441: False, 442: False, 443: False, 444: False, 445: False, 446: False, 447: False, 448: False, 449: False, 450: False, 451: False, 452: False, 453: False, 454: False, 455: False, 456: False, 457: False, 458: False, 459: False, 460: False, 461: False, 462: False, 463: False, 464: False, 465: False, 466: False, 467: False, 468: False, 469: False, 470: False, 471: False, 472: False, 473: False, 474: False, 475: False, 476: False, 477: False, 478: False, 479: False, 480: False, 481: False, 482: False, 483: False, 484: False, 485: False, 486: False, 487: False, 488: False, 489: False, 490: False, 491: False, 492: False, 493: False, 494: False, 495: False, 496: False, 497: False, 498: False, 499: False, 500: False, 501: False, 502: False, 503: False, 504: False, 505: False, 506: False, 507: False, 508: False, 509: False, 510: False}
tokens: ['[CLS]', '[unused1]', '[unused2]', '阮', 'x4', '在', '平', '安', '财', '险', '安', '徽', '分', '公', '司', '处', '投', '了', '什', '么', '保', '险', '?', '[SEP]', '##92', '.', '4', '元', ';', '3', '.', '判', '令', '阮', 'x4', '、', '杨', 'x5', '支', '付', '平', '安', '财', '险', '安', '徽', '分', '公', '司', '律', '师', '费', '690', '##0', '元', ';', '4', '.', '阮', 'x4', '、', '杨', 'x5', '承', '担', '本', '案', '的', '诉', '讼', '费', '用', '事', '实', '与', '理', '由', ':', '2013', '年', '12', '月', '3', '日', ',', '阮', 'x4', '要', '求', '平', '安', '财', '险', '安', '徽', '分', '公', '司', '为', '其', '在', '中', '国', '农', '业', '银', '行', '股', '份', '有', '限', '公', '司', '安', '徽', '省', '分', '行', '营', '业', '部', '(', '以', '下', '简', '称', '农', '行', '安', '徽', '省', '分', '行', ')', '的', '94', '##000', '元', '贷', '款', '提', '供', '个', '人', '消', '费', '信', '贷', '保', '证', '保', '险', ',', '平', '安', '财', '险', '安', '徽', '分', '公', '司', '接', '受', '投', '保', '并', '签', '发', '保', '险', '单', '(', '保', '单', '编', '号', ':', '125', '##94', '##07', '##26', '##000', '##010', '##87', '##10', '##3', ')', ',', '保', '费', '缴', '纳', '方', '式', '为', '期', '缴', ',', '每', '期', '(', '月', ')', '178', '##6', '元', ',', '保', '险', '期', '间', '自', '个', '人', '消', '费', '信', '贷', '合', '同', '项', '下', '贷', '款', '发', '放', '之', '日', '起', '至', '清', '偿', '全', '部', '贷', '款', '本', '息', '之', '日', '止', '该', '保', '单', '约', '定', ',', '阮', 'x4', '拖', '欠', '任', '何', '一', '期', '贷', '款', '达', '到', '80', '天', '的', ',', '视', '为', '保', '险', '事', '故', '发', '生', ',', '平', '安', '财', '险', '安', '徽', '分', '公', '司', '依', '据', '保', '险', '合', '同', '约', '定', '对', '农', '行', '安', '徽', '省', '分', '行', '进', '行', '赔', '偿', ';', '从', '平', '安', '财', '险', '安', '徽', '分', '公', '司', '赔', '偿', '当', '日', '起', '开', '始', '超', '过', '30', '天', ',', '投', '保', '人', '仍', '未', '向', '保', '险', '人', '归', '还', '全', '部', '赔', '偿', '款', '项', '的', ',', '则', '视', '为', '投', '保', '人', '违', '约', ',', '投', '保', '人', '需', '以', '尚', '欠', '全', '部', '款', '项', '为', '基', '数', ',', '从', '保', '险', '人', '赔', '偿', '当', '日', '开', '始', '计', '算', ',', '按', '每', '日', '千', '分', '之', '一', '标', '准', ',', '向', '保', '险', '人', '缴', '纳', '违', '约', '金', '阮', 'x4', '投', '保', '后', ',', '从', '农', '行', '安', '徽', '省', '分', '行', '获', '得', '小', '额', '消', '费', '贷', '款', '共', '94', '##000', '元', ',', '贷', '款', '用', '途', '为', '日', '常', '生', '活', '消', '费', ',', '借', '款', '期', '限', '为', '36', '个', '月', ',', '按', '月', '等', '额', '本', '息', '还', '款', '后', '因', '阮', 'x4', '未', '完', '全', '履', '行', '合', '同', '约', '定', '的', '还', '款', '义', '务', ',', '农', '行', '安', '徽', '省', '分', '行', '于', '2015', '年', '2', '月', '25', '日', '向', '平', '安', '财', '险', '安', '徽', '分', '公', '司', '申', '请', '索', '赔', ',', '平', '安', '财', '险', '安', '徽', '分', '[SEP]']
token_to_orig_map: {0: -1, 1: -1, 2: -1, 24: 124, 25: 125, 26: 126, 27: 127, 28: 128, 29: 129, 30: 130, 31: 131, 32: 132, 33: 133, 34: 134, 35: 135, 36: 136, 37: 137, 38: 138, 39: 139, 40: 140, 41: 141, 42: 142, 43: 143, 44: 144, 45: 145, 46: 146, 47: 147, 48: 148, 49: 149, 50: 150, 51: 151, 52: 152, 53: 152, 54: 153, 55: 154, 56: 155, 57: 156, 58: 157, 59: 158, 60: 159, 61: 160, 62: 161, 63: 162, 64: 163, 65: 164, 66: 165, 67: 166, 68: 167, 69: 168, 70: 169, 71: 170, 72: 171, 73: 172, 74: 173, 75: 174, 76: 175, 77: 176, 78: 177, 79: 178, 80: 179, 81: 180, 82: 181, 83: 182, 84: 183, 85: 184, 86: 185, 87: 186, 88: 187, 89: 188, 90: 189, 91: 190, 92: 191, 93: 192, 94: 193, 95: 194, 96: 195, 97: 196, 98: 197, 99: 198, 100: 199, 101: 200, 102: 201, 103: 202, 104: 203, 105: 204, 106: 205, 107: 206, 108: 207, 109: 208, 110: 209, 111: 210, 112: 211, 113: 212, 114: 213, 115: 214, 116: 215, 117: 216, 118: 217, 119: 218, 120: 219, 121: 220, 122: 221, 123: 222, 124: 223, 125: 224, 126: 225, 127: 226, 128: 227, 129: 228, 130: 229, 131: 230, 132: 231, 133: 232, 134: 233, 135: 234, 136: 234, 137: 235, 138: 236, 139: 237, 140: 238, 141: 239, 142: 240, 143: 241, 144: 242, 145: 243, 146: 244, 147: 245, 148: 246, 149: 247, 150: 248, 151: 249, 152: 250, 153: 251, 154: 252, 155: 253, 156: 254, 157: 255, 158: 256, 159: 257, 160: 258, 161: 259, 162: 260, 163: 261, 164: 262, 165: 263, 166: 264, 167: 265, 168: 266, 169: 267, 170: 268, 171: 269, 172: 270, 173: 271, 174: 272, 175: 273, 176: 274, 177: 275, 178: 276, 179: 276, 180: 276, 181: 276, 182: 276, 183: 276, 184: 276, 185: 276, 186: 276, 187: 277, 188: 278, 189: 279, 190: 280, 191: 281, 192: 282, 193: 283, 194: 284, 195: 285, 196: 286, 197: 287, 198: 288, 199: 289, 200: 290, 201: 291, 202: 292, 203: 293, 204: 294, 205: 294, 206: 295, 207: 296, 208: 297, 209: 298, 210: 299, 211: 300, 212: 301, 213: 302, 214: 303, 215: 304, 216: 305, 217: 306, 218: 307, 219: 308, 220: 309, 221: 310, 222: 311, 223: 312, 224: 313, 225: 314, 226: 315, 227: 316, 228: 317, 229: 318, 230: 319, 231: 320, 232: 321, 233: 322, 234: 323, 235: 324, 236: 325, 237: 326, 238: 327, 239: 328, 240: 329, 241: 330, 242: 331, 243: 332, 244: 333, 245: 334, 246: 335, 247: 336, 248: 337, 249: 338, 250: 339, 251: 340, 252: 341, 253: 342, 254: 343, 255: 344, 256: 345, 257: 346, 258: 347, 259: 348, 260: 349, 261: 350, 262: 351, 263: 352, 264: 353, 265: 354, 266: 355, 267: 356, 268: 357, 269: 358, 270: 359, 271: 360, 272: 361, 273: 362, 274: 363, 275: 364, 276: 365, 277: 366, 278: 367, 279: 368, 280: 369, 281: 370, 282: 371, 283: 372, 284: 373, 285: 374, 286: 375, 287: 376, 288: 377, 289: 378, 290: 379, 291: 380, 292: 381, 293: 382, 294: 383, 295: 384, 296: 385, 297: 386, 298: 387, 299: 388, 300: 389, 301: 390, 302: 391, 303: 392, 304: 393, 305: 394, 306: 395, 307: 396, 308: 397, 309: 398, 310: 399, 311: 400, 312: 401, 313: 402, 314: 403, 315: 404, 316: 405, 317: 406, 318: 407, 319: 408, 320: 409, 321: 410, 322: 411, 323: 412, 324: 413, 325: 414, 326: 415, 327: 416, 328: 417, 329: 418, 330: 419, 331: 420, 332: 421, 333: 422, 334: 423, 335: 424, 336: 425, 337: 426, 338: 427, 339: 428, 340: 429, 341: 430, 342: 431, 343: 432, 344: 433, 345: 434, 346: 435, 347: 436, 348: 437, 349: 438, 350: 439, 351: 440, 352: 441, 353: 442, 354: 443, 355: 444, 356: 445, 357: 446, 358: 447, 359: 448, 360: 449, 361: 450, 362: 451, 363: 452, 364: 453, 365: 454, 366: 455, 367: 456, 368: 457, 369: 458, 370: 459, 371: 460, 372: 461, 373: 462, 374: 463, 375: 464, 376: 465, 377: 466, 378: 467, 379: 468, 380: 469, 381: 470, 382: 471, 
383: 472, 384: 473, 385: 474, 386: 475, 387: 476, 388: 477, 389: 478, 390: 479, 391: 480, 392: 481, 393: 482, 394: 483, 395: 484, 396: 485, 397: 486, 398: 487, 399: 488, 400: 489, 401: 490, 402: 491, 403: 492, 404: 493, 405: 494, 406: 495, 407: 496, 408: 497, 409: 498, 410: 499, 411: 500, 412: 501, 413: 502, 414: 503, 415: 504, 416: 505, 417: 506, 418: 507, 419: 508, 420: 509, 421: 510, 422: 511, 423: 512, 424: 512, 425: 513, 426: 514, 427: 515, 428: 516, 429: 517, 430: 518, 431: 519, 432: 520, 433: 521, 434: 522, 435: 523, 436: 524, 437: 525, 438: 526, 439: 527, 440: 528, 441: 529, 442: 530, 443: 531, 444: 532, 445: 533, 446: 534, 447: 535, 448: 536, 449: 537, 450: 538, 451: 539, 452: 540, 453: 541, 454: 542, 455: 543, 456: 544, 457: 545, 458: 546, 459: 547, 460: 548, 461: 549, 462: 550, 463: 551, 464: 552, 465: 553, 466: 554, 467: 555, 468: 556, 469: 557, 470: 558, 471: 559, 472: 560, 473: 561, 474: 562, 475: 563, 476: 564, 477: 565, 478: 566, 479: 567, 480: 568, 481: 569, 482: 570, 483: 571, 484: 572, 485: 573, 486: 574, 487: 575, 488: 576, 489: 577, 490: 578, 491: 579, 492: 580, 493: 581, 494: 582, 495: 583, 496: 584, 497: 585, 498: 586, 499: 587, 500: 588, 501: 589, 502: 590, 503: 591, 504: 592, 505: 593, 506: 594, 507: 595, 508: 596, 509: 597, 510: 598}
start_positions: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
end_positions: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
is_impossible: False
>

Finally, note how the yes/no, single-span, and multi-span answer types are unified into a single labeling scheme, and how token_type_ids in the input is set; a rough sketch of the unified labeling follows.
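The sketch below is illustrative only (the max_seq_length and answer values are made up for readability); it mirrors the labeling logic in convert_single_example_to_features above, where positions 1 and 2 correspond to the [unused1]/[unused2] tokens and position 0 to [CLS].

# Input layout: [CLS] [unused1] [unused2] question ... [SEP] context ... [SEP]
#               idx 0     1          2
max_seq_length = 16                    # toy length; the real code uses e.g. 512
start_labels = [0] * max_seq_length
end_labels = [0] * max_seq_length

answer = "YES"                         # "YES", "NO", None (no answer), or a list of (start, end) offsets
if answer == "YES":                    # yes/no answers point at the [unused1]/[unused2] slots
    start_labels[1] = end_labels[1] = 1
elif answer == "NO":
    start_labels[2] = end_labels[2] = 1
elif answer is None:                   # unanswerable: point at [CLS]
    start_labels[0] = end_labels[0] = 1
else:                                  # one or several spans, already shifted by doc_offset
    for start, end in answer:
        start_labels[start] = 1
        end_labels[end] = 1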

There is also a processing script that saves the processed examples and features to disk, so the time-consuming preprocessing does not have to be repeated on every run:
"""
Data-processing related code
"""
import argparse
import json

from transformers import PreTrainedTokenizer, BertTokenizer
from data_process_utils import *
import gzip
import pickle
import os
from os.path import join
import logging


def convert_and_write(args, tokenizer: PreTrainedTokenizer, file, examples_fn, features_fn, is_training):
    logging.info(f"Reading examples from :{file} ...")
    example_list = read_examples(file, is_training=is_training)
    logging.info(f"Total examples:{len(example_list)}")

    logging.info(f"Start converting examples to features.")
    feature_list = convert_examples_to_features(example_list, tokenizer, args, is_training)
    logging.info(f"Total features:{len(feature_list)}")

    logging.info(f"Converting complete, writing examples and features to file.")
    with gzip.open(join(args.output_path, examples_fn), "wb") as file:
        pickle.dump(example_list, file)
    with gzip.open(join(args.output_path, features_fn), "wb") as file:
        pickle.dump(feature_list, file)


def main():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--input_file",
        type=str,
        required=True,
        help="The file to be processed."
    )

    parser.add_argument(
        "--for_training",
        action="store_true",
        help="Process for training or not."
    )

    parser.add_argument(
        "--output_prefix",
        type=str,
        required=True,
        help="The prefix of output file's name."
    )

    parser.add_argument(
        "--do_lower_case",
        action="store_true",
        help="Set this flag if you are using an uncased model."
    )

    parser.add_argument(
        "--tokenizer_path",
        type=str,
        required=True,
        help="Path to tokenizer which will be used to tokenize text.(ElectraTokenizer)"
    )

    parser.add_argument(
        "--max_seq_length",
        default=512,
        type=int,
        help="The maximum total input sequence length after WordPiece tokenization. "
             "Longer will be truncated, and shorter will be padded."
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help="The maximum number of tokens for the question. Questions longer will be truncated to the length."
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help="When splitting up a long document into chunks, how much stride to take between chunks."
    )

    parser.add_argument(
        "--output_path",
        default="./processed_data/",
        type=str,
        help="Output path of the constructed examples and features."
    )

    args = parser.parse_args()
    args.max_query_length += 2  # position for token yes and no
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s: %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        level=logging.INFO,
    )

    logging.info("All input parameters:")
    print(json.dumps(vars(args), sort_keys=False, indent=2))

    tokenizer = BertTokenizer.from_pretrained(args.tokenizer_path)

    if not os.path.exists(args.output_path):
        os.makedirs(args.output_path)

    convert_and_write(args, tokenizer, args.input_file, args.output_prefix + "_examples.pkl.gz",
                      args.output_prefix + "_features.pkl.gz", args.for_training)


if __name__ == "__main__":
    main()

Run command:

python data_process.py --input_file data_sample/cail2021_mrc_small.json --output_prefix cail2021_mrc_small --tokenizer_path model_hub/chinese-bert-wwm-ext --max_seq_length 512 --max_query_length 64 --doc_stride 128 --do_lower_case --for_training