When I run demo.py:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")
model = AutoModel.from_pretrained("distilbert-base-multilingual-cased", return_dict=True)
# print(model)
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
print(count_parameters(model))
inputs = tokenizer("史密斯先生不在,他去看电影了。Mr Smith is not in. He ________ ________to the cinema", return_tensors="pt")
print(inputs)
outputs = model(**inputs)
print(outputs)
the output is:
{'input_ids': tensor([[ 101, 2759, 3417, 4332, 2431, 5600, 2080, 3031, 10064, 2196,
2724, 5765, 5614, 3756, 2146, 1882, 12916, 11673, 10124, 10472,
10106, 119, 10357, 168, 168, 168, 168, 168, 168, 168,
168, 168, 168, 168, 168, 168, 168, 168, 168, 10114,
10105, 18458, 119, 102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
along with the warnings "Using bos_token, but it is not set yet." and "Using eos_token, but it is not set yet." Why are these bos_token/eos_token warnings printed?
Posted on 2020-12-21 09:40:09
The tokenizer's __call__ method has an argument add_special_tokens that defaults to True. This means a BOS (beginning-of-sentence) token is added at the start of the sequence and an EOS (end-of-sentence) token at the end. If you do not want these symbols, you can set add_special_tokens to False.
Note, however, that a model works best when it is used with the same tokenization and special symbols it was trained with. Judging from your example, it looks like you want to feed the model a pair of sentences in different languages. Such pairs are usually separated by a special [SEP] token, so you may want to use the tokenizer's encode_plus method, which encodes sentence pairs correctly for you.
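A minimal sketch of both points, using the same checkpoint as in the question (the completed English sentence is an assumption, since the original left a blank):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")

# Tokenize without the automatically inserted special tokens.
no_specials = tokenizer("Mr Smith is not in.", add_special_tokens=False, return_tensors="pt")

# Encode a sentence pair; the tokenizer joins the two texts with a [SEP] token.
pair = tokenizer.encode_plus(
    "史密斯先生不在,他去看电影了。",
    "Mr Smith has gone to the cinema.",  # assumed completion of the blank
    return_tensors="pt",
)
print(pair["input_ids"])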
Posted on 2022-08-15 13:54:38
I think this is the right way to do it. Let me know if it isn't:
from typing import Union

from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast


def add_special_all_special_tokens(tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]):
    """
    special_tokens_dict = {"cls_token": "<CLS>"}
    num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
    print("We have added", num_added_toks, "tokens")
    # Notice: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
    model.resize_token_embeddings(len(tokenizer))
    assert tokenizer.cls_token == "<CLS>"
    """
    original_len: int = len(tokenizer)
    num_added_toks: dict = {}
    # Only register a replacement for a special token the tokenizer is missing.
    if tokenizer.bos_token is None:
        num_added_toks['bos_token'] = "<bos>"
    if tokenizer.cls_token is None:
        num_added_toks['cls_token'] = "<cls>"
    if tokenizer.sep_token is None:
        num_added_toks['sep_token'] = "<s>"
    if tokenizer.mask_token is None:
        num_added_toks['mask_token'] = "<mask>"
    # num_added_toks = {"bos_token": "<bos>", "cls_token": "<cls>", "sep_token": "<s>", "mask_token": "<mask>"}
    # special_tokens_dict = {'additional_special_tokens': new_special_tokens + tokenizer.all_special_tokens}
    num_new_tokens: int = tokenizer.add_special_tokens(num_added_toks)
    # Every special token should now be set, whether it existed before or was just added.
    assert tokenizer.bos_token is not None
    assert tokenizer.cls_token is not None
    assert tokenizer.sep_token is not None
    assert tokenizer.mask_token is not None
    err_msg = f"Error, not equal: {len(tokenizer)=}, {original_len + num_new_tokens=}"
    assert len(tokenizer) == original_len + num_new_tokens, err_msg
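A quick usage sketch; the gpt2 checkpoint is an assumption chosen for illustration because it ships without cls/sep/mask tokens, so the helper actually has something to add:

from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # assumed checkpoint: lacks cls/sep/mask tokens
model = AutoModel.from_pretrained("gpt2")

add_special_all_special_tokens(tokenizer)
# The embedding matrix must be resized to cover the newly added token ids.
model.resize_token_embeddings(len(tokenizer))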
https://stackoverflow.com/questions/65387101