I am training a BERT model.
Could someone explain what the following error message means?
Condition x == y did not hold element wise
I am following a reference Colab notebook.
My code is:
!pip install bert-for-tf2

import math
import os
import numpy as np   # needed for np.array in FakeNewsData
import pandas as pd  # needed for pd.DataFrame in data_augumentation
from tqdm import tqdm
import tensorflow as tf
from tensorflow import keras
import bert
from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights
from bert.tokenization.bert_tokenization import FullTokenizer
%%time
bert_ckpt_dir="gs://bert_models/2018_10_18/uncased_L-12_H-768_A-12/"
bert_ckpt_file = bert_ckpt_dir + "bert_model.ckpt"
bert_config_file = bert_ckpt_dir + "bert_config.json"
bert_model_dir="2018_10_18"
bert_model_name="uncased_L-12_H-768_A-12"
!mkdir -p .model .model/$bert_model_name
for fname in ["bert_config.json", "vocab.txt", "bert_model.ckpt.meta", "bert_model.ckpt.index", "bert_model.ckpt.data-00000-of-00001"]:
cmd = f"gsutil cp gs://bert_models/{bert_model_dir}/{bert_model_name}/{fname} .model/{bert_model_name}"
!$cmd
!ls -la .model .model/$bert_model_name
bert_ckpt_dir = os.path.join(".model/",bert_model_name)
bert_ckpt_file = os.path.join(bert_ckpt_dir, "bert_model.ckpt")
bert_config_file = os.path.join(bert_ckpt_dir, "bert_config.json")
class FakeNewsData:
    """
    Preprocess the text into BERT features.
    max_seq_len: the specified maximum sequence length
    tokenizer: the BERT tokenizer
    """
    DATA_COLUMN = "text"
    LABEL_COLUMN = "label"

    def __init__(self, tokenizer, train, validation, test, max_seq_len=150):
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len
        ((self.train_x, self.train_y),
         (self.val_x, self.val_y),
         (self.test_x, self.test_y)) = map(self._prepare, [train, validation, test])
        ((self.train_x, self.train_x_token_types),
         (self.val_x, self.val_x_token_types),
         (self.test_x, self.test_x_token_types)) = map(self._pad,
                                                       [self.train_x, self.val_x, self.test_x])
    def _prepare(self, df):
        """
        Add start and end tokens to each sequence and convert the text into token ids.
        """
        x, y = [], []
        with tqdm(total=df.shape[0], unit_scale=True) as pbar:
            for ndx, row in df.iterrows():
                text, label = row[FakeNewsData.DATA_COLUMN], row[FakeNewsData.LABEL_COLUMN]
                tokens = self.tokenizer.tokenize(text)
                tokens = ["[CLS]"] + tokens + ["[SEP]"]
                token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
                # grow max_seq_len to the longest token sequence seen so far
                self.max_seq_len = max(self.max_seq_len, len(token_ids))
                x.append(token_ids)
                y.append(int(label))
                pbar.update()
        return np.array(x), np.array(y)
    def _pad(self, ids):
        """
        Pad each sequence with [0] up to the specified maximum sequence length.
        """
        x, t = [], []
        token_type_ids = [0] * self.max_seq_len
        for input_ids in ids:
            input_ids = input_ids[:min(len(input_ids), self.max_seq_len - 2)]
            input_ids = input_ids + [0] * (self.max_seq_len - len(input_ids))
            x.append(np.array(input_ids))
            t.append(token_type_ids)
        return np.array(x), np.array(t)
%%time
tokenizer = FullTokenizer(vocab_file=os.path.join(bert_ckpt_dir, "vocab.txt"))
data = FakeNewsData(tokenizer,
                    train=train_df,
                    validation=val_df,
                    test=test_df,
                    max_seq_len=150)
def create_model(max_seq_len, lr=1e-5):
    """
    Create a BERT classification model.
    The model architecture is: raw input -> BERT layer -> dropout layer (to prevent
    overfitting) -> dense layer that outputs the predicted probabilities.
    max_seq_len: maximum sequence length
    lr: learning rate of the optimizer
    """
    # create the bert layer
    with tf.io.gfile.GFile(bert_config_file, "r") as reader:
        bc = StockBertConfig.from_json_string(reader.read())
        bert_params = map_stock_config_to_params(bc)
        bert = BertModelLayer.from_params(bert_params, name="bert")

    input_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32', name="input_ids")
    output = bert(input_ids)
    print("bert shape", output.shape)

    cls_out = keras.layers.Lambda(lambda seq: seq[:, 0, :])(output)
    # Dropout layer
    cls_out = keras.layers.Dropout(0.8)(cls_out)
    # Dense layer with probability output
    logits = keras.layers.Dense(units=2, activation="softmax")(cls_out)

    model = keras.Model(inputs=input_ids, outputs=logits)
    model.build(input_shape=(None, max_seq_len))

    # load the pre-trained model weights
    load_stock_weights(bert, bert_ckpt_file)

    model.compile(optimizer=keras.optimizers.Adam(learning_rate=lr),
                  loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")])
    model.summary()
    return model
model = create_model(max_seq_len = data.max_seq_len, lr = 1e-5)
Here is the output:
InvalidArgumentError Traceback (most recent call last)
<ipython-input-25-578d63d49a0e> in <module>()
39 return model
40
---> 41 model = create_model(max_seq_len = data.max_seq_len, lr = 1e-5)
3 frames
/usr/local/lib/python3.6/dist-packages/tensorflow/python/autograph/impl/api.py in wrapper(*args, **kwargs)
256 except Exception as e: # pylint:disable=broad-except
257 if hasattr(e, 'ag_error_metadata'):
--> 258 raise e.ag_error_metadata.to_exception(e)
259 else:
260 raise
InvalidArgumentError: in user code:
/usr/local/lib/python3.6/dist-packages/bert/model.py:79 call *
embedding_output = self.embeddings_layer(inputs, mask=mask, training=training)
/usr/local/lib/python3.6/dist-packages/bert/embeddings.py:223 call *
pos_embeddings = self.position_embeddings_layer(seq_len)
/usr/local/lib/python3.6/dist-packages/bert/embeddings.py:48 call *
assert_op = tf.compat.v2.debugging.assert_less_equal(seq_len, self.params.max_position_embeddings)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/util/dispatch.py:201 wrapper **
return target(*args, **kwargs)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/check_ops.py:938 assert_less_equal_v2
summarize=summarize, message=message, name=name)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/util/dispatch.py:201 wrapper
return target(*args, **kwargs)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/check_ops.py:947 assert_less_equal
np.less_equal, x, y, data, summarize, message, name)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/check_ops.py:372 _binary_assert
_assert_static(condition_static, data)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/check_ops.py:87 _assert_static
message='\n'.join(data_static))
InvalidArgumentError: Condition x <= y did not hold element-wise:
x (bert/embeddings/Const_2:0) =
9755
y (bert/embeddings/position_embeddings/assert_less_equal_2/y:0) =
512
Long document preprocessing:
def get_split(text):
    """
    Split each news text into subtexts no longer than 150 words,
    using a stride of 120 words (i.e. overlapping windows).
    """
    l_total = []
    l_parcial = []
    if len(text.split()) // 120 > 0:
        n = len(text.split()) // 120
    else:
        n = 1
    for w in range(n):
        if w == 0:
            l_parcial = text.split()[:150]
            l_total.append(" ".join(l_parcial))
        else:
            l_parcial = text.split()[w*120:w*120 + 150]
            l_total.append(" ".join(l_parcial))
    return l_total
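For example, on a synthetic 300-word text the splitter returns two overlapping 150-word windows (a hypothetical sanity check, not part of the training pipeline):

# hypothetical sanity check of get_split on a 300-word dummy text
sample = " ".join(f"w{i}" for i in range(300))
chunks = get_split(sample)
print(len(chunks))                        # 2, since 300 // 120 == 2
print([len(c.split()) for c in chunks])   # [150, 150]: words 0-149 and 120-269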
train['text_split'] = train['text'].apply(get_split)
val['text_split'] = val['text'].apply(get_split)
test['text_split'] = test['text'].apply(get_split)
def data_augumentation(df, df_name):
    """
    Create a new dataframe from the original one, because one text may now contain
    multiple subtexts of at most 150 words.
    'text' corresponds to a subtext of the original text, while 'index' corresponds
    to its index in the original set.
    """
    text_l = []
    label_l = []
    index_l = []
    for idx, row in df.iterrows():
        for l in row['text_split']:
            text_l.append(l)
            label_l.append(row['label'])
            index_l.append(idx)
    new_df = pd.DataFrame({'text': text_l, 'label': label_l, 'index': index_l})
    print("The " + df_name + " set now has " + str(len(new_df)) + ' subtexts extracted from ' + str(len(df)) + ' texts.')
    return new_df
train_df = data_augumentation(train, df_name = 'training')
val_df = data_augumentation(val, df_name = 'validation')
test_df = data_augumentation(test, df_name = 'testing')
Feeding short sequences to my model works. Do the long sequences come from the long-document preprocessing? I cannot find where the problem is. Please help, thank you.
Posted on 2020-12-01 16:01:02
BERT's maximum sequence length is limited to 512. Try feeding your model short sequences. If that works, check your data: there is a very long sequence somewhere.
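In the traceback the offending length is 9755 (the x in the assert) versus the 512 position embeddings of the checkpoint (the y). It grows because _prepare raises self.max_seq_len to the longest tokenized text it sees. A minimal sketch of a workaround, assuming the FakeNewsData class and create_model as posted above (the names MAX_BERT_LEN and CappedFakeNewsData are made up for illustration), is to check the data and cap that value at 512 before padding:

MAX_BERT_LEN = 512  # BERT base ships with 512 position embeddings

# quick diagnostic: how long is the longest tokenized text actually fed in?
longest = max(len(tokenizer.tokenize(t)) for t in train_df['text'])
print("longest tokenized text:", longest)

class CappedFakeNewsData(FakeNewsData):
    """Same preprocessing as above, but max_seq_len never exceeds 512."""

    def _prepare(self, df):
        x, y = super()._prepare(df)
        # the parent grows max_seq_len to the longest sequence (9755 here);
        # cap it so the position-embedding assert cannot fire
        self.max_seq_len = min(self.max_seq_len, MAX_BERT_LEN)
        return x, y

data = CappedFakeNewsData(tokenizer,
                          train=train_df,
                          validation=val_df,
                          test=test_df,
                          max_seq_len=150)
model = create_model(max_seq_len=data.max_seq_len, lr=1e-5)

Anything longer than 512 wordpiece tokens is then simply truncated by _pad; the long-document splitting shown above is the cleaner alternative, provided the split subtexts are what actually gets tokenized.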
https://stackoverflow.com/questions/65085991