I have a memory problem on a TPU v2. I would like to experiment with a large model, but unfortunately it does not fit in memory. I would like to use bfloat16 to save memory, but I get an error when I call the model:
import tensorflow as tf
from transformers import TFXLMRobertaModel

try:
    # TPU detection. No parameters necessary if the TPU_NAME environment
    # variable is set: this is always the case on Kaggle.
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', resolver.master())
except ValueError:
    resolver = None

if resolver:
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.experimental.TPUStrategy(resolver)
else:
    # Default distribution strategy in TensorFlow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

policy = tf.keras.mixed_precision.experimental.Policy('mixed_bfloat16')
tf.keras.mixed_precision.experimental.set_policy(policy)

with strategy.scope():
    model = CustomModel(TFXLMRobertaModel.from_pretrained("jplu/tf-xlm-roberta-large"), num_classes=5)
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
    optimizer = tf.mixed_precision.LossScaleOptimizer(optimizer, loss_scale='dynamic')
    model.compile(optimizer=optimizer, loss=['mse'])
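For reference, right after setting the policy I print what Keras reports as the global policy, just to confirm which dtypes it intends to use (a small diagnostic sketch against the TF 2.1 experimental API, nothing model-specific):

# Diagnostic only: show the compute/variable dtypes of the active global policy.
# With 'mixed_bfloat16' the compute dtype should be bfloat16 and the variable
# dtype float32.
active_policy = tf.keras.mixed_precision.experimental.global_policy()
print('Compute dtype:', active_policy.compute_dtype)
print('Variable dtype:', active_policy.variable_dtype)

The failure then happens as soon as from_pretrained builds the network with its dummy inputs: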
InvalidArgumentError                      Traceback (most recent call last)
<ipython-input> in <module>()
      3 with strategy.scope():
      4
----> 5     model = CustomModel(TFXLMRobertaModel.from_pretrained("jplu/tf-xlm-roberta-large"), num_classes=5)
      6     optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
      7     optimizer = tf.mixed_precision.LossScaleOptimizer(optimizer, loss_scale='dynamic')

13 frames
/usr/local/lib/python3.6/dist-packages/transformers/modeling_tf_utils.py in from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
    399             return load_pytorch_checkpoint_in_tf2_model(model, resolved_archive_file, allow_missing_keys=True)
    400
--> 401         model(model.dummy_inputs, training=False)  # build the network with dummy inputs
    402
    403         assert os.path.isfile(resolved_archive_file), "Error retrieving file {}".format(resolved_archive_file)

/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py in __call__(self, *args, **kwargs)
    966           with base_layer_utils.autocast_context_manager(
    967               self._compute_dtype):
--> 968             outputs = self.call(cast_inputs, *args, **kwargs)
    969           self._handle_activity_regularization(inputs, outputs)
    970           self._set_mask_metadata(inputs, outputs, input_masks)

/usr/local/lib/python3.6/dist-packages/transformers/modeling_tf_roberta.py in call(self, inputs, **kwargs)
    222
    223         """
--> 224         outputs = self.roberta(inputs, **kwargs)
    225         return outputs
    226

/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py in __call__(self, *args, **kwargs)
    966           with base_layer_utils.autocast_context_manager(
    967               self._compute_dtype):
--> 968             outputs = self.call(cast_inputs, *args, **kwargs)
    969           self._handle_activity_regularization(inputs, outputs)
    970           self._set_mask_metadata(inputs, outputs, input_masks)

/usr/local/lib/python3.6/dist-packages/transformers/modeling_tf_bert.py in call(self, inputs, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, training)
    567         # head_mask = tf.constant([0] * self.num_hidden_layers)
    568
--> 569         embedding_output = self.embeddings([input_ids, position_ids, token_type_ids, inputs_embeds], training=training)
    570         encoder_outputs = self.encoder([embedding_output, extended_attention_mask, head_mask], training=training)
    571

/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py in __call__(self, *args, **kwargs)
    966           with base_layer_utils.autocast_context_manager(
    967               self._compute_dtype):
--> 968             outputs = self.call(cast_inputs, *args, **kwargs)
    969           self._handle_activity_regularization(inputs, outputs)
    970           self._set_mask_metadata(inputs, outputs, input_masks)

/usr/local/lib/python3.6/dist-packages/transformers/modeling_tf_bert.py in call(self, inputs, mode, training)
    146         """
    147         if mode == "embedding":
--> 148             return self._embedding(inputs, training=training)
    149         elif mode == "linear":
    150             return self._linear(inputs)

/usr/local/lib/python3.6/dist-packages/transformers/modeling_tf_roberta.py in _embedding(self, inputs, training)
     79             position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
     80
---> 81         return super()._embedding([input_ids, position_ids, token_type_ids, inputs_embeds], training=training)
     82
     83

/usr/local/lib/python3.6/dist-packages/transformers/modeling_tf_bert.py in _embedding(self, inputs, training)
    173
    174         embeddings = inputs_embeds + position_embeddings + token_type_embeddings
--> 175         embeddings = self.LayerNorm(embeddings)
    176         embeddings = self.dropout(embeddings, training=training)
    177         return embeddings

/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py in __call__(self, *args, **kwargs)
    962       # Eager execution on data tensors.
    963       with backend.name_scope(self._name_scope()):
--> 964         self._maybe_build(inputs)
    965         cast_inputs = self._maybe_cast_inputs(inputs)
    966         with base_layer_utils.autocast_context_manager(

/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py in _maybe_build(self, inputs)
   2406           self._dtype_policy = policy.Policy(dtype)
   2407     input_shapes = None
-> 2408     if all(hasattr(x, 'shape') for x in input_list):
   2409       input_shapes = nest.map_structure(lambda x: x.shape, inputs)
   2410     # Only call `build` if the user has manually overridden the build method.

/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py in <genexpr>(.0)
   2406           self._dtype_policy = policy.Policy(dtype)
   2407     input_shapes = None
-> 2408     if all(hasattr(x, 'shape') for x in input_list):
   2409       input_shapes = nest.map_structure(lambda x: x.shape, inputs)
   2410     # Only call `build` if the user has manually overridden the build method.

/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py in shape(self)
   1065         self._tensor_shape = tensor_shape.TensorShape(self._shape_tuple())
   1066       except core._NotOkStatusException as e:
-> 1067         six.raise_from(core._status_to_exception(e.code, e.message), None)
   1068
   1069     return self._tensor_shape

/usr/local/lib/python3.6/dist-packages/six.py in raise_from(value, from_value)

InvalidArgumentError: cannot compute AddV2 as input #1 (zero-based) was expected to be a bfloat16 tensor but is a float tensor
I suppose I have to cast something in the model? If I read the last frames correctly, a float32 tensor is being added to a bfloat16 one inside _embedding, which is why AddV2 complains. How can I fix that? I am using TensorFlow 2.1 and a TPU v2. I have seen the thread Memory reduction Tensorflow TPU v2/v3 bfloat16, but it seems to be for TensorFlow 1.x, as the code there does not work for me.
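To make the question concrete, the only workaround I could think of is a guess along these lines: build the pretrained encoder under the default float32 policy and only switch to mixed_bfloat16 for the layers added on top. I have no idea whether this is the kind of cast that is expected, or whether it would still save memory on the TPU:

with strategy.scope():
    # Pure guesswork: keep the pretrained encoder in float32. As far as I
    # understand, Keras layers capture the dtype policy when they are
    # constructed, so the encoder built here should stay float32.
    tf.keras.mixed_precision.experimental.set_policy(
        tf.keras.mixed_precision.experimental.Policy('float32'))
    backbone = TFXLMRobertaModel.from_pretrained("jplu/tf-xlm-roberta-large")

    # Re-enable bfloat16 only for the layers added on top of the encoder.
    tf.keras.mixed_precision.experimental.set_policy(
        tf.keras.mixed_precision.experimental.Policy('mixed_bfloat16'))
    model = CustomModel(backbone, num_classes=5)

If the policy is not the right tool here, a pointer to where a cast should actually go in the transformers model would already help.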