UPDATE: Testing the same code with tensorflow-gpu 1.13.1 works both on my PC and on Google Cloud.
Using TensorFlow Estimator and running train_and_evaluate gives me the following error message:
"ValueError: Tensor("Const:0", shape=(3,), dtype=float32) must be from the same graph as Tensor("ParallelMapDataset:0", shape=(), dtype=variant, device=/device:CPU:0)." (see the full error output near bottom)
This happens when training the CNN on my PC with a GPU (GeForge RTX 2070). I am using Python 3.7 with tensorflow-gpu/tensorflow 1.14.0, Keras 2.2.4, running in a Conda environment.
It happens after the following log message "... Saving checkpoints for 2716 into C:/EstimatorOutput/10/model.ckpt." and appear to be when the input function for the evaluation step is being processed.
The code, as it is now, has run previously with no issue, but this has suddenly changed for reasons that are unclear to me.
I ran similar code on Google Cloud (which also previously ran fine), and the same problem occur (see error output near bottom; Run on GPU (BASIC_GPU); TensorFlow 1.14; Keras 2.2.4)
The error seems to be related to the evaluation step when the graph is created for some reason the new graph is not compatible.
Here is my code - >
My task module:
import tensorflow as tf
from train_model import model #("train_model" is local folder)
from train_model.model import create_estimator
if __name__ == '__main__':
model_num = 10
# Throw properties into params dict to pass to other functions
params = {}
params['train csv'] = "train_set_local.csv"
params['eval csv'] = "eval_set_local.csv"
params['output path'] = "C:/EstimatorOutput/" + str(model_num) + "/"
params['data path'] = "C:/Databases/Birds_dB/Images"
params['image size'] = [244, 224]
params["batch size"] = 16*2
params['use random flip'] = True
params['learning rate'] = 0.000001
params['dropout rate'] = 0.50
params['num classes'] = 123
params['train steps'] = 65000
params['eval steps'] = 20
params['eval_throttle_secs'] = 600
params['num parallel calls'] = 4
# Run the training job
model.go_train(params) # (See "go_train" below in model script ->)
My model module
import tensorflow as tf
from tensorflow.python.keras import estimator as kes
from tensorflow.python.keras.applications.vgg16 import VGG16
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Dropout, Flatten, Dense
from train_model.input_fn import make_input_fn
def create_estimator(params):
# Import VGG16 model for transfer learning
base_model = VGG16(weights='imagenet')
base_model.summary()
x = base_model.get_layer('fc2').output
x = Dropout(params['dropout rate'])(x)
predictions = Dense(params['num classes'], activation="sigmoid", name="sm_out")(x)
model = Model(inputs=base_model.input, outputs=predictions)
for layer in model.layers:
layer.trainable = True
model.compile(
loss="binary_crossentropy",
optimizer=tf.train.AdamOptimizer(params['learning rate'],
beta1=0.9,
beta2=0.999),
metrics=["categorical_accuracy"]
)
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.per_process_gpu_memory_fraction = 0.95
run_config = tf.estimator.RunConfig(
session_config=config,
model_dir=params['output path']
)
# Convert to Estimator
estimator_model = kes.model_to_estimator(
keras_model=model,
config=run_config
)
return estimator_model
def go_train(params):
# Create the estimator
Est = create_estimator(params)
# Set up Estimator train and evaluation specifications
train_spec = tf.estimator.TrainSpec(
input_fn=make_input_fn(params['train csv'], tf.estimator.ModeKeys.TRAIN, params, augment=True),
max_steps=params['train steps']
)
eval_spec = tf.estimator.EvalSpec(
input_fn=make_input_fn(params['eval csv'], tf.estimator.ModeKeys.EVAL, params, augment=True),
steps=params['eval steps'], # Evaluates on "eval steps" batches
throttle_secs=params['eval_throttle_secs']
)
# Run training and evaluation
tf.estimator.train_and_evaluate(Est, train_spec, eval_spec)
My input module:
import tensorflow as tf
from keras.applications.vgg16 import preprocess_input
tf.logging.set_verbosity(v=tf.logging.INFO)
HEIGHT = 224
WIDTH = 224
NUM_CHANNELS = 3
NCLASSES = 123
def read_and_preprocess_with_augment(image_bytes, label=None):
return read_and_preprocess(image_bytes, label, augment=True)
def read_and_preprocess(image_bytes, label=None, augment=False):
image = tf.image.decode_jpeg(contents=image_bytes, channels=NUM_CHANNELS)
image = tf.image.convert_image_dtype(image=image, dtype=tf.float32) # 0-1
image = tf.expand_dims(input=image, axis=0) # resize_bilinear needs batches
if augment:
# Resize to slightly larger than target size
image = tf.image.resize_bilinear(images=image, size=[HEIGHT + 50, WIDTH + 50], align_corners=False)
# Image random rotation
degree_angle = tf.random.uniform((), minval=-25, maxval=25, dtype=tf.dtypes.float32)
radian = degree_angle * 3.14 / 180
image = tf.contrib.image.rotate(image, radian, interpolation='NEAREST')
# remove batch dimension
image = tf.squeeze(input=image, axis=0)
# Random Crop
image = tf.random_crop(value=image, size=[HEIGHT, WIDTH, NUM_CHANNELS])
# Random L-R flip
image = tf.image.random_flip_left_right(image=image)
# Random brightness
image = tf.image.random_brightness(image=image, max_delta=63.0 / 255.0)
# Random contrast
image = tf.image.random_contrast(image=image, lower=0.2, upper=1.8)
else:
image = tf.image.resize_bilinear(images=image, size=[HEIGHT, WIDTH], align_corners=False)
image = tf.squeeze(input=image, axis=0) # remove batch dimension
image = tf.cast(tf.round(image * 255), tf.int32)
image = preprocess_input(image)
label = tf.one_hot(tf.strings.to_number(label, out_type=tf.int32), depth=NCLASSES)
return {"input_1": image}, label
def make_input_fn(csv_of_filenames, mode, params, augment=False):
def _input_fn():
def decode_csv(csv_row):
filename, label = tf.decode_csv(records=csv_row, record_defaults=[[""], [""]])
image_bytes = tf.read_file(filename=filename)
return image_bytes, label
# Create tf.data.dataset from filename
dataset = tf.data.TextLineDataset(filenames=csv_of_filenames).map(map_func=decode_csv, num_parallel_calls=params['num parallel calls'])
if augment:
dataset = dataset.map(map_func=read_and_preprocess_with_augment, num_parallel_calls=params['num parallel calls'])
else:
dataset = dataset.map(map_func=read_and_preprocess, num_parallel_calls=params['num parallel calls'])
if mode == tf.estimator.ModeKeys.TRAIN:
num_epochs = None
dataset = dataset.shuffle(buffer_size=10*params["batch size"])
else:
num_epochs = 1
dataset = dataset.repeat(count=num_epochs).batch(batch_size=params["batch size"]).prefetch(4)
images, labels = dataset.make_one_shot_iterator().get_next()
return images, labels
return _input_fn
Error output on PC
As mentioned, the above code when running locally on my GPU results is this series of error messages(abbreviated):
Saving checkpoints for 2716 into .... ... ... File "C:...\estimator.py", line 501, in _evaluate self._evaluate_build_graph(input_fn, hooks, checkpoint_path))
File "C:...\estimator.py", line 1501, in _evaluate_build_graph self._call_model_fn_eval(input_fn, self.config))
File "C:...\estimator.py", line 1534, in _call_model_fn_eval input_fn, ModeKeys.EVAL)
File "C:...\estimator.py", line 1022, in _get_features_and_labels_from_input_fn self._call_input_fn(input_fn, mode))
File "C:...\estimator.py", line 1113, in _call_input_fn return input_fn(**kwargs)
File "C:...\input_fn.py", line 71, in _input_fn dataset = dataset.map(map_func=read_and_preprocess_with_augment, num_parallel_calls=params['num parallel calls'])
File "C:...dataset_ops.py", line 1776, in map self, map_func, num_parallel_calls, preserve_cardinality=False))
File "C:...\dataset_ops.py", line 3239, in init **flat_structure(self))
File "C:...\gen_dataset_ops.py", line 4179, in parallel_map_dataset name=name)
File "C:...\op_def_library.py", line 366, in _apply_op_helper g = ops._get_graph_from_inputs(_Flatten(keywords.values()))
File "C:...\ops.py", line 6135, in _get_graph_from_inputs _assert_same_graph(original_graph_element, graph_element)
File "C:...ops.py", line 6071, in _assert_same_graph (item, original_item))
ValueError: Tensor("Const:0", shape=(3,), dtype=float32) must be from the same graph as Tensor("ParallelMapDataset:0", shape=(), dtype=variant, device=/device:CPU:0).
Error output on Google Cloud
service The replica master 0 exited with a non-zero status of 1. Traceback (most recent call last): [...]
File "/usr/local/lib/python3.5/dist-packages/tensorflow_estimator/python/estimator/estimator.py", line 1534, in _call_model_fn_eval input_fn, ModeKeys.EVAL)
File "/usr/local/lib/python3.5/dist-packages/tensorflow_estimator/python/estimator/estimator.py", line 1022, in _get_features_and_labels_from_input_fn self._call_input_fn(input_fn, mode))
File "/usr/local/lib/python3.5/dist-packages/tensorflow_estimator/python/estimator/estimator.py", line 1113, in _call_input_fn return input_fn(**kwargs)
File "/root/.local/lib/python3.5/site-packages/train_model/input_fn.py", line 87, in _input_fn dataset = dataset.map(map_func=read_and_preprocess_with_augment, num_parallel_calls=params['num parallel calls'])
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/data/ops/dataset_ops.py", line 1776, in map self, map_func, num_parallel_calls, preserve_cardinality=False))
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/data/ops/dataset_ops.py", line 3239, in init **flat_structure(self)) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/gen_dataset_ops.py", line 4179, in parallel_map_dataset name=name) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py", line 366, in _apply_op_helper g = ops._get_graph_from_inputs(_Flatten(keywords.values()))
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 6135, in _get_graph_from_inputs _assert_same_graph(original_graph_element, graph_element)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 6071, in _assert_same_graph (item, original_item))
ValueError: Tensor("Const_1:0", shape=(3,), dtype=float32, device=/device:CPU:0) must be from the same graph as Tensor("ParallelMapDataset:0", shape=(), dtype=variant, device=/device:CPU:0).
Any help/hint is much appreciated. I am stuck at this point and do not know how to debug this one!