1
votes

UPDATE: Testing the same code with tensorflow-gpu 1.13.1 works both on my PC and on Google Cloud.


Using TensorFlow Estimator and running train_and_evaluate gives me the following error message:

"ValueError: Tensor("Const:0", shape=(3,), dtype=float32) must be from the same graph as Tensor("ParallelMapDataset:0", shape=(), dtype=variant, device=/device:CPU:0)." (see the full error output near bottom)

This happens when training the CNN on my PC with a GPU (GeForge RTX 2070). I am using Python 3.7 with tensorflow-gpu/tensorflow 1.14.0, Keras 2.2.4, running in a Conda environment.

It happens after the following log message "... Saving checkpoints for 2716 into C:/EstimatorOutput/10/model.ckpt." and appear to be when the input function for the evaluation step is being processed.

The code, as it is now, has run previously with no issue, but this has suddenly changed for reasons that are unclear to me.

I ran similar code on Google Cloud (which also previously ran fine), and the same problem occur (see error output near bottom; Run on GPU (BASIC_GPU); TensorFlow 1.14; Keras 2.2.4)

The error seems to be related to the evaluation step when the graph is created for some reason the new graph is not compatible.

Here is my code - >

My task module:

import tensorflow as tf
from train_model import model #("train_model" is local folder)
from train_model.model import create_estimator 

if __name__ == '__main__':

    model_num = 10

    # Throw properties into params dict to pass to other functions
    params = {}
    params['train csv'] = "train_set_local.csv"
    params['eval csv'] = "eval_set_local.csv"
    params['output path'] = "C:/EstimatorOutput/" + str(model_num) + "/"
    params['data path'] = "C:/Databases/Birds_dB/Images"
    params['image size'] = [244, 224]
    params["batch size"] = 16*2
    params['use random flip'] = True
    params['learning rate'] = 0.000001  
    params['dropout rate'] = 0.50
    params['num classes'] = 123
    params['train steps'] = 65000
    params['eval steps'] = 20
    params['eval_throttle_secs'] = 600
    params['num parallel calls'] = 4

    # Run the training job
    model.go_train(params) # (See "go_train" below in model script ->)

My model module

import tensorflow as tf
from tensorflow.python.keras import estimator as kes
from tensorflow.python.keras.applications.vgg16 import VGG16
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Dropout, Flatten, Dense
from train_model.input_fn import make_input_fn


def create_estimator(params):
    # Import VGG16 model for transfer learning
    base_model = VGG16(weights='imagenet')
    base_model.summary()

    x = base_model.get_layer('fc2').output

    x = Dropout(params['dropout rate'])(x)

    predictions = Dense(params['num classes'], activation="sigmoid", name="sm_out")(x)

    model = Model(inputs=base_model.input, outputs=predictions)

    for layer in model.layers:
        layer.trainable = True

    model.compile(
        loss="binary_crossentropy",
        optimizer=tf.train.AdamOptimizer(params['learning rate'],
                                         beta1=0.9,
                                         beta2=0.999),
        metrics=["categorical_accuracy"]
    )



    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = 0.95
    run_config = tf.estimator.RunConfig(
            session_config=config,
            model_dir=params['output path']
    )

    # Convert to Estimator
    estimator_model = kes.model_to_estimator(
        keras_model=model,
        config=run_config
    )

    return estimator_model


def go_train(params):
    # Create the estimator
    Est = create_estimator(params)

    # Set up Estimator train and evaluation specifications
    train_spec = tf.estimator.TrainSpec(
        input_fn=make_input_fn(params['train csv'], tf.estimator.ModeKeys.TRAIN, params, augment=True),
        max_steps=params['train steps']
    )
    eval_spec = tf.estimator.EvalSpec(
        input_fn=make_input_fn(params['eval csv'], tf.estimator.ModeKeys.EVAL, params, augment=True),
        steps=params['eval steps'],  # Evaluates on "eval steps" batches
        throttle_secs=params['eval_throttle_secs']
    )


    # Run training and evaluation
    tf.estimator.train_and_evaluate(Est, train_spec, eval_spec)

My input module:

import tensorflow as tf
from keras.applications.vgg16 import preprocess_input

tf.logging.set_verbosity(v=tf.logging.INFO)

HEIGHT = 224
WIDTH = 224
NUM_CHANNELS = 3
NCLASSES = 123


def read_and_preprocess_with_augment(image_bytes, label=None):
    return read_and_preprocess(image_bytes, label, augment=True)


def read_and_preprocess(image_bytes, label=None, augment=False):

    image = tf.image.decode_jpeg(contents=image_bytes, channels=NUM_CHANNELS)
    image = tf.image.convert_image_dtype(image=image, dtype=tf.float32)  # 0-1
    image = tf.expand_dims(input=image, axis=0)  # resize_bilinear needs batches

    if augment:

        # Resize to slightly larger than target size
        image = tf.image.resize_bilinear(images=image, size=[HEIGHT + 50, WIDTH + 50], align_corners=False)

        # Image random rotation
        degree_angle = tf.random.uniform((), minval=-25, maxval=25, dtype=tf.dtypes.float32)
        radian = degree_angle * 3.14 / 180
        image = tf.contrib.image.rotate(image, radian, interpolation='NEAREST')

        # remove batch dimension
        image = tf.squeeze(input=image, axis=0)

        # Random Crop
        image = tf.random_crop(value=image, size=[HEIGHT, WIDTH, NUM_CHANNELS])
        # Random L-R flip
        image = tf.image.random_flip_left_right(image=image)
        # Random brightness
        image = tf.image.random_brightness(image=image, max_delta=63.0 / 255.0)
        # Random contrast
        image = tf.image.random_contrast(image=image, lower=0.2, upper=1.8)

    else:
        image = tf.image.resize_bilinear(images=image, size=[HEIGHT, WIDTH], align_corners=False)
        image = tf.squeeze(input=image, axis=0)  # remove batch dimension

    image = tf.cast(tf.round(image * 255), tf.int32)
    image = preprocess_input(image)

    label = tf.one_hot(tf.strings.to_number(label, out_type=tf.int32), depth=NCLASSES)

    return {"input_1": image}, label


def make_input_fn(csv_of_filenames, mode, params, augment=False):
    def _input_fn():
        def decode_csv(csv_row):
            filename, label = tf.decode_csv(records=csv_row, record_defaults=[[""], [""]])
            image_bytes = tf.read_file(filename=filename)
            return image_bytes, label

        # Create tf.data.dataset from filename
        dataset = tf.data.TextLineDataset(filenames=csv_of_filenames).map(map_func=decode_csv, num_parallel_calls=params['num parallel calls'])

        if augment:
            dataset = dataset.map(map_func=read_and_preprocess_with_augment, num_parallel_calls=params['num parallel calls'])
        else:
            dataset = dataset.map(map_func=read_and_preprocess, num_parallel_calls=params['num parallel calls'])

        if mode == tf.estimator.ModeKeys.TRAIN:
            num_epochs = None  
            dataset = dataset.shuffle(buffer_size=10*params["batch size"])
        else:
            num_epochs = 1  

        dataset = dataset.repeat(count=num_epochs).batch(batch_size=params["batch size"]).prefetch(4)
        images, labels = dataset.make_one_shot_iterator().get_next()

        return images, labels
    return _input_fn

Error output on PC

As mentioned, the above code when running locally on my GPU results is this series of error messages(abbreviated):

Saving checkpoints for 2716 into .... ... ... File "C:...\estimator.py", line 501, in _evaluate self._evaluate_build_graph(input_fn, hooks, checkpoint_path))

File "C:...\estimator.py", line 1501, in _evaluate_build_graph self._call_model_fn_eval(input_fn, self.config))

File "C:...\estimator.py", line 1534, in _call_model_fn_eval input_fn, ModeKeys.EVAL)

File "C:...\estimator.py", line 1022, in _get_features_and_labels_from_input_fn self._call_input_fn(input_fn, mode))

File "C:...\estimator.py", line 1113, in _call_input_fn return input_fn(**kwargs)

File "C:...\input_fn.py", line 71, in _input_fn dataset = dataset.map(map_func=read_and_preprocess_with_augment, num_parallel_calls=params['num parallel calls'])

File "C:...dataset_ops.py", line 1776, in map self, map_func, num_parallel_calls, preserve_cardinality=False))

File "C:...\dataset_ops.py", line 3239, in init **flat_structure(self))

File "C:...\gen_dataset_ops.py", line 4179, in parallel_map_dataset name=name)

File "C:...\op_def_library.py", line 366, in _apply_op_helper g = ops._get_graph_from_inputs(_Flatten(keywords.values()))

File "C:...\ops.py", line 6135, in _get_graph_from_inputs _assert_same_graph(original_graph_element, graph_element)

File "C:...ops.py", line 6071, in _assert_same_graph (item, original_item))

ValueError: Tensor("Const:0", shape=(3,), dtype=float32) must be from the same graph as Tensor("ParallelMapDataset:0", shape=(), dtype=variant, device=/device:CPU:0).

Error output on Google Cloud

service The replica master 0 exited with a non-zero status of 1. Traceback (most recent call last): [...]

File "/usr/local/lib/python3.5/dist-packages/tensorflow_estimator/python/estimator/estimator.py", line 1534, in _call_model_fn_eval input_fn, ModeKeys.EVAL)

File "/usr/local/lib/python3.5/dist-packages/tensorflow_estimator/python/estimator/estimator.py", line 1022, in _get_features_and_labels_from_input_fn self._call_input_fn(input_fn, mode))

File "/usr/local/lib/python3.5/dist-packages/tensorflow_estimator/python/estimator/estimator.py", line 1113, in _call_input_fn return input_fn(**kwargs)

File "/root/.local/lib/python3.5/site-packages/train_model/input_fn.py", line 87, in _input_fn dataset = dataset.map(map_func=read_and_preprocess_with_augment, num_parallel_calls=params['num parallel calls'])

File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/data/ops/dataset_ops.py", line 1776, in map self, map_func, num_parallel_calls, preserve_cardinality=False))

File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/data/ops/dataset_ops.py", line 3239, in init **flat_structure(self)) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/gen_dataset_ops.py", line 4179, in parallel_map_dataset name=name) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py", line 366, in _apply_op_helper g = ops._get_graph_from_inputs(_Flatten(keywords.values()))

File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 6135, in _get_graph_from_inputs _assert_same_graph(original_graph_element, graph_element)

File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 6071, in _assert_same_graph (item, original_item))

ValueError: Tensor("Const_1:0", shape=(3,), dtype=float32, device=/device:CPU:0) must be from the same graph as Tensor("ParallelMapDataset:0", shape=(), dtype=variant, device=/device:CPU:0).

Any help/hint is much appreciated. I am stuck at this point and do not know how to debug this one!

1

1 Answers

1
votes

use this preprocess function:

from tensorflow.keras.applications.mobilenet import preprocess_input

It has same functionality to that of VGGs preprocess input.