0
votes

The output feature map of a convolution layer has shape (Batch, Height, Width, Channels). When a CNN is built in TensorFlow, the batch dimension shows up as None. I am trying to implement a Spatial Transformer Network as a custom layer, and to vectorize the layer the way a convolution layer is vectorized, the batch size is required. When I try to build the network, the Spatial Transformer layer raises an error saying that operations can't be performed with a None value.

My code is shown below:

    class SpatialTransformer(Layer):
      # Custom layer that resamples a feature map on a sampling grid obtained by
      # multiplying a regular [-1, 1] grid with the affine parameters `theta`,
      # using bilinear interpolation of the four neighbouring pixels.
      def __init__(self):
        super(SpatialTransformer, self).__init__()

      def affine_transform(self, input_shape, theta):
        # Builds an H x W grid of homogeneous (x, y, 1) coordinates spanning
        # [-1, 1], repeats it N times, multiplies it by `theta` and reshapes the
        # product to [N, 2, H, W].
        # NOTE(review): N is the *static* first dimension of `theta`. The
        # traceback quoted in this post fails on the tf.tile/tf.stack line below
        # with "None values not supported", i.e. N is None at that point.
        N = theta.shape[0]
        H, W = input_shape  #output dimensions of grid
        x_t, y_t = tf.meshgrid(tf.linspace(-1, 1, W), tf.linspace(-1, 1, H))
        # flatten both coordinate planes to 1-D float32 vectors
        x_t = tf.cast(tf.reshape(x_t, [-1]), dtype = tf.float32)
        y_t = tf.cast(tf.reshape(y_t, [-1]), dtype = tf.float32)
        ones = tf.ones(x_t.shape, dtype=tf.float32)
        sampling_grids = tf.stack([x_t, y_t, ones])   #rows are x, y and the constant 1
        sampling_grids = tf.expand_dims(sampling_grids, axis = 0)
        sampling_grids = tf.tile(sampling_grids, tf.stack([N, 1, 1]))   #one copy of the grid per batch element
        batch_grids = tf.matmul(theta, sampling_grids)
        batch_grids = tf.reshape(batch_grids, [N, 2, H, W])
        return batch_grids

      def get_pixel_value(self, feature_map, x_s, y_s):
        "Util Function to get the value of pixel from 4d image tensors given position vectors x_s and y_s"
        # NOTE(review): N comes from the static shape of x_s, so it has the same
        # None-batch limitation as affine_transform.
        N, H, W = x_s.shape
        batch_idx = tf.range(0, N)
        batch_idx = tf.reshape(batch_idx, (N, 1, 1))
        b = tf.tile(batch_idx, (1, H, W))
        indices = tf.stack([b, y_s, x_s], 3)   #(batch, row, column) triples stacked on a new last axis
        return tf.gather_nd(feature_map, indices)   #extracting values corresponding to those indices

      def bilinear_sampler(self, feature_map, x, y):
        # Samples `feature_map` at the coordinates (x, y), which are rescaled
        # below from [-1, 1] to pixel units, and returns the weighted sum of the
        # four surrounding pixels.
        N, H, W, C = feature_map.shape
        max_y = tf.cast(H - 1, dtype = tf.int32)
        max_x = tf.cast(W - 1, dtype = tf.int32)
        zero = tf.zeros([], dtype= tf.int32)

        x = tf.cast(x, dtype = tf.float32)
        y = tf.cast(y, dtype = tf.float32)    

        #Reshaping the batch grid from [-1, 1] to [0, W-1] and [0, H-1]
        x = (x + 1.0) * tf.cast(max_x, dtype = tf.float32)/2.0
        y = (y + 1.0) * tf.cast(max_y, dtype = tf.float32)/2.0

        #Taking the 4 nearest points to the (x_i, y_i) to perform interpolation
        x0 = tf.cast(tf.floor(x), dtype=tf.int32)
        x1 = x0 + 1
        y0 = tf.cast(tf.floor(y), dtype = tf.int32)
        y1 = y0 + 1

        #clipping the value to be between [0, W-1] or [0, H-1]
        x0 = tf.clip_by_value(x0, zero, max_x)
        x1 = tf.clip_by_value(x1, zero, max_x)
        y0 = tf.clip_by_value(y0, zero, max_y)
        y1 = tf.clip_by_value(y1, zero, max_y)

        #getting pixel values of the corner coordinates(x0,y0), (x0, y1), (x1, y0), (x1, y1)
        Ia = self.get_pixel_value(feature_map, x0, y0)
        Ib = self.get_pixel_value(feature_map, x0, y1)
        Ic = self.get_pixel_value(feature_map, x1, y0)
        Id = self.get_pixel_value(feature_map, x1, y1)

        #Changing the data type to float32
        x0 = tf.cast(x0, dtype = tf.float32)
        x1 = tf.cast(x1, dtype = tf.float32)
        y0 = tf.cast(y0, dtype = tf.float32)
        y1 = tf.cast(y1, dtype = tf.float32)

        #calculating delta (or simply area) weights for interpolation
        # NOTE(review): the weights use the *clipped* corners, so wherever
        # clipping makes x0 == x1 (or y0 == y1) the four terms cancel to zero.
        Wa = tf.expand_dims((x1-x)*(y1-y), axis=3)
        Wb = tf.expand_dims((x1-x)*(y-y0), axis=3)
        Wc = tf.expand_dims((x-x0)*(y1-y), axis=3)
        Wd = tf.expand_dims((x-x0)*(y-y0), axis=3)
        out = tf.add_n([Wa*Ia, Wb*Ib, Wc*Ic, Wd*Id])
        return out

      def call(self, feature_map, theta, out_size = None):
        # Resamples `feature_map` with the affine parameters `theta`; the grid
        # has size `out_size` when given, otherwise the input's own H x W.
        N, H, W, _ = feature_map.shape

        if out_size:
          out_H = out_size[0]
          out_W = out_size[1]
          batch_grids = self.affine_transform([out_H, out_W], theta)
        else:
          batch_grids = self.affine_transform([H, W], theta)

        x_s = batch_grids[:,0,:,:]
        # NOTE(review): y_s reads index 0, the same slice as x_s; index 1 was
        # presumably intended - confirm.
        y_s = batch_grids[:,0,:,:]

        output_feature_map = self.bilinear_sampler(feature_map, x_s, y_s)
        return output_feature_map
        
    class Localisation_Network(Layer):
      """Small regression head that turns a feature map into a (2, 3) tensor per sample.

      Pipeline: strided 3x3 convolution -> flatten -> Dense(64, relu) ->
      Dense(6, linear) -> reshape to (2, 3).
      """

      def __init__(self):
        super().__init__()
        self.conv = Conv2D(
            4,
            (3, 3),
            padding="valid",
            strides=2,
            activation="relu",
            kernel_initializer="he_normal",
        )
        self.flatten = Flatten()
        self.dense_1 = Dense(64, activation="relu", kernel_initializer="he_normal")
        self.dense_2 = Dense(6, activation="linear")
        self.reshape = Reshape((2, 3))

      def call(self, input_tensor):
        """Return the six regressed values of each sample arranged as a 2x3 matrix."""
        flat_features = self.flatten(self.conv(input_tensor))
        hidden = self.dense_1(flat_features)
        return self.reshape(self.dense_2(hidden))

    def get_model():
      # Builds the classifier: two 16-filter convolutions, a spatial transformer
      # driven by the localisation network, two 32-filter convolutions, global
      # average pooling and a 10-way softmax.
      x_input = Input((28, 28, 1))
      u = Conv2D(16, (3, 3), padding = "same", activation= "relu", kernel_initializer="he_normal")(x_input)
      u = Conv2D(16, (3, 3), padding = "same", strides = 2, activation="relu", kernel_initializer="he_normal")(u)
      # theta is the (2, 3) output of the localisation network for each sample
      theta = Localisation_Network()(u)
      # This is the call that raises the ValueError shown in the traceback below.
      v = SpatialTransformer()(u, theta)
      v = Conv2D(32, (3, 3), padding = "same", activation= "relu", kernel_initializer="he_normal")(v)
      x = Conv2D(32, (3, 3), padding = "same", strides = 2, activation= "relu", kernel_initializer="he_normal")(v)
      x = GlobalAveragePooling2D()(x)
      x = Flatten()(x)
      x = Dense(10,activation ="softmax")(x)
      model =  Model(inputs = x_input, outputs = x)
      return model

Error of the above code:

    ---------------------------------------------------------------------------
    ValueError                                Traceback (most recent call last)
    <ipython-input-47-d630585afd1d> in <module>()
          4 u = Conv2D(16, (3, 3), padding = "same", strides = 2, activation="relu", kernel_initializer="he_normal")(u)
          5 theta = Localisation_Network()(u)
    ----> 6 v = SpatialTransformer()(u, theta)
          7 v = Conv2D(32, (3, 3), padding = "same", activation= "relu", kernel_initializer="he_normal")(v)
          8 x = Conv2D(32, (3, 3), padding = "same", strides = 2, activation= "relu", kernel_initializer="he_normal")(v)

                                          4 frames
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/autograph/impl/api.py in wrapper(*args, **kwargs)
        668       except Exception as e:  # pylint:disable=broad-except
        669         if hasattr(e, 'ag_error_metadata'):
    --> 670           raise e.ag_error_metadata.to_exception(e)
        671         else:
        672           raise

    ValueError: in user code:

        <ipython-input-7-910b0adb6eb7>:83 call  *
            batch_grids = self.affine_transform([H, W], theta)
        <ipython-input-45-eb5ac5f8f722>:14 affine_transform  *
            sampling_grids = tf.tile(sampling_grids, tf.stack([N, 1, 1]))
        /usr/local/lib/python3.6/dist-packages/tensorflow/python/util/dispatch.py:201 wrapper  **
            return target(*args, **kwargs)
        /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/array_ops.py:1405 stack
            value_shape = ops.convert_to_tensor(values[0], name=name)._shape_tuple()  # pylint: disable=protected-access
        /usr/local/lib/python3.6/dist-packages/tensorflow/python/profiler/trace.py:163 wrapped
            return func(*args, **kwargs)
        /usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py:1540 convert_to_tensor
            ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
        /usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/constant_op.py:339 _constant_tensor_conversion_function
            return constant(v, dtype=dtype, name=name)
        /usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/constant_op.py:265 constant
    allow_broadcast=True)
        /usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/constant_op.py:283 _constant_impl
    allow_broadcast=allow_broadcast))
        /usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/tensor_util.py:445 make_tensor_proto
            raise ValueError("None values not supported.")

        ValueError: None values not supported.
2
can you share stacktrace of error you're getting? - draganstankovic
ok, i'll update the post - Himanshu
How can I account for the batch dimension in the transformer layer while still being able to pass a different batch size at test time? - Himanshu

2 Answers

0
votes

It is hard to tell from here, but based on the stack trace this line seems to be the problem: sampling_grids = tf.tile(sampling_grids, tf.stack([N, 1, 1])) (it passes None where None is not expected).

The second thing I noticed: I am not sure the call override in SpatialTransformer should really take three parameters (def call(self, feature_map, theta, out_size = None):). Since the class inherits from Layer, it seems it should take only the input_tensor parameter.

Not sure also if you need to override build for your use case and perhaps do the initializations required there.

Other than that you can try to extensively log (add print statements) and see where exactly None value 'enters'.

Finally, you can also upload an excerpt of your code sufficient to reproduce the same error and that could perhaps bring more help.

0
votes

I have removed the tf.tile call, because the output of the localisation network, which has shape (None, 2, 3), already takes care of the vectorization during the tf.matmul operation. I have also replaced tf.reshape with the predefined Keras layer tf.keras.layers.Reshape() for every reshape operation, as that layer maintains the vectorization.

class SpatialTransformer(Layer):
  """Resample a feature map through a per-sample affine sampling grid.

  The layer is called on a two-element list ``[feature_map, theta]``:
  ``feature_map`` is a 4-D tensor (N, H, W, C) whose H and W are statically
  known, and ``theta`` holds one 2x3 affine matrix per sample, shape
  (N, 2, 3).  The batch size N may be unknown (``None``) while the model is
  being built; everything that depends on it is taken from the runtime shape.

  Args:
    out_size: (height, width) of the output grid.  Required, because the
      Reshape sub-layers are created from it in the constructor.
    name: layer name.  An explicit name is forwarded to the Keras base class.
  """

  def __init__(self, out_size, name= "spatial_transformer"):
    # Keras derives this same base name from the class and additionally keeps
    # it unique within a model, so only an explicit override is forwarded.
    # Previously `name` was accepted but silently ignored.
    layer_name = None if name == "spatial_transformer" else name
    super(SpatialTransformer, self).__init__(name=layer_name)
    self.out_size = out_size
    out_h, out_w = int(out_size[0]), int(out_size[1])
    # Keras Reshape layers keep the (possibly unknown) batch dimension as is.
    self.reshape_1 = Reshape([2, out_h, out_w])
    self.reshape_2 = Reshape([out_h, out_w])
    self.reshape_3 = Reshape([1, 1])
    # Not used by this class; kept so existing attribute access still works.
    self.reshape_4 = Reshape([])

  def affine_transform(self, input_shape, theta):
    """Map a regular [-1, 1] target grid through ``theta``.

    Args:
      input_shape: (H, W) of the grid to generate; must match ``out_size``.
      theta: affine matrices, shape (N, 2, 3).

    Returns:
      Source coordinates in normalised [-1, 1] units, shape (N, 2, H, W);
      index 0 on axis 1 is x, index 1 is y.
    """
    H, W = input_shape  #output dimensions of grid
    # Float endpoints: tf.linspace is documented for floating-point start/stop.
    x_t, y_t = tf.meshgrid(tf.linspace(-1.0, 1.0, W), tf.linspace(-1.0, 1.0, H))
    x_t = tf.cast(tf.reshape(x_t, [-1]), dtype = tf.float32)
    y_t = tf.cast(tf.reshape(y_t, [-1]), dtype = tf.float32)
    ones = tf.ones_like(x_t)
    sampling_grids = tf.stack([x_t, y_t, ones])   #homogeneous coordinates, shape (3, H*W)
    # No tiling over the batch: tf.matmul broadcasts the single (3, H*W) grid
    # against the (N, 2, 3) batch of matrices.
    batch_grids = tf.matmul(theta, sampling_grids)
    return self.reshape_1(batch_grids)

  def get_pixel_value(self, feature_map, x_s, y_s):
    """Gather ``feature_map[n, y_s[n, i, j], x_s[n, i, j], :]`` for every grid point.

    Args:
      feature_map: tensor of shape (N, H_in, W_in, C).
      x_s: int32 column indices, shape (N, H, W) with static H and W.
      y_s: int32 row indices, same shape as ``x_s``.

    Returns:
      Tensor of shape (N, H, W, C).
    """
    _, H, W = x_s.shape
    # The static batch dimension is None while the model is being built, so
    # the batch index has to come from the runtime shape instead.
    batch_size = tf.shape(x_s)[0]
    batch_idx = tf.range(0, batch_size)
    batch_idx = self.reshape_3(batch_idx)
    b = tf.tile(batch_idx, (1, H, W))
    indices = tf.stack([b, y_s, x_s], 3)   #(batch, row, column) triples, shape (N, H, W, 3)
    return tf.gather_nd(feature_map, indices)   #extracting values corresponding to those indices

  def bilinear_sampler(self, feature_map, x, y):
    """Bilinearly interpolate ``feature_map`` at normalised coordinates.

    Args:
      feature_map: tensor of shape (N, H, W, C) with static H and W.
      x: horizontal sample positions in [-1, 1], shape (N, out_H, out_W).
      y: vertical sample positions in [-1, 1], same shape as ``x``.

    Returns:
      Tensor of shape (N, out_H, out_W, C).  Positions outside the feature
      map take the value of the nearest border pixel.
    """
    _, H, W, _ = feature_map.shape
    max_y = tf.cast(H - 1, dtype = tf.float32)
    max_x = tf.cast(W - 1, dtype = tf.float32)

    x = tf.cast(x, dtype = tf.float32)
    y = tf.cast(y, dtype = tf.float32)

    #Rescaling the batch grid from [-1, 1] to [0, W-1] and [0, H-1]
    x = (x + 1.0) * max_x/2.0
    y = (y + 1.0) * max_y/2.0

    #Taking the 4 nearest points to the (x_i, y_i) to perform interpolation
    x0 = tf.floor(x)
    x1 = x0 + 1.0
    y0 = tf.floor(y)
    y1 = y0 + 1.0

    # The weights must come from the UNclipped corners so they always sum to
    # one.  With clipped corners, x0 == x1 on the last column (and for every
    # out-of-range sample), which made all four weights cancel to zero.
    Wa = tf.expand_dims((x1-x)*(y1-y), axis=3)
    Wb = tf.expand_dims((x1-x)*(y-y0), axis=3)
    Wc = tf.expand_dims((x-x0)*(y1-y), axis=3)
    Wd = tf.expand_dims((x-x0)*(y-y0), axis=3)

    # Clip while still in float so very large coordinates cannot overflow the
    # int32 cast, then convert to indices in [0, W-1] / [0, H-1].
    x0_idx = tf.cast(tf.clip_by_value(x0, 0.0, max_x), dtype = tf.int32)
    x1_idx = tf.cast(tf.clip_by_value(x1, 0.0, max_x), dtype = tf.int32)
    y0_idx = tf.cast(tf.clip_by_value(y0, 0.0, max_y), dtype = tf.int32)
    y1_idx = tf.cast(tf.clip_by_value(y1, 0.0, max_y), dtype = tf.int32)

    #getting pixel values of the corner coordinates(x0,y0), (x0, y1), (x1, y0), (x1, y1)
    Ia = self.get_pixel_value(feature_map, x0_idx, y0_idx)
    Ib = self.get_pixel_value(feature_map, x0_idx, y1_idx)
    Ic = self.get_pixel_value(feature_map, x1_idx, y0_idx)
    Id = self.get_pixel_value(feature_map, x1_idx, y1_idx)

    return tf.add_n([Wa*Ia, Wb*Ib, Wc*Ic, Wd*Id])

  def call(self, input_tensor):
    """Apply the transform.

    Args:
      input_tensor: list or tuple ``[feature_map, theta]``.

    Returns:
      The resampled feature map, shape (N, out_size[0], out_size[1], C).
    """
    feature_map, theta = input_tensor
    # out_size is always set: the constructor cannot run without it.
    out_H = int(self.out_size[0])
    out_W = int(self.out_size[1])
    batch_grids = self.affine_transform([out_H, out_W], theta)

    x_s = self.reshape_2(batch_grids[:,0,:,:])
    y_s = self.reshape_2(batch_grids[:,1,:,:])

    return self.bilinear_sampler(feature_map, x_s, y_s)
class Localisation_Network(Layer):
  """Regress one 2x3 affine matrix per input sample.

  Pipeline: two 3x3 convolutions -> mean over the two spatial axes ->
  Dense(32, relu) -> Dense(6) -> reshape to (2, 3).  The last Dense layer
  starts with a zero kernel and the flattened identity transform as bias, so
  the untrained network outputs [[1, 0, 0], [0, 1, 0]] for every sample.
  """

  def __init__(self):
    super(Localisation_Network, self).__init__()
    self.conv_1 = Conv2D(16, (3, 3), padding = "same", strides=1, activation="relu", kernel_initializer="he_normal")
    self.conv_2 = Conv2D(32, (3, 3), padding = "same", strides=1, activation="relu", kernel_initializer="he_normal")
    # Not used by call(), which averages over the spatial axes instead; kept
    # so existing attribute access still works.
    self.flatten = Flatten()
    self.dense_1 = Dense(32, activation="relu", kernel_initializer="he_normal")

    # Row-major flattening of [[1, 0, 0], [0, 1, 0]].  The stock constant
    # initializer honours the dtype and shape Keras requests, and does not
    # create a throwaway tf.Variable the way the hand-written one did.
    identity_bias = tf.constant_initializer([1.0, 0.0, 0.0, 0.0, 1.0, 0.0])

    self.dense_2 = Dense(6,kernel_initializer = "zeros", bias_initializer = identity_bias)
    self.reshape = Reshape((2, 3))

  def call(self, input_tensor):
    """Return the affine parameters, shape (N, 2, 3), for ``input_tensor``."""
    x = self.conv_1(input_tensor)
    x = self.conv_2(x)
    x = tf.reduce_mean(x, axis = [1, 2])   #average over height and width
    x = self.dense_1(x)
    x = self.dense_2(x)
    x = self.reshape(x)
    return x


def transformer_model_2():
  """Build the 28x28x1 classifier whose input is first passed through the spatial transformer.

  The localisation network predicts theta from the raw input, the transformer
  resamples that input at its own spatial size, and a four-convolution stack
  with global average pooling and a 10-way softmax classifies the result.
  """
  x_input = Input((28, 28, 1))
  theta = Localisation_Network()(x_input)
  transformer = SpatialTransformer(x_input.shape[1:3], name = "transformer_output" )
  features = transformer([x_input, theta])

  # (filters, strides) of each 3x3 convolution, in the order they are applied
  conv_stack = ((16, 1), (16, 2), (32, 1), (32, 2))
  for filters, strides in conv_stack:
    features = Conv2D(
        filters,
        (3, 3),
        padding = "same",
        strides = strides,
        activation = "relu",
        kernel_initializer = "he_normal",
    )(features)

  pooled = GlobalAveragePooling2D()(features)
  pooled = Flatten()(pooled)
  predictions = Dense(10,activation ="softmax")(pooled)
  return Model(inputs = x_input, outputs = predictions)

The only thing I am still stuck on is the localization network. Since it is a regression network, a linear activation is used on its output, but the output values become large and are then clipped during bilinear sampling, which ultimately results in a zero output, so gradients are not able to flow through the localization network.

I have looked through Medium posts and GitHub for a solution. Many of them suggest initializing the weights of the last layer of the localization network to zeros and its biases to the identity transform, [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], but it's not working.