
I have trained a TensorFlow model (SSD MobileNet v2) on a custom dataset containing two classes. When I use the model to perform inference I get reasonable accuracy. However, the bounding boxes are often off (see example images). Is there anything I can do to determine what is causing this issue? Perhaps something is amiss with the NMS (non-maximum suppression) being used to determine the correct box to return from the model?

The code I'm using to perform inference on image frames:

from typing import Dict, List

import numpy as np
import tensorflow as tf

class ObjectDetectorTensorFlow:
    """
    Object detector backed by a frozen TensorFlow inference graph
    (TensorFlow 1.x style API: ``tf.GraphDef``, ``tf.Session``).

    The graph is expected to expose the tensors ``image_tensor:0``,
    ``detection_boxes:0``, ``detection_scores:0``, ``detection_classes:0``
    and ``num_detections:0``.
    """

    def __init__(
            self,
            labelmap: str,
            frozen_inference_graph: str,
    ):
        """
        Constructor function.

        :param labelmap: path to TFRecord labels map prototext file
        :param frozen_inference_graph: path to frozen inference graph protobuf file
        """

        # load the label map
        self.categories = self._parse_label_map(labelmap)

        # Load the TensorFlow model into memory
        detection_graph = tf.Graph()
        with detection_graph.as_default():
            od_graph_def = tf.GraphDef()
            with tf.gfile.GFile(frozen_inference_graph, 'rb') as fid:
                serialized_graph = fid.read()
                # the graph definition must be populated from the serialized
                # bytes before it is imported, otherwise an empty graph is
                # imported and the tensor lookups below fail
                od_graph_def.ParseFromString(serialized_graph)
                tf.import_graph_def(od_graph_def, name='')

            self.tf_session = tf.Session(graph=detection_graph)

        # the image will act as the input tensor
        self.image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')

        # detection boxes, scores, number of objects
        # detected, and classes will be the output tensors
        self.detection_boxes = detection_graph.get_tensor_by_name('detection_boxes:0')
        self.detection_scores = detection_graph.get_tensor_by_name('detection_scores:0')
        self.detection_classes = detection_graph.get_tensor_by_name('detection_classes:0')
        self.num_detections = detection_graph.get_tensor_by_name('num_detections:0')

    def detect(
            self,
            frame: np.ndarray,
            confidence: float,
    ) -> List[Dict]:
        """
        Get object detections from an image frame.

        :param numpy.ndarray frame: BGR image data array with shape
            (height, width, 3), with values in range (0..255), and dtype=uint8
        :param float confidence: minimum detection confidence (probability),
            used to filter weak detections
        :return: list of detection dictionaries, with each dictionary containing
            items "label", "probability", and "bounding_box". NOTE the
            bounding box is returned exactly as reported by the model, i.e.
            in the order (y0, x0, y1, x1); multiply by
            (height, width, height, width) to get pixel coordinates
        :raises KeyError: if a detected class ID is not present in the label map
        """

        # expand frame dimensions to have shape: [1, None, None, 3]
        # i.e. a batch containing the single image
        frame_expanded = np.expand_dims(frame, axis=0)

        # perform object detection by running the model with the image as input
        (boxes, scores, classes, _) = self.tf_session.run(
            [
                self.detection_boxes,
                self.detection_scores,
                self.detection_classes,
                self.num_detections,
            ],
            feed_dict={self.image_tensor: frame_expanded},
        )

        # drop only the leading batch axis; np.squeeze would also collapse
        # the detections axis when the model returns exactly one detection
        boxes = np.asarray(boxes)[0]
        scores = np.asarray(scores)[0]
        classes = np.asarray(classes)[0]

        # iterate over the detections, adding each to a list we'll return
        detections = []
        for box, score, detection_class in zip(boxes, scores, classes):

            # skip detections that don't meet the confidence threshold
            if score < confidence:
                continue

            # create a dictionary for the detection and add to the list
            detections.append(
                {
                    "label": self.categories[int(detection_class)],
                    "probability": score,
                    "bounding_box": box,
                }
            )

        return detections

    @staticmethod
    def _parse_label_map(
            labels_map: str,
    ) -> Dict:
        """
        Parses a labels map prototext file into a dictionary mapping class IDs
        to labels.

        Each line starting with "id" must be immediately followed by a line
        starting with "name".

        :param labels_map: path to TFRecord labels map file
        :return: dictionary mapping class IDs to labels
        :raises ValueError: if an ID line is not followed by a name line
        """

        categories = {}
        with open(labels_map) as label_map:

            for line in label_map:
                line = line.strip()
                if not line.startswith("id"):
                    continue

                id_ = int(line.split(sep=":")[1])

                # the name is expected on the line directly after the ID
                name_line = label_map.readline().strip()
                if not name_line.startswith("name"):
                    raise ValueError("ID line not followed by name line")

                # strip surrounding whitespace and either style of quote
                categories[id_] = name_line.split(sep=":")[1].strip().strip('\'"')

        return categories

Example images with bounding boxes that are off:

enter image description here enter image description here

Could it be that you swapped the X and Y coordinates when drawing your boxes? — Théo Rubenach
Thanks, yes, I've tried that already. My assumption is that the coordinates come from the inference/detection as start_x, start_y, end_x, end_y (top left, bottom right). This may be a wrong assumption, and I will try other arrangements to see if any work better (for example, maybe the model instead provides boxes for detections in (x1, x2, y1, y2) order). — James Adams

1 Answer


This turned out to be a mix up in the order of the bounding box coordinates reported from the model -- I was assuming an order of (x0, y0, x1, y1) and instead the order is (y0, x0, y1, x1). The fix for the above code is this:

        # compute the detected object's bounding box (x, y)-coordinates
        # NOTE bounding boxes are in order (y0, x0, y1, x1), so the scale
        # factors are applied in the matching (height, width, height, width)
        # order -- presumably height/width are the frame's pixel dimensions
        # and the model's box values are fractions of them; confirm at caller
        box = object_detection["bounding_box"] * np.array([height, width, height, width])
        # truncate to integers and unpack in the same y-before-x order
        (start_y, start_x, end_y, end_x) = box.astype("int")