I have an application where I receive a stream of images and want to monitor detected features within a set ROI. This is accomplished using an ORB detector. On the first image, I use the detector to find "reference" keypoints and descriptors for the given ROI. For subsequent images, I find "test" keypoints and descriptors for the same ROI. I then use a knn matcher to find matches between the reference and test descriptors. Finally, I attempt to find the "best" matches, add the associated keypoints to a "matched keypoints" collection, and calculate a "match intensity". This match intensity is intended to indicate how well the keypoints found in the reference image match the keypoints in the test image.
I have a few questions:
1 - Is this a valid use of a feature detector? I understand that simple template matching might give me similar results, but I was hoping to avoid issues with slight changes in lighting.
2 - Am I screening my matches properly for "good" matches, and am I then getting the correct associated keypoint for each match?
3 - My code seems to work as is; however, if I try to move to the async versions of the OpenCV calls using streams, I get an exception: "invalid resource handle in function cv::cuda::GpuMat::setTo", which occurs in a call to ORB_Impl::buildScalePyramids (called from ORB_Impl::detectAndComputeAsync). See the async version of my "NewFrame" function below. This makes me think I'm not setting all of this up properly.
Here is my code:
Matcher::Matcher()
{
    // create ORB detector and descriptor matcher
    m_b = cuda::ORB::create(500, 1.2f, 8, 31, 0, 2, 0, 31, 20, true);
    m_descriptorMatcher = cv::cuda::DescriptorMatcher::createBFMatcher(cv::NORM_HAMMING);
}
void Matcher::Configure(int imageWidth, int imageHeight, int roiX, int roiY, int roiW, int roiH)
{
    // set member variables
    m_imageWidth = imageWidth;
    m_imageHeight = imageHeight;
    m_roiX = roiX;
    m_roiY = roiY;
    m_roiW = roiW;
    m_roiH = roiH;
    m_GpuRefSet = false; // flag indicating the reference has not yet been set

    // create mask for the specified ROI
    m_mask = GpuMat(imageHeight, imageWidth, CV_8UC1, Scalar::all(0));
    cv::Rect rect = cv::Rect(m_roiX, m_roiY, m_roiW, m_roiH);
    m_mask(rect).setTo(Scalar::all(255));
}
double Matcher::NewFrame(void *pImagedata)
{
    // pImagedata = pointer to BGRA byte array
    // m_imageHeight and m_imageWidth have already been set
    // m_b is a pointer to the ORB detector

    if (!m_GpuRefSet)
    {
        // 1st time through (after the call to Matcher::Configure), set reference keypoints and descriptors
        cv::cuda::GpuMat mat1(m_imageHeight, m_imageWidth, CV_8UC4, pImagedata); // wrap image data in a GpuMat
        cv::cuda::cvtColor(mat1, m_refImage, CV_BGRA2GRAY);                      // convert to grayscale as required by ORB
        m_keyRef.clear();                                                        // clear the keypoint vector<KeyPoint> for the reference image
        m_b->detectAndCompute(m_refImage, m_mask, m_keyRef, m_descRef, false);   // detect keypoints and compute descriptors
        m_GpuRefSet = true;
    }

    cv::cuda::GpuMat mat2(m_imageHeight, m_imageWidth, CV_8UC4, pImagedata);     // wrap image data in a GpuMat
    cv::cuda::cvtColor(mat2, m_testImage, CV_BGRA2GRAY, 0);                      // convert to grayscale as required by ORB
    m_keyTest.clear();                                                           // clear the keypoint vector<KeyPoint> for the test image
    m_b->detectAndCompute(m_testImage, m_mask, m_keyTest, m_descTest, false);    // detect keypoints and compute descriptors

    double value = 0.0; // return value ("match intensity")

    // find the best match for each descriptor
    if (m_descTest.rows > 0)
    {
        m_goodKeypoints.clear(); // clear the vector<KeyPoint> of "good" keypoints
        m_descriptorMatcher->knnMatch(m_descTest, m_descRef, m_matches, 2, noArray()); // find matches

        // examine all matches and collect the keypoints whose match distance meets the given criteria
        for (int i = 0; i < m_matches.size(); i++)
        {
            if (m_matches[i][0].distance < m_matches[i][1].distance * m_nnr) // m_nnr = nearest neighbor ratio (typically 0.6 - 0.8)
            {
                m_goodKeypoints.push_back(m_keyRef.at(m_matches[i][0].trainIdx)); // not sure if getting the correct keypoint here
            }
        }

        // calculate "match intensity", i.e. the fraction of reference keypoints that are also found in the test image
        value = ((double)m_goodKeypoints.size()) / ((double)m_keyRef.size());
    }
    else
    {
        value = 0.0;
    }

    return value;
}
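For reference, the member variables these functions use are declared roughly as follows. This is a sketch reconstructed from the calls above, so the exact types are assumed from usage, not copied from my header:

// sketch of the relevant Matcher members (types assumed from how they are used above)
cv::Ptr<cv::cuda::ORB> m_b;                                // ORB detector
cv::Ptr<cv::cuda::DescriptorMatcher> m_descriptorMatcher;  // brute-force Hamming matcher
cv::cuda::GpuMat m_mask;                                   // ROI mask
cv::cuda::GpuMat m_refImage, m_testImage;                  // grayscale reference/test images
cv::cuda::GpuMat m_descRef, m_descTest;                    // reference/test descriptors
std::vector<cv::KeyPoint> m_keyRef, m_keyTest;             // reference/test keypoints
std::vector<std::vector<cv::DMatch>> m_matches;            // knn match results
std::vector<cv::KeyPoint> m_goodKeypoints;                 // keypoints passing the ratio test
cv::cuda::Stream m_stream;                                 // CUDA stream (async version only)
double m_nnr;                                              // nearest neighbor ratio (0.6 - 0.8)
bool m_GpuRefSet;                                          // has the reference frame been captured yet?
int m_imageWidth, m_imageHeight;                           // frame dimensions
int m_roiX, m_roiY, m_roiW, m_roiH;                        // ROI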
And here's the stream/async version of the NewFrame function that fails:
double Matcher::NewFrame(void *pImagedata)
{
    if (m_b.empty()) return 0.0;

    if (!m_GpuRefSet)
    {
        try
        {
            cv::cuda::GpuMat mat1(m_imageHeight, m_imageWidth, CV_8UC4, pImagedata);
            cv::cuda::cvtColor(mat1, m_refImage, CV_BGRA2GRAY);
            m_keyRef.clear();
            m_b->detectAndComputeAsync(m_refImage, m_mask, m_keyRef, m_descRef, false, m_stream); // FAILS HERE
            m_stream.waitForCompletion();
            m_GpuRefSet = true;
        }
        catch (const cv::Exception& e)
        {
            string msg = e.msg;
        }
    }

    cv::cuda::GpuMat mat2(m_imageHeight, m_imageWidth, CV_8UC4, pImagedata);
    cv::cuda::cvtColor(mat2, m_testImage, CV_BGRA2GRAY, 0, m_stream);
    m_keyTest.clear();
    m_b->detectAndComputeAsync(m_testImage, m_mask, m_keyTest, m_descTest, false, m_stream);
    m_stream.waitForCompletion();

    double value = 0.0;

    // find the best match for each descriptor
    if (m_descTest.rows > 0)
    {
        m_goodKeypoints.clear();
        m_descriptorMatcher->knnMatchAsync(m_descTest, m_descRef, m_matches, 2, noArray(), m_stream);
        m_stream.waitForCompletion();

        for (int i = 0; i < m_matches.size(); i++)
        {
            if (m_matches[i][0].distance < m_matches[i][1].distance * m_nnr) // m_nnr = nearest neighbor ratio
            {
                m_goodKeypoints.push_back(m_keyRef.at(m_matches[i][0].trainIdx));
            }
        }

        value = ((double)m_goodKeypoints.size()) / ((double)m_keyRef.size());
    }
    else
    {
        value = 0.0;
    }

    if (value > 1.0) value = 1.0;

    return value;
}
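And for completeness, this is roughly how I drive the class per frame. It is a simplified sketch: GrabNextFrame() and the frame size/ROI values are illustrative only, not my real capture code:

// simplified calling sequence (GrabNextFrame() is a hypothetical BGRA frame source)
Matcher matcher;
matcher.Configure(1920, 1080, 100, 100, 400, 300);   // frame size and ROI (example values)

void *pFrame = nullptr;
while (GrabNextFrame(&pFrame))                        // hypothetical: fills pFrame with a BGRA buffer
{
    double intensity = matcher.NewFrame(pFrame);      // 0.0 .. 1.0
    // intensity near 1.0 -> the test ROI still matches the reference ROI well
    // intensity near 0.0 -> few reference keypoints were matched in the test ROI
}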
Any suggestions/advice would be appreciated.
Thanks!!