2
votes

I am using OpenMP to parallel the following function

float myfunc ( Class1 *class1, int *feas, int numfeas, float z, long *k, double cost, long iter, float e )
{
    long i;
    long x;
    double sum;

    sum = cost;
    while ( change/cost > 1.0*e ) {
        change = 0.0;

        intshuffle ( feas, numfeas );

        #pragma omp parallel for private (i,x) firstprivate(z,k) reduction (+:sum)
        for ( i=0; i<iter; i++ ) {
            x = i%numfeas;
            sum += pgain ( feas[x], class1, z, k );
        }
        cost -= sum;
    }
    return ( cost );
}

This returns a "Segmentation fault (core dumped)" error

When I change my pragma to

#pragma omp parallel for private (i,x) firstprivate(class1,feas,z,k) reduction (+:sum)`

I get a "double free or corruption (out): 0xb6a00468" error.

From my limited knowledge of OpenMp i understand this is due to faulty memory access to pointers concerning class1 and feas.

For reference I post also my Class1 code

typedef struct {
    float w;
    float *c;
    long a;  
    float co;  
} Class1;

Please advice on the proper way to parallelise the above function.

Update:

The pgain function

double pgain ( long x, Class1 *class1, double z, long int *numcenters )
{
    int i;
    int number_of_centers_to_close = 0;

    static double *work_mem;
    static double gl_cost_of_opening_x;
    static int gl_number_of_centers_to_close;

    int stride = *numcenters + 2;
    //make stride a multiple of CACHE_LINE
    int cl = CACHE_LINE/sizeof ( double );
    if ( stride % cl != 0 ) {
        stride = cl * ( stride / cl + 1 );
    }
    int K = stride - 2 ; // K==*numcenters

    //my own cost of opening x
    double cost_of_opening_x = 0;

    work_mem = ( double* ) malloc ( 2 * stride * sizeof ( double ) );
    gl_cost_of_opening_x = 0;
    gl_number_of_centers_to_close = 0;

    /*
     * For each center, we have a *lower* field that indicates
     * how much we will save by closing the center.
     */
    int count = 0;
    for ( int i = 0; i < class1->num; i++ ) {
        if ( is_center[i] ) {
            center_table[i] = count++;
        }
    }
    work_mem[0] = 0;

    //now we finish building the table. clear the working memory.
    memset ( switch_membership, 0, class1->num * sizeof ( bool ) );
    memset ( work_mem, 0, stride*sizeof ( double ) );
    memset ( work_mem+stride,0,stride*sizeof ( double ) );

    //my *lower* fields
    double* lower = &work_mem[0];
    //global *lower* fields
    double* gl_lower = &work_mem[stride];

    #pragma omp parallel for 
    for ( i = 0; i < class1->num; i++ ) {
        float x_cost = dist ( class1->p[i], class1->p[x], class1->dim ) * class1->p[i].weight;
        float current_cost = class1->p[i].cost;

        if ( x_cost < current_cost ) {

            // point i would save cost just by switching to x
            // (note that i cannot be a median,
            // or else dist(p[i], p[x]) would be 0)

            switch_membership[i] = 1;
            cost_of_opening_x += x_cost - current_cost;

        } else {

            // cost of assigning i to x is at least current assignment cost of i

            // consider the savings that i's **current** median would realize
            // if we reassigned that median and all its members to x;
            // note we've already accounted for the fact that the median
            // would save z by closing; now we have to subtract from the savings
            // the extra cost of reassigning that median and its members
            int assign = class1->p[i].assign;
            lower[center_table[assign]] += current_cost - x_cost;
        }
    }

    // at this time, we can calculate the cost of opening a center
    // at x; if it is negative, we'll go through with opening it

    for ( int i = 0; i < class1->num; i++ ) {
        if ( is_center[i] ) {
            double low = z + work_mem[center_table[i]];
            gl_lower[center_table[i]] = low;
            if ( low > 0 ) {
                // i is a median, and
                // if we were to open x (which we still may not) we'd close i

                // note, we'll ignore the following quantity unless we do open x
                ++number_of_centers_to_close;
                cost_of_opening_x -= low;
            }
        }
    }
    //use the rest of working memory to store the following
    work_mem[K] = number_of_centers_to_close;
    work_mem[K+1] = cost_of_opening_x;

    gl_number_of_centers_to_close = ( int ) work_mem[K];
    gl_cost_of_opening_x = z + work_mem[K+1];

    // Now, check whether opening x would save cost; if so, do it, and
    // otherwise do nothing

    if ( gl_cost_of_opening_x < 0 ) {
        //  we'd save money by opening x; we'll do it
        #pragma omp parallel for
        for ( int i = 0; i < class1->num; i++ ) {
            bool close_center = gl_lower[center_table[class1->p[i].assign]] > 0 ;
            if ( switch_membership[i] || close_center ) {
                // Either i's median (which may be i itself) is closing,
                // or i is closer to x than to its current median
                #pragma omp critical
                {
                class1->p[i].cost = class1->p[i].weight * dist ( class1->p[i], class1->p[x], class1->dim );
                class1->p[i].assign = x;
                }
            }
        }
        for ( int i = 0; i < class1->num; i++ ) {
            if ( is_center[i] && gl_lower[center_table[i]] > 0 ) {
                is_center[i] = false;
            }
        }
        if ( x >= 0 && x < class1->num ) {
            is_center[x] = true;
        }

        *numcenters = *numcenters + 1 - gl_number_of_centers_to_close;
    } else {
        gl_cost_of_opening_x = 0;  // the value we'll return
    }

    free ( work_mem );

    return -gl_cost_of_opening_x;
}

Update 2

Fixed pgain cause on my version i have #pragma directives also in pgain

And here is a stack trace

#0  0xb7d50750 in ?? () from /lib/i386-linux-gnu/libc.so.6
#1  0xb7eaa198 in ?? () from /usr/lib/i386-linux-gnu/libgomp.so.1
#2  0x080493a0 in pgain (x=2982, class1=0xbffff218, z=1461.919921875, 
    numce=0x804d128) at streamcluster-openmp.cpp:232
#3  0x0804aaf6 in myfunc(Class1*, int*, int, float, long*, double, long, float) [clone ._omp_fn.0] () at openmp.cpp:347
#4  0xb7ea9889 in ?? () from /usr/lib/i386-linux-gnu/libgomp.so.1
#5  0xb7cbcd4c in start_thread () from /lib/i386-linux-gnu/libpthread.so.0
#6  0xb7dc9bae in clone () from /lib/i386-linux-gnu/libc.so.6
1
Are you sure that i has to be private? I'm pretty sure you can leave out that directive, and inline x. I need to look back at my OpenMP notes... - millinon
@Mysticial pgain is another function the signature is double pgain ( long x, Class1 *class1, double z, long int *numcent ). But it doesnt matter cause the code compiles and runs correctly without the OpenMP pragmas - Alexander Talavari
@millinon i doesnt need to be private cause OpenMp is smart enough to recognise it as private but x need to be private declared. The problem is with shared memory probably and the ponters of feas and class1. - Alexander Talavari
Are you by any chance allocating memory inside that function? Is the memory allocator thread-safe? - Mysticial
@Mysticial yes i am running a malloc inside pgain - Alexander Talavari

1 Answers

3
votes

The problem may be a possible race-condition in this part of pgain:

// Either i's median (which may be i itself) is closing,
// or i is closer to x than to its current median
class1->p[i].cost = class1->p[i].weight * dist ( class1->p[i], class1->p[x], class1->dim );
class1->p[i].assign = x;

As class1 is a pointer, what you make private with the statement firstprivate(class1) is the bare pointer, not the underlying resource.

The solution to this issue highly depends on the semantic of your program. If it is fine to have randomly ordered updates of *class1, and the resource is really meant to be shared, then the following modification:

// Either i's median (which may be i itself) is closing,
// or i is closer to x than to its current median
#pragma omp critical LOCK_CLASS1
{
  // Lock this part of the code for thread-safety
  class1->p[i].cost = class1->p[i].weight * dist ( class1->p[i], class1->p[x], class1->dim );
  class1->p[i].assign = x
}

will fix the code above. Otherwise you should create private copies of *class1 for each thread before calling pgain.

Finally, you should note that the above reasoning remains true for any resource. For instance, the pointer float *c in Class1 shows the same critical point if it refers to a resource that is shared and you don't synchronize memory updates.