0
votes

I am using a CFD code, it is based on Opnempi. When I use only one core to run it, there are no problems. But when I use more cores to run it, I got errors like this:

[DESKTOP-7D2F3AN:03839] Process received signal

[DESKTOP-7D2F3AN:03839] Signal: Segmentation fault (11)

[DESKTOP-7D2F3AN:03839] Signal code: (128)

[DESKTOP-7D2F3AN:03839] Failing at address: (nil)

[DESKTOP-7D2F3AN:03839] [ 0] /lib/x86_64-linux-gnu/libpthread.so.0(+0x12890) [0x7f405efe2890]

[DESKTOP-7D2F3AN:03839] [ 1] /lib/x86_64-linux-gnu/libc.so.6(+0x18ec3c) [0x7f405ed5ec3c]

[DESKTOP-7D2F3AN:03839] [ 2] /usr/local/lib/libmpi.so.0(ompi_convertor_pack+0x196) [0x7f405f462176]

[DESKTOP-7D2F3AN:03839] [ 3] /usr/local/lib/openmpi/mca_pml_ob1.so(+0x10851) [0x7f405b780851]

[DESKTOP-7D2F3AN:03839] [ 4] /usr/local/lib/openmpi/mca_pml_ob1.so(+0x53ee) [0x7f405b7753ee]

[DESKTOP-7D2F3AN:03839] [ 5] /usr/local/lib/openmpi/mca_coll_tuned.so(+0xb2ff) [0x7f4059e2b2ff]

[DESKTOP-7D2F3AN:03839] [ 6] /usr/local/lib/openmpi/mca_coll_tuned.so(+0xbaf1) [0x7f4059e2baf1]

[DESKTOP-7D2F3AN:03839] [ 7] /usr/local/lib/openmpi/mca_coll_tuned.so(+0x20ab) [0x7f4059e220ab]

[DESKTOP-7D2F3AN:03839] [ 8] /usr/local/lib/openmpi/mca_coll_sync.so(+0x1377) [0x7f405a041377]

[DESKTOP-7D2F3AN:03839] [ 9] /usr/local/lib/libmpi.so.0(MPI_Bcast+0x11d) [0x7f405f46974d]

[DESKTOP-7D2F3AN:03839] [10] ./cgles(com_dist_param+0x4b) [0x7f4060236972]

[DESKTOP-7D2F3AN:03839] [11] ./cgles(main+0x377) [0x7f4060233377]

[DESKTOP-7D2F3AN:03839] [12] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xe7) [0x7f405ebf1b97]

[DESKTOP-7D2F3AN:03839] [13] ./cgles(_start+0x2a) [0x7f406022254a]

[DESKTOP-7D2F3AN:03839] End of error message

mpirun noticed that process rank 0 with PID 3839 on node DESKTOP-7D2F3AN exited on signal 11 (Segmentation fault).

I don't know why. The code is too long. The main part is like this:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <unistd.h>
#include <string.h>
#include <signal.h>

#include "param.h"
#include "block.h"
#include "q.h"
#include "map.h"
#include "comms.h"
#include "util.h"
#include "dat.h"
#include "timing.h"
#include "sgm.h"
#include "error.h"
#include "init.h"
#include "mom.h"
#include "inst.h"
#include "pcg.h"
#include "bicgstab.h"
#include "ibm.h"
#include "hist.h"
#include "pcg_ext.h"
#include "dibm.h"
/* using BLAS library for better performance */
#ifdef BLAS 
#include <essl.h>
#endif

/* local prototypes */
void cmdopts(int *, char ***);
void mybasename(char *, char *);
void glob_output(void);
void glob_tstep(void);
void getstep(char *fname, double *Prtime, int *Pitime);
void putstep(char *fname, double *Prtime, int *Pitime);


/* external prototypes */
void slice_probe(void);
void print_solution(char* , int);
void print_vert(char*, int);
void print_wave(char*, int);
void print_moments(char*, int);

extern void InitializeIBPoint();
/*void glob_snapshot(char*);*/
extern void dibm_interploate_shear_velocity(void);


/* global, used by mgp.c, pcg.c and map.c */
/* global, used by mgp.c, pcg.c and map.c */
int mypid = -1;     /* my process id number */
int npid = 1;       /* total process number */
int halt_flg = 0;       /* halt t-steps, 1=stop, 0=cont */
double wck_t_start, wck_t_end;  /* wall clock timing */
double cpu_t_start, cpu_t_end;  /* cpu clock timing */
int time_limit = 999999999; /* wall clock limit (seconds) */
double Y_pro_mu=0; //Mu in Y-code;
double Y_pro_Laimuda=0;//Laimuda in Y-code
//char Y_y3dfile[256];  //*.y3d filename in Y-code
double Y_adheforcefactor=1.0;//force factor applied to ib points on Y solid, used in Yw.c-F2y()
double Y_shear_force_factor =1.0;
double Y_shear_vel = 0.0;
int Y_init_entity_num; //cell number at initilization of Y
//#Additional Misc by XuDong
int stat_y_save_dn; //  1000         #y solid saving interval
int stat_plane_save_dn;//   1000    #plane saving interval
int stat_glob_fluid_save_dn;//  10000000  #global fluid saving interval
int stat_restart_save_dn; //  1000   #Restart file saving interval
int stat_gtk_draw_dn; //  1          #gtkdraw interval when GTK defined
int stat_gtk_save_dn; //  100        #gtkdraw picture saving interval when GTK defined
int stat_y_typic_save_dn;
double case_parameter_1;//undefined reserved case parameter
double case_parameter_2;
double case_parameter_3;
double case_parameter_4;

/* main: driver routine */
int main(int argc, char **argv)
{
  int bid, ibid;
  /* switches */
  int add_seed_flg = 0, probe_flg = 0, putxd_2x_flg = 0, reset_stats_flg = 0;   

  /* Start up any other nodes. The cmd line options relating to the
   * parallel interface are parsed and pruned here. */
  mypid = com_start(&argc, &argv, 1);
  npid  = com_nnodes();
  wck_t_start = WCKseconds();
  cpu_t_start = CPUseconds();

  /* Parse the remaining options [first node only] */
  if (mypid == 0)
    cmdopts(&argc, &argv);

  /* Buffering mode for stdout; needed on T3D */
  fflush(stdout);
  setvbuf(stdout, (char *) 0, _IOLBF, 0); /* line buffering */

  /* Read configuration data on ONE node */
  if (mypid == 0) 
  {
    /* get the map data and mesh points */
    map_getfn(fnames.map, fnames.msh);
    map_print(BID_NULL);
    if (map_checkt() || map_checku())
      exit(2);

    /* debugging */
    /* map_putfn("temp.map"); */
    /* msh_putfn("temp.msh"); */

    /* read the local data file and build lookup table */
    dat_read(fnames.dat);
    dat_print();

    /* Modify parameters using info from *.dat file */
    dat_getdouble(&viscm, "viscm");
    dat_getdouble(&fbody_x, "fbody_x");
    dat_getdouble(&fbody_y, "fbody_y");
    dat_getdouble(&fbody_z, "fbody_z");
    dat_getint(&time_limit, "time_limit");

    //#Additional Misc by XuDong
    dat_getdouble(&crk_beta,"crk_beta");// #=0--explicit, ==1 fully implicit ==1/2 crank-nicolson
    dat_getdouble(&ibm_relax,"ibm_relax");// #=0.5---normal for direct forcing

    dat_getint(&stat_y_save_dn,"stat_y_save_dn");//  1000         #y solid saving interval
    dat_getint(&stat_plane_save_dn,"stat_plane_save_dn");//   1000    #plane saving interval
    dat_getint(&stat_glob_fluid_save_dn,"stat_glob_fluid_save_dn");//  10000000  #global fluid saving interval
    dat_getint(&stat_restart_save_dn,"stat_restart_save_dn");//  1000   #Restart file saving interval
    dat_getint(&stat_gtk_draw_dn,"stat_gtk_draw_dn");//  1          #gtkdraw interval when GTK defined
    dat_getint(&stat_gtk_save_dn,"stat_gtk_save_dn");//  100        #gtkdraw picture saving interval when GTK defined
    dat_getint(&stat_y_typic_save_dn,"stat_y_typic_save_dn");//  100        #gtkdraw picture saving interval when GTK defined


    dat_getdouble(&Y_adheforcefactor,"Y_adheforcefactor");//  1          #gtkdraw interval when GTK defined
    dat_getdouble(&Y_shear_force_factor,"Y_shear_force_factor");//  100        #gtkdraw picture saving interval when GTK defined
    dat_getdouble(&Y_shear_vel,"Y_shear_vel");//  100        #gtkdraw picture saving interval when GTK defined
    dat_getint(&Y_init_entity_num,"Y_init_entity_num");//  //cell number at initilization of Y

    dat_getdouble(&case_parameter_1,"case_parameter_1"); //undefined reserved case parameter
    dat_getdouble(&case_parameter_2,"case_parameter_2"); //undefined reserved case parameter
    dat_getdouble(&case_parameter_3,"case_parameter_3"); //undefined reserved case parameter
    dat_getdouble(&case_parameter_4,"case_parameter_4"); //undefined reserved case parameter 
    /* set current time step parameters */
    dat_getint(&ntime, "ntime");
    dat_getdouble(&dt, "dt");
    if (!flg_init)
      getstep(fnames.xdi, &rtime_current, &itime);
    itime_first = itime;
    itime_last = itime_first + ntime;
    rtime_first = rtime_current;
    rtime_last = rtime_current + dt * ntime;

    /* print fluid parameters */
    prparam();
  }


    /* Distibute to other nodes */
    com_dist_param();   /* distribute file names */
    com_dist_map();   /* distribute map */
    com_dist_data();    /* distribute table */

I located where the error occured, it is in com_dist_param().

        void com_dist_param(void)
        {
            /* This is all there is to it, curtesy of derived datatypes */
            MPI_Bcast(&fnames, 1, fnames_type, 0, MPI_COMM_WORLD);
            /* params starts with address of fbody_x */
            MPI_Bcast(&fbody_x, 1, params_type, 0, MPI_COMM_WORLD);
        }

Errors occured when it came into MPI_Bcast. Fnames:

typedef struct
  {
    char xdi[MAXSTR];       /* xd in */
    char xdo[MAXSTR];       /* xd out */
    char dat[MAXSTR];       /* data */
    char map[MAXSTR];       /* domain map */
    char msh[MAXSTR];       /* mesh points */
    char y3d[MAXSTR];       /* Y3D input */
    char log[MAXSTR];       /* logging */
    char mom[MAXSTR];       /* stats moments */
  }
Fnames;

and it defined the fnames_type as MPI_Datatype:

MPI_Datatype
make_fnames_type(void)
{
    MPI_Datatype fnames_type;
    MPI_Datatype type[8];
    int blocklen[8];
    MPI_Aint disp[8];
    int base, i;
    /* define it */
    MPI_Address(&fnames.xdi, disp);
    MPI_Address(&fnames.xdo, disp + 1);
    MPI_Address(&fnames.dat, disp + 2);
    MPI_Address(&fnames.map, disp + 3);
    MPI_Address(&fnames.msh, disp + 4);
    MPI_Address(&fnames.y3d, disp + 5);
    MPI_Address(&fnames.log, disp + 6);
    MPI_Address(&fnames.mom, disp + 7);
    base = disp[0];
    for (i = 0; i < 8; i++) {
    type[i] = MPI_CHAR;
    blocklen[i] = MAXSTR;
    disp[i] -= base;
/*
   printf("disp[i] = %d blocklen[i] = %d\n", disp[i], blocklen[i]);
 */
    }

    /* create it */
    MPI_Type_struct(8, blocklen, disp, type, &fnames_type);
    MPI_Type_commit(&fnames_type);
    return (fnames_type);
}

Could anyone help me? Thanks a lot!

1
When you have a bug and the code is too long to find it, back up the code and start removing pieces until all that is left is the bug. Usually you con't have to remove everything, just enough to bring the noise down to a level where you can spot the bug through it.user4581301
Thanks! it is a good idea. I will do it.MeSure

1 Answers

1
votes

Assuming that, like most MPI users, you are running your code on a 64 bits machine:

MPI_AInt must be able to contain a pointer, hence have a size of 64 bits. In function make_fnames_type(), to substract the base address of the struct from the addresses of its various fields, you are storing the base address into an int variable, appropriately named base. But that int variable is a 32 bits quantity. You might be mutilating the pointer value.

The base address of your struct is a 64 bits quantity. You should store it only into another 64 bits type such as unsigned long int or uint64_t. The statement base = disp[0]; is likely to be a narrowing conversion and is any case not portable. You should check the sizeof values involved.

Side note: In C++, unlike in C you don't need to declare all function variables beforehand. You can declare variables as you need them. So for example there is no reason to declare int variable base 10 source lines before defining its value. So you can have int base = disp[0]; all in one place, and then the narrowing mishap is much easier to detect. Even better, you can just write: auto base = disp[0]; and then no narrowing can take place.

A possibility would be to change the loop like this:

    long int base = disp[0];
    for (i = 0; i < 8; i++) {
        type[i] = MPI_CHAR;
        blocklen[i] = MAXSTR;
        disp[i] -= base;
        /* changed %d into %ld below */
        printf("disp[i] = %ld  blocklen[i] = %d\n",
            (long int)disp[i], blocklen[i]);
    }

You can also check whether the memory size of the MPI custom type is what you expect thru the MPI_Type_Size() function.

General remark: all MPI functions in C/C++ return an integer error code. You skip checking these error codes at your peril. It is quite possible that some MPI function has returned an error code well before the call to MPI_Bcast().