I am using a CFD code, it is based on Opnempi. When I use only one core to run it, there are no problems. But when I use more cores to run it, I got errors like this:
[DESKTOP-7D2F3AN:03839] Process received signal
[DESKTOP-7D2F3AN:03839] Signal: Segmentation fault (11)
[DESKTOP-7D2F3AN:03839] Signal code: (128)
[DESKTOP-7D2F3AN:03839] Failing at address: (nil)
[DESKTOP-7D2F3AN:03839] [ 0] /lib/x86_64-linux-gnu/libpthread.so.0(+0x12890) [0x7f405efe2890]
[DESKTOP-7D2F3AN:03839] [ 1] /lib/x86_64-linux-gnu/libc.so.6(+0x18ec3c) [0x7f405ed5ec3c]
[DESKTOP-7D2F3AN:03839] [ 2] /usr/local/lib/libmpi.so.0(ompi_convertor_pack+0x196) [0x7f405f462176]
[DESKTOP-7D2F3AN:03839] [ 3] /usr/local/lib/openmpi/mca_pml_ob1.so(+0x10851) [0x7f405b780851]
[DESKTOP-7D2F3AN:03839] [ 4] /usr/local/lib/openmpi/mca_pml_ob1.so(+0x53ee) [0x7f405b7753ee]
[DESKTOP-7D2F3AN:03839] [ 5] /usr/local/lib/openmpi/mca_coll_tuned.so(+0xb2ff) [0x7f4059e2b2ff]
[DESKTOP-7D2F3AN:03839] [ 6] /usr/local/lib/openmpi/mca_coll_tuned.so(+0xbaf1) [0x7f4059e2baf1]
[DESKTOP-7D2F3AN:03839] [ 7] /usr/local/lib/openmpi/mca_coll_tuned.so(+0x20ab) [0x7f4059e220ab]
[DESKTOP-7D2F3AN:03839] [ 8] /usr/local/lib/openmpi/mca_coll_sync.so(+0x1377) [0x7f405a041377]
[DESKTOP-7D2F3AN:03839] [ 9] /usr/local/lib/libmpi.so.0(MPI_Bcast+0x11d) [0x7f405f46974d]
[DESKTOP-7D2F3AN:03839] [10] ./cgles(com_dist_param+0x4b) [0x7f4060236972]
[DESKTOP-7D2F3AN:03839] [11] ./cgles(main+0x377) [0x7f4060233377]
[DESKTOP-7D2F3AN:03839] [12] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xe7) [0x7f405ebf1b97]
[DESKTOP-7D2F3AN:03839] [13] ./cgles(_start+0x2a) [0x7f406022254a]
[DESKTOP-7D2F3AN:03839] End of error message
mpirun noticed that process rank 0 with PID 3839 on node DESKTOP-7D2F3AN exited on signal 11 (Segmentation fault).
I don't know why. The code is too long. The main part is like this:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <unistd.h>
#include <string.h>
#include <signal.h>
#include "param.h"
#include "block.h"
#include "q.h"
#include "map.h"
#include "comms.h"
#include "util.h"
#include "dat.h"
#include "timing.h"
#include "sgm.h"
#include "error.h"
#include "init.h"
#include "mom.h"
#include "inst.h"
#include "pcg.h"
#include "bicgstab.h"
#include "ibm.h"
#include "hist.h"
#include "pcg_ext.h"
#include "dibm.h"
/* using BLAS library for better performance */
#ifdef BLAS
#include <essl.h>
#endif
/* local prototypes */
void cmdopts(int *, char ***);
void mybasename(char *, char *);
void glob_output(void);
void glob_tstep(void);
void getstep(char *fname, double *Prtime, int *Pitime);
void putstep(char *fname, double *Prtime, int *Pitime);
/* external prototypes */
void slice_probe(void);
void print_solution(char* , int);
void print_vert(char*, int);
void print_wave(char*, int);
void print_moments(char*, int);
extern void InitializeIBPoint();
/*void glob_snapshot(char*);*/
extern void dibm_interploate_shear_velocity(void);
/* global, used by mgp.c, pcg.c and map.c */
/* global, used by mgp.c, pcg.c and map.c */
int mypid = -1; /* my process id number */
int npid = 1; /* total process number */
int halt_flg = 0; /* halt t-steps, 1=stop, 0=cont */
double wck_t_start, wck_t_end; /* wall clock timing */
double cpu_t_start, cpu_t_end; /* cpu clock timing */
int time_limit = 999999999; /* wall clock limit (seconds) */
double Y_pro_mu=0; //Mu in Y-code;
double Y_pro_Laimuda=0;//Laimuda in Y-code
//char Y_y3dfile[256]; //*.y3d filename in Y-code
double Y_adheforcefactor=1.0;//force factor applied to ib points on Y solid, used in Yw.c-F2y()
double Y_shear_force_factor =1.0;
double Y_shear_vel = 0.0;
int Y_init_entity_num; //cell number at initilization of Y
//#Additional Misc by XuDong
int stat_y_save_dn; // 1000 #y solid saving interval
int stat_plane_save_dn;// 1000 #plane saving interval
int stat_glob_fluid_save_dn;// 10000000 #global fluid saving interval
int stat_restart_save_dn; // 1000 #Restart file saving interval
int stat_gtk_draw_dn; // 1 #gtkdraw interval when GTK defined
int stat_gtk_save_dn; // 100 #gtkdraw picture saving interval when GTK defined
int stat_y_typic_save_dn;
double case_parameter_1;//undefined reserved case parameter
double case_parameter_2;
double case_parameter_3;
double case_parameter_4;
/* main: driver routine */
int main(int argc, char **argv)
{
int bid, ibid;
/* switches */
int add_seed_flg = 0, probe_flg = 0, putxd_2x_flg = 0, reset_stats_flg = 0;
/* Start up any other nodes. The cmd line options relating to the
* parallel interface are parsed and pruned here. */
mypid = com_start(&argc, &argv, 1);
npid = com_nnodes();
wck_t_start = WCKseconds();
cpu_t_start = CPUseconds();
/* Parse the remaining options [first node only] */
if (mypid == 0)
cmdopts(&argc, &argv);
/* Buffering mode for stdout; needed on T3D */
fflush(stdout);
setvbuf(stdout, (char *) 0, _IOLBF, 0); /* line buffering */
/* Read configuration data on ONE node */
if (mypid == 0)
{
/* get the map data and mesh points */
map_getfn(fnames.map, fnames.msh);
map_print(BID_NULL);
if (map_checkt() || map_checku())
exit(2);
/* debugging */
/* map_putfn("temp.map"); */
/* msh_putfn("temp.msh"); */
/* read the local data file and build lookup table */
dat_read(fnames.dat);
dat_print();
/* Modify parameters using info from *.dat file */
dat_getdouble(&viscm, "viscm");
dat_getdouble(&fbody_x, "fbody_x");
dat_getdouble(&fbody_y, "fbody_y");
dat_getdouble(&fbody_z, "fbody_z");
dat_getint(&time_limit, "time_limit");
//#Additional Misc by XuDong
dat_getdouble(&crk_beta,"crk_beta");// #=0--explicit, ==1 fully implicit ==1/2 crank-nicolson
dat_getdouble(&ibm_relax,"ibm_relax");// #=0.5---normal for direct forcing
dat_getint(&stat_y_save_dn,"stat_y_save_dn");// 1000 #y solid saving interval
dat_getint(&stat_plane_save_dn,"stat_plane_save_dn");// 1000 #plane saving interval
dat_getint(&stat_glob_fluid_save_dn,"stat_glob_fluid_save_dn");// 10000000 #global fluid saving interval
dat_getint(&stat_restart_save_dn,"stat_restart_save_dn");// 1000 #Restart file saving interval
dat_getint(&stat_gtk_draw_dn,"stat_gtk_draw_dn");// 1 #gtkdraw interval when GTK defined
dat_getint(&stat_gtk_save_dn,"stat_gtk_save_dn");// 100 #gtkdraw picture saving interval when GTK defined
dat_getint(&stat_y_typic_save_dn,"stat_y_typic_save_dn");// 100 #gtkdraw picture saving interval when GTK defined
dat_getdouble(&Y_adheforcefactor,"Y_adheforcefactor");// 1 #gtkdraw interval when GTK defined
dat_getdouble(&Y_shear_force_factor,"Y_shear_force_factor");// 100 #gtkdraw picture saving interval when GTK defined
dat_getdouble(&Y_shear_vel,"Y_shear_vel");// 100 #gtkdraw picture saving interval when GTK defined
dat_getint(&Y_init_entity_num,"Y_init_entity_num");// //cell number at initilization of Y
dat_getdouble(&case_parameter_1,"case_parameter_1"); //undefined reserved case parameter
dat_getdouble(&case_parameter_2,"case_parameter_2"); //undefined reserved case parameter
dat_getdouble(&case_parameter_3,"case_parameter_3"); //undefined reserved case parameter
dat_getdouble(&case_parameter_4,"case_parameter_4"); //undefined reserved case parameter
/* set current time step parameters */
dat_getint(&ntime, "ntime");
dat_getdouble(&dt, "dt");
if (!flg_init)
getstep(fnames.xdi, &rtime_current, &itime);
itime_first = itime;
itime_last = itime_first + ntime;
rtime_first = rtime_current;
rtime_last = rtime_current + dt * ntime;
/* print fluid parameters */
prparam();
}
/* Distibute to other nodes */
com_dist_param(); /* distribute file names */
com_dist_map(); /* distribute map */
com_dist_data(); /* distribute table */
I located where the error occured, it is in com_dist_param().
void com_dist_param(void)
{
/* This is all there is to it, curtesy of derived datatypes */
MPI_Bcast(&fnames, 1, fnames_type, 0, MPI_COMM_WORLD);
/* params starts with address of fbody_x */
MPI_Bcast(&fbody_x, 1, params_type, 0, MPI_COMM_WORLD);
}
Errors occured when it came into MPI_Bcast. Fnames:
typedef struct
{
char xdi[MAXSTR]; /* xd in */
char xdo[MAXSTR]; /* xd out */
char dat[MAXSTR]; /* data */
char map[MAXSTR]; /* domain map */
char msh[MAXSTR]; /* mesh points */
char y3d[MAXSTR]; /* Y3D input */
char log[MAXSTR]; /* logging */
char mom[MAXSTR]; /* stats moments */
}
Fnames;
and it defined the fnames_type as MPI_Datatype:
MPI_Datatype
make_fnames_type(void)
{
MPI_Datatype fnames_type;
MPI_Datatype type[8];
int blocklen[8];
MPI_Aint disp[8];
int base, i;
/* define it */
MPI_Address(&fnames.xdi, disp);
MPI_Address(&fnames.xdo, disp + 1);
MPI_Address(&fnames.dat, disp + 2);
MPI_Address(&fnames.map, disp + 3);
MPI_Address(&fnames.msh, disp + 4);
MPI_Address(&fnames.y3d, disp + 5);
MPI_Address(&fnames.log, disp + 6);
MPI_Address(&fnames.mom, disp + 7);
base = disp[0];
for (i = 0; i < 8; i++) {
type[i] = MPI_CHAR;
blocklen[i] = MAXSTR;
disp[i] -= base;
/*
printf("disp[i] = %d blocklen[i] = %d\n", disp[i], blocklen[i]);
*/
}
/* create it */
MPI_Type_struct(8, blocklen, disp, type, &fnames_type);
MPI_Type_commit(&fnames_type);
return (fnames_type);
}
Could anyone help me? Thanks a lot!