0
votes

I get a seg-fault applying valgrind to a very simple MPI program:

#include "mpi.h"
#include <iostream>
#include<stdio.h>
#include<stdlib.h>

/// Minimal MPI "hello world": each process prints the rank it was assigned.
///
/// @param argc  Argument count; its address is handed to MPI_Init.
/// @param argv  Argument vector; its address is handed to MPI_Init.
/// @return 0 once MPI_Finalize has been called.
int main(int argc, char *argv[])
{
    // The MPI runtime must be initialized before any other MPI call below.
    MPI_Init(&argc, &argv);

    // Rank of this process within MPI_COMM_WORLD.
    int rank = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    // Number of processes in MPI_COMM_WORLD; queried but not printed.
    int numProcess = 0;
    MPI_Comm_size(MPI_COMM_WORLD, &numProcess);

    std::cout << "Hello world, - Rank " << rank << "\n";

    MPI_Finalize();
    return 0;
}

Calling mpirun -np 2 ./mpi_test works as expected. However, mpirun -np 2 valgrind ./mpi_test returns a long list of errors and does not say Hello world. I am aware that valgrind can report false positives with MPI, but here it will not even run a simple hello world program. Below are the errors I get.

==85595== Memcheck, a memory error detector
==85595== Copyright (C) 2002-2017, and GNU GPL'd, by Julian Seward et al.
==85595== Using Valgrind-3.13.0 and LibVEX; rerun with -h for copyright info
==85595== Command: ./mpi_test
==85595== 
==85596== Memcheck, a memory error detector
==85596== Copyright (C) 2002-2017, and GNU GPL'd, by Julian Seward et al.
==85596== Using Valgrind-3.13.0 and LibVEX; rerun with -h for copyright info
==85596== Command: ./mpi_test
==85596== 
==85596== Syscall param msg->desc.port.name points to uninitialised byte(s)
==85596==    at 0x10070B34A: mach_msg_trap (in /usr/lib/system/libsystem_kernel.dylib)
==85596==    by 0x10070A796: mach_msg (in /usr/lib/system/libsystem_kernel.dylib)
==85596==    by 0x100704485: task_set_special_port (in /usr/lib/system/libsystem_kernel.dylib)
==85596==    by 0x1008A010E: _os_trace_create_debug_control_port (in /usr/lib/system/libsystem_trace.dylib)
==85596==    by 0x1008A0458: _libtrace_init (in /usr/lib/system/libsystem_trace.dylib)
==85596==    by 0x10026B9DF: libSystem_initializer (in /usr/lib/libSystem.B.dylib)
==85596==    by 0x10001AA1A: ImageLoaderMachO::doModInitFunctions(ImageLoader::LinkContext const&) (in /usr/lib/dyld)
==85596==    by 0x10001AC1D: ImageLoaderMachO::doInitialization(ImageLoader::LinkContext const&) (in /usr/lib/dyld)
==85596==    by 0x1000164A9: ImageLoader::recursiveInitialization(ImageLoader::LinkContext const&, unsigned int, char const*, ImageLoader::InitializerTimingList&, ImageLoader::UninitedUpwards&) (in /usr/lib/dyld)
==85596==    by 0x100016440: ImageLoader::recursiveInitialization(ImageLoader::LinkContext const&, unsigned int, char const*, ImageLoader::InitializerTimingList&, ImageLoader::UninitedUpwards&) (in /usr/lib/dyld)
==85596==    by 0x100015523: ImageLoader::processInitializers(ImageLoader::LinkContext const&, unsigned int, ImageLoader::InitializerTimingList&, ImageLoader::UninitedUpwards&) (in /usr/lib/dyld)
==85596==    by 0x1000155B8: ImageLoader::runInitializers(ImageLoader::LinkContext const&, ImageLoader::InitializerTimingList&) (in /usr/lib/dyld)
==85596==  Address 0x10488d25c is on thread 1's stack
==85596==  in frame #2, created by task_set_special_port (???:)
==85596== 
==85595== Syscall param msg->desc.port.name points to uninitialised byte(s)
==85595==    at 0x10070B34A: mach_msg_trap (in /usr/lib/system/libsystem_kernel.dylib)
==85595==    by 0x10070A796: mach_msg (in /usr/lib/system/libsystem_kernel.dylib)
==85595==    by 0x100704485: task_set_special_port (in /usr/lib/system/libsystem_kernel.dylib)
==85595==    by 0x1008A010E: _os_trace_create_debug_control_port (in /usr/lib/system/libsystem_trace.dylib)
==85595==    by 0x1008A0458: _libtrace_init (in /usr/lib/system/libsystem_trace.dylib)
==85595==    by 0x10026B9DF: libSystem_initializer (in /usr/lib/libSystem.B.dylib)
==85595==    by 0x10001AA1A: ImageLoaderMachO::doModInitFunctions(ImageLoader::LinkContext const&) (in /usr/lib/dyld)
==85595==    by 0x10001AC1D: ImageLoaderMachO::doInitialization(ImageLoader::LinkContext const&) (in /usr/lib/dyld)
==85595==    by 0x1000164A9: ImageLoader::recursiveInitialization(ImageLoader::LinkContext const&, unsigned int, char const*, ImageLoader::InitializerTimingList&, ImageLoader::UninitedUpwards&) (in /usr/lib/dyld)
==85595==    by 0x100016440: ImageLoader::recursiveInitialization(ImageLoader::LinkContext const&, unsigned int, char const*, ImageLoader::InitializerTimingList&, ImageLoader::UninitedUpwards&) (in /usr/lib/dyld)
==85595==    by 0x100015523: ImageLoader::processInitializers(ImageLoader::LinkContext const&, unsigned int, ImageLoader::InitializerTimingList&, ImageLoader::UninitedUpwards&) (in /usr/lib/dyld)
==85595==    by 0x1000155B8: ImageLoader::runInitializers(ImageLoader::LinkContext const&, ImageLoader::InitializerTimingList&) (in /usr/lib/dyld)
==85595==  Address 0x10488d25c is on thread 1's stack
==85595==  in frame #2, created by task_set_special_port (???:)
==85595== 
--85595-- UNKNOWN task message [id 3445, to mach_task_self(), reply 0x707]
--85596-- UNKNOWN task message [id 3445, to mach_task_self(), reply 0x707]
--85595-- UNKNOWN task message [id 3445, to mach_task_self(), reply 0x707] (repeated 2 times)
--85596-- UNKNOWN task message [id 3445, to mach_task_self(), reply 0x707] (repeated 2 times)
--85596-- UNKNOWN mach_msg unhandled MACH_SEND_TRAILER option
--85595-- UNKNOWN mach_msg unhandled MACH_SEND_TRAILER option
--85596-- UNKNOWN mach_msg unhandled MACH_SEND_TRAILER option (repeated 2 times)
--85595-- UNKNOWN mach_msg unhandled MACH_SEND_TRAILER option (repeated 2 times)
--85596-- UNKNOWN mach_msg unhandled MACH_SEND_TRAILER option (repeated 4 times)
--85595-- UNKNOWN mach_msg unhandled MACH_SEND_TRAILER option (repeated 4 times)
--85595-- UNKNOWN mach_msg unhandled MACH_SEND_TRAILER option (repeated 8 times)
--85596-- UNKNOWN mach_msg unhandled MACH_SEND_TRAILER option (repeated 8 times)
==85595== Thread 2:
==85595== Invalid read of size 4
==85595==    at 0x100868899: _pthread_body (in /usr/lib/system/libsystem_pthread.dylib)
==85595==    by 0x100868886: _pthread_start (in /usr/lib/system/libsystem_pthread.dylib)
==85595==    by 0x10086808C: thread_start (in /usr/lib/system/libsystem_pthread.dylib)
==85595==  Address 0x18 is not stack'd, malloc'd or (recently) free'd
==85595== 
==85595== Invalid read of size 8
==85595==    at 0x100866435: _pthread_mutex_lock_slow (in /usr/lib/system/libsystem_pthread.dylib)
==85595==    by 0x100560117: dyldGlobalLockAcquire() (in /usr/lib/system/libdyld.dylib)
==85595==    by 0x100021F95: ImageLoaderMachOCompressed::doBindFastLazySymbol(unsigned int, ImageLoader::LinkContext const&, void (*)(), void (*)()) (in /usr/lib/dyld)
==85595==    by 0x10000986C: dyld::fastBindLazySymbol(ImageLoader**, unsigned long) (in /usr/lib/dyld)
==85595==    by 0x100560281: dyld_stub_binder (in /usr/lib/system/libdyld.dylib)
==85595==    by 0x100382977: ??? (in /usr/local/Cellar/open-mpi/3.0.0_2/lib/libopen-pal.40.dylib)
==85595==    by 0x25805BBB1: ???
==85595==    by 0x100868886: _pthread_start (in /usr/lib/system/libsystem_pthread.dylib)
==85595==    by 0x10086808C: thread_start (in /usr/lib/system/libsystem_pthread.dylib)
==85595==  Address 0x0 is not stack'd, malloc'd or (recently) free'd
==85595== 
==85595== 
==85595== Process terminating with default action of signal 11 (SIGSEGV)
==85595==  Access not within mapped region at address 0x0
==85595==    at 0x100866435: _pthread_mutex_lock_slow (in /usr/lib/system/libsystem_pthread.dylib)
==85595==    by 0x100560117: dyldGlobalLockAcquire() (in /usr/lib/system/libdyld.dylib)
==85595==    by 0x100021F95: ImageLoaderMachOCompressed::doBindFastLazySymbol(unsigned int, ImageLoader::LinkContext const&, void (*)(), void (*)()) (in /usr/lib/dyld)
==85595==    by 0x10000986C: dyld::fastBindLazySymbol(ImageLoader**, unsigned long) (in /usr/lib/dyld)
==85595==    by 0x100560281: dyld_stub_binder (in /usr/lib/system/libdyld.dylib)
==85595==    by 0x100382977: ??? (in /usr/local/Cellar/open-mpi/3.0.0_2/lib/libopen-pal.40.dylib)
==85595==    by 0x25805BBB1: ???
==85595==    by 0x100868886: _pthread_start (in /usr/lib/system/libsystem_pthread.dylib)
==85595==    by 0x10086808C: thread_start (in /usr/lib/system/libsystem_pthread.dylib)
==85595==  If you believe this happened as a result of a stack
==85595==  overflow in your program's main thread (unlikely but
==85595==  possible), you can try to increase the size of the
==85595==  main thread stack using the --main-stacksize= flag.
==85595==  The main thread stack size used in this run was 8388608.
--85595:0:schedule VG_(sema_down): read returned -4
==85595== 
==85595== HEAP SUMMARY:
==85595==     in use at exit: 358,368 bytes in 3,295 blocks
==85595==   total heap usage: 5,625 allocs, 2,330 frees, 721,547 bytes allocated
==85595== 
==85596== Thread 2:
==85596== Invalid read of size 4
==85596==    at 0x100868899: _pthread_body (in /usr/lib/system/libsystem_pthread.dylib)
==85596==    by 0x100868886: _pthread_start (in /usr/lib/system/libsystem_pthread.dylib)
==85596==    by 0x10086808C: thread_start (in /usr/lib/system/libsystem_pthread.dylib)
==85596==  Address 0x18 is not stack'd, malloc'd or (recently) free'd
==85596== 
==85596== Invalid read of size 8
==85596==    at 0x100866435: _pthread_mutex_lock_slow (in /usr/lib/system/libsystem_pthread.dylib)
==85596==    by 0x100560117: dyldGlobalLockAcquire() (in /usr/lib/system/libdyld.dylib)
==85596==    by 0x100021F95: ImageLoaderMachOCompressed::doBindFastLazySymbol(unsigned int, ImageLoader::LinkContext const&, void (*)(), void (*)()) (in /usr/lib/dyld)
==85596==    by 0x10000986C: dyld::fastBindLazySymbol(ImageLoader**, unsigned long) (in /usr/lib/dyld)
==85596==    by 0x100560281: dyld_stub_binder (in /usr/lib/system/libdyld.dylib)
==85596==    by 0x100382977: ??? (in /usr/local/Cellar/open-mpi/3.0.0_2/lib/libopen-pal.40.dylib)
==85596==    by 0x25805BBB1: ???
==85596==    by 0x100868886: _pthread_start (in /usr/lib/system/libsystem_pthread.dylib)
==85596==    by 0x10086808C: thread_start (in /usr/lib/system/libsystem_pthread.dylib)
==85596==  Address 0x0 is not stack'd, malloc'd or (recently) free'd
==85596== 
==85596== 
==85596== Process terminating with default action of signal 11 (SIGSEGV)
==85596==  Access not within mapped region at address 0x0
==85596==    at 0x100866435: _pthread_mutex_lock_slow (in /usr/lib/system/libsystem_pthread.dylib)
==85596==    by 0x100560117: dyldGlobalLockAcquire() (in /usr/lib/system/libdyld.dylib)
==85596==    by 0x100021F95: ImageLoaderMachOCompressed::doBindFastLazySymbol(unsigned int, ImageLoader::LinkContext const&, void (*)(), void (*)()) (in /usr/lib/dyld)
==85596==    by 0x10000986C: dyld::fastBindLazySymbol(ImageLoader**, unsigned long) (in /usr/lib/dyld)
==85596==    by 0x100560281: dyld_stub_binder (in /usr/lib/system/libdyld.dylib)
==85596==    by 0x100382977: ??? (in /usr/local/Cellar/open-mpi/3.0.0_2/lib/libopen-pal.40.dylib)
==85596==    by 0x25805BBB1: ???
==85596==    by 0x100868886: _pthread_start (in /usr/lib/system/libsystem_pthread.dylib)
==85596==    by 0x10086808C: thread_start (in /usr/lib/system/libsystem_pthread.dylib)
==85596==  If you believe this happened as a result of a stack
==85596==  overflow in your program's main thread (unlikely but
==85596==  possible), you can try to increase the size of the
==85596==  main thread stack using the --main-stacksize= flag.
==85596==  The main thread stack size used in this run was 8388608.
--85596:0:schedule VG_(sema_down): read returned -4
==85596== 
==85596== HEAP SUMMARY:
==85596==     in use at exit: 358,368 bytes in 3,295 blocks
==85596==   total heap usage: 5,625 allocs, 2,330 frees, 721,547 bytes allocated
==85596== 
==85595== LEAK SUMMARY:
==85595==    definitely lost: 9,159 bytes in 47 blocks
==85595==    indirectly lost: 8,112 bytes in 111 blocks
==85595==      possibly lost: 0 bytes in 0 blocks
==85595==    still reachable: 325,270 bytes in 2,982 blocks
==85595==         suppressed: 15,827 bytes in 155 blocks
==85595== Rerun with --leak-check=full to see details of leaked memory
==85595== 
==85595== For counts of detected and suppressed errors, rerun with: -v
==85595== Use --track-origins=yes to see where uninitialised values come from
==85595== ERROR SUMMARY: 3 errors from 3 contexts (suppressed: 1 from 1)
==85596== LEAK SUMMARY:
==85596==    definitely lost: 3,839 bytes in 1 blocks
==85596==    indirectly lost: 0 bytes in 0 blocks
==85596==      possibly lost: 72 bytes in 3 blocks
==85596==    still reachable: 336,638 bytes in 3,138 blocks
==85596==         suppressed: 17,819 bytes in 153 blocks
==85596== Rerun with --leak-check=full to see details of leaked memory
==85596== 
==85596== For counts of detected and suppressed errors, rerun with: -v
==85596== Use --track-origins=yes to see where uninitialised values come from
==85596== ERROR SUMMARY: 3 errors from 3 contexts (suppressed: 1 from 1)
-------------------------------------------------------
Primary job  terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
-------------------------------------------------------
--------------------------------------------------------------------------
mpirun noticed that process rank 0 with PID 0 on node cu-vpn-colorado-edu-198 exited on signal 11 (Segmentation fault: 11).
--------------------------------------------------------------------------
1
#include "/usr/local/include/mpi.h" Ouch, really? – Lightness Races in Orbit
Looks to me like this is just an incompatibility. – Lightness Races in Orbit
@LightnessRacesinOrbit I don't follow? I included the path to make sure I was using an mpi.h compatible with my MPI compiler. It's actually a pointer to /usr/local/Cellar, where brew installed it. As for compatibility, is valgrind only compatible with some kinds of MPI compiler, or do you mean something else? – Ben Southworth
Writing an absolute path in an #include directive is very unidiomatic and gives your program a short shelf life. If you require a non-standard include path you should pass -I/usr/local/Cellar in your compilation command, and just #include "mpi.h" as normal. – Lightness Races in Orbit
The MPI way is to #include <mpi.h> and then use the mpi{cc,cxx,fort} wrappers, so you do not have to worry about include and library paths, nor about which libraries should be linked. – Gilles Gouaillardet

1 Answer

1
votes

I suspect that there is something wrong with Valgrind and possibly your OS.

I can compile and run your program without any issues on Linux:

tb-xps ../tmp$ mpicxx -o h2 h2.cpp
tb-xps ../tmp$ mpirun -n 1 ./h2
Hello world, - Rank 0
tb-xps ../tmp$ mpirun -n 1 valgrind ./h2
==3544== Memcheck, a memory error detector
==3544== Copyright (C) 2002-2017, and GNU GPL'd, by Julian Seward et al.
==3544== Using Valgrind-3.13.0 and LibVEX; rerun with -h for copyright info
==3544== Command: ./h2
==3544== 
==3544== Thread 3:
==3544== Syscall param epoll_pwait(sigmask) points to unaddressable byte(s)
==3544==    at 0x5F5AFE6: epoll_pwait (in /usr/lib/libc-2.26.so)
==3544==    by 0x6536DDC: ??? (in /usr/lib/openmpi/libopen-pal.so.40.0.0)
==3544==    by 0x653AEDA: opal_libevent2022_event_base_loop (in /usr/lib/openmpi/libopen-pal.so.40.0.0)
==3544==    by 0x90CA0CE: ??? (in /usr/lib/openmpi/openmpi/mca_pmix_pmix2x.so)
==3544==    by 0x5C4E08B: start_thread (in /usr/lib/libpthread-2.26.so)
==3544==    by 0x5F5AE7E: clone (in /usr/lib/libc-2.26.so)
==3544==  Address 0x0 is not stack'd, malloc'd or (recently) free'd
==3544== 
Hello world, - Rank 0
==3544== 
==3544== HEAP SUMMARY:
==3544==     in use at exit: 1,899 bytes in 44 blocks
==3544==   total heap usage: 17,910 allocs, 17,866 frees, 3,993,061 bytes allocated
==3544== 
==3544== LEAK SUMMARY:
==3544==    definitely lost: 372 bytes in 4 blocks
==3544==    indirectly lost: 1,288 bytes in 34 blocks
==3544==      possibly lost: 0 bytes in 0 blocks
==3544==    still reachable: 239 bytes in 6 blocks
==3544==         suppressed: 0 bytes in 0 blocks
==3544== Rerun with --leak-check=full to see details of leaked memory
==3544== 
==3544== For counts of detected and suppressed errors, rerun with: -v
==3544== ERROR SUMMARY: 39 errors from 1 contexts (suppressed: 0 from 0)
tb-xps ../tmp$ uname -a
Linux tb-xps 4.15.3-1-ARCH #1 SMP PREEMPT Mon Feb 12 23:01:17 UTC 2018 x86_64 GNU/Linux

Also, since I am not a C++ programmer, I'd get rid of iostream and just use printf. I'd also rearrange the includes and add some whitespace:

#include <stdio.h>
#include <stdlib.h>
#include <mpi.h> 

Do you have access to a different machine? Are you able to ask a Valgrind mailing list?