I've been trying to run a code using MPI I/O on a large number of cores. The time required for each core to read from and write to a single file (the same for all cores) increases with the number of cores used. I'm currently using 512 cores and this problem is making my project unfeasible. The problem appears, however, even when running on 8 cores; it then takes about 0.2 seconds to read the first real number in the file. On 32 cores it takes more then 30 seconds to write one real number. I'm running it here: https://www.msi.umn.edu/hpc/itasca. The following simple code generates exactly this problem (the counting of the number of elements in the file might seem unnecessary here but it is necessary in my actual code):
PROGRAM MAIN
USE MPI
IMPLICIT NONE
! INITIALIZING VARIABLES
REAL(8) :: A, B
INTEGER :: COUNT_IO, i, j, ST, GO, tag, t, nb_bytes, N, d_each, d_start, d_end, NN
REAL(8) :: time_start, time_end
! VARIABLES RELATED TO MPI
INTEGER :: ierror ! returns error messages from the mpi subroutines
INTEGER :: rank ! identification number of each processor
INTEGER :: nproc ! number of processors
INTEGER, DIMENSION(mpi_status_size):: status
INTEGER(kind= MPI_OFFSET_KIND ) :: offset
INTEGER :: fh ! file handle
! EXECUTABLE
! INITIALIZE THE MPI ENVIRONMENT
CALL MPI_INIT(ierror) ! initialize MPI
CALL MPI_COMM_RANK(MPI_COMM_WORLD,rank,ierror) ! obtain rank for each node
CALL MPI_COMM_SIZE(MPI_COMM_WORLD,nproc,ierror) ! obtain the number of nodes
CALL MPI_TYPE_SIZE(MPI_REAL8,nb_bytes,ierror)
CALL MPI_FILE_OPEN (MPI_COMM_WORLD,"file.dat",MPI_MODE_RDWR+MPI_MODE_UNIQUE_OPEN,MPI_INFO_NULL,fh,ierror)
NN = 2048
DO d_each=1,NN
IF (d_each*nproc>=NN) EXIT
END DO
d_start = rank*d_each+1
d_end = MIN((rank+1)*d_each,NN)
DO t = d_start,d_end
! READING ONE THREAD AT A TIME
tag = 1
GO = 0
IF (rank .gt. 0) THEN
CALL MPI_RECV (GO,1,MPI_INTEGER,rank-1,tag, MPI_COMM_WORLD ,status,ierror)
ENDIF
time_start = MPI_WTIME()
i = 0
ST = 0
COUNT_IO = 0
DO WHILE ((i .lt. 100000) .AND. (ST .eq. 0))
i = i+1
offset = nb_bytes*(i-1)
CALL MPI_FILE_READ_AT (fh,offset,A,1,MPI_REAL8,status,ierror)
IF (status(1) .eq. 0) THEN
COUNT_IO = i
ST = 1
ELSE
COUNT_IO = 0
END IF
ENDDO
N = (COUNT_IO - 1)
IF (N .gt. 0) THEN
offset = 0
CALL MPI_FILE_READ_AT (fh,offset,B,1,MPI_REAL8,status,ierror)
ENDIF
time_end = MPI_WTIME()
PRINT *, 'My rank is', rank, 'Time for read =',time_end-time_start
GO = 1
IF (rank .lt. nproc-1) THEN
CALL MPI_SEND (GO,1, MPI_INTEGER ,rank+1,tag, MPI_COMM_WORLD ,ierror)
ENDIF
CALL MPI_BARRIER(MPI_COMM_WORLD,ierror)
! WRITING ONE THREAD AT A TIME
tag = 2
GO = 0
IF (rank .gt. 0) THEN
CALL MPI_RECV (GO,1,MPI_INTEGER,rank-1,tag, MPI_COMM_WORLD ,status,ierror)
ENDIF
time_start = MPI_WTIME()
i = 0
ST = 0
COUNT_IO = 0
DO WHILE ((i .lt. 100000) .AND. (ST .eq. 0))
i = i+1
offset = nb_bytes*(i-1)
CALL MPI_FILE_READ_AT (fh,offset,A,1,MPI_REAL8,status,ierror)
IF (status(1) .eq. 0) THEN
COUNT_IO = i
ST = 1
ELSE
COUNT_IO = 0
END IF
ENDDO
N = (COUNT_IO - 1)
offset = nb_bytes*N
CALL MPI_FILE_WRITE_AT (fh,offset,0.0D0,1,MPI_REAL8,status,ierror)
time_end = MPI_WTIME()
PRINT *, 'My rank is', rank, 'Time for write =',time_end-time_start
GO = 1
IF (rank .lt. nproc-1) THEN
CALL MPI_SEND (GO,1, MPI_INTEGER ,rank+1,tag, MPI_COMM_WORLD ,ierror)
ENDIF
CALL MPI_BARRIER(MPI_COMM_WORLD,ierror)
ENDDO
CALL MPI_FILE_CLOSE (fh,ierror)
CALL MPI_FINALIZE(ierror)
END PROGRAM MAIN