I want to run a Fortran program which calls a subroutine that I want to parallelize with MPI. I know this sounds complicated, but I want to be able to specify the number of processes for each call. What I would want to use is a structure like this:
program my_program
implicit none
!Define variables
nprocs = !formula for calculating number of processes.
call my_subroutine(output,nprocs,other input vars)
end my_program
I want to run my_subroutine with the same effect as this:
mpirun -n nprocs my_subroutine.o
where my_subroutine has been compiled with 'other input vars.'
Is this possible?
Here is a simple example. I try compiling as follows: $ mpif90 -o my_program WAVE_2D_FP_TUNER_mpi.f90 randgen.f SIMPLE_ROUTINE.f90 I try to run it like this: $ mpirun -np (1 or 2) my_program
PROGRAM WAVE_2D_FP_TUNER_mpi
USE MPI
IMPLICIT NONE
REAL(KIND=8) :: T,PARAM(1:3),Z,ZBQLU01
REAL(KIND=8) :: ERRORS,COSTS,CMAX,CMAX_V(1:1000),THRESHOLD,Z_MIN,Z_MAX
REAL(KIND=8) :: U,S,R(1:6),MATRIX(1:15)
INTEGER :: EN,INC,I,J,M,P
INTEGER :: NPROCS,IERR
!0.8,-0.4,0.4,10,4,4,7 -- [0.003,0.534]
!0.8,-0.2,0.2,10,4,4,7 -- [0.190,0.588]
CALL MPI_INIT(IERR)
CALL MPI_COMM_SIZE(MPI_COMM_WORLD,NPROCS,IERR)
THRESHOLD = 0.D0
EN = 81
INC = 1
Z_MIN = -2.D-1; Z_MAX = 2.D-1
T = 1.D0
PARAM(1) = 10.D0; PARAM(2) = 4.D0; PARAM(3) = 4.D0
CMAX = 7.D0 !Max that wave speed could possibly be.
CALL ZBQLINI(0.D0)
OPEN(UNIT = 1, FILE = "TUNER_F.txt")
WRITE(1,*) 'Grid Size: '
WRITE(1,*) T/(EN-1)
DO P = 1,15
S = 0
Z = Z_MIN + (1.d0/(15-1))*dble((P-1))*(Z_MAX - Z_MIN)
WRITE(1,*) 'Z: ',Z
DO I = 1,1000
DO J = 1,6
R(J) = ZBQLU01(0.D0)
END DO
!CALL PDE_WAVE_F_mpi(T,PARAM,R,Z,CMAX,EN,INC,NPROCS,U)
CALL SIMPLE_ROUTINE(T,PARAM,R,Z,CMAX,EN,INC,NPROCS,U)
IF (U<=threshold) THEN
S = S + 1.D0
ELSE
S = S + 0.D0
END IF
END DO
MATRIX(P) = (1.D0/1000)*S
END DO
DO I = 1,15
WRITE(1,*) MATRIX(I)
END DO
PRINT *,MINVAL(MATRIX)
PRINT *,MAXVAL(MATRIX)
CLOSE(1)
CALL MPI_FINALIZE(IERR)
END PROGRAM WAVE_2D_FP_TUNER_mpi
Here is the subroutine that I wish to parallelize with mpi.
SUBROUTINE SIMPLE_ROUTINE(T,PARAM,R,Z,CMAX,EN,INC,NPROCS,U)
! Outputs scalar U = T*Z*CMAX*INC*SUM(PARAM)*SUM(R)*SUM(Y)
USE MPI
IMPLICIT NONE
REAL(KIND=8), INTENT(IN) :: T,PARAM(1:3),R(1:6),Z,CMAX
INTEGER, INTENT(IN) :: EN,INC
INTEGER, INTENT(IN) :: NPROCS
REAL(KIND=8), INTENT(OUT) :: U
REAL(KIND=8) :: H,LOCAL_SUM,SUM_OF_X
REAL(KIND=8), DIMENSION(:), ALLOCATABLE :: X
INTEGER :: PX,PX_MAX,NXL,REMX,IX_OFF,P_LEFT,P_RIGHT
INTEGER :: J
INTEGER :: IERR,MYID
! Broadcast nprocs handle to all processes in MPRI_COMM_WORLD
CALL MPI_BCAST(&NPROCS, NPROCS, MPI_INT, 0, MPI_COMM_WORLD,IERR)
! Create subcommunicator SUBCOMM (Do not know how to define WORLD_GROUP?)
CALL MPI_COMM_SPLIT(MPI_COMM_WORLD,WORLD_GROUP,SUBCOMM,IERR)
! Assign IDs to processes in SUBCOMM
CALL MPI_COMM_RANK(SUBCOMM,MYID,IERR)
! Give NPROCS - 1 to SUBCOMM
CALL MPI_COMM_SIZE(SUBCOMM,NPROCS-1,IERR)
H = 2.D0/(EN-1)
! LABEL THE PROCESSES FROM 1 TO PX_MAX.
PX = MYID + 1
PX_MAX = NPROCS
! SPLIT UP THE GRID IN THE X-DIRECTION.
NXL = EN/PX_MAX !nxl = 10/3 = 3
REMX = EN-NXL*PX_MAX !remx = 10-3*3 = 1
IF (PX .LE. REMX) THEN !for px = 1,nxl = 3
NXL = NXL+1 !nxl = 4
IX_OFF = (PX-1)*NXL !ix_off = 0
ELSE
IX_OFF = REMX*(NXL+1)+(PX-(REMX+1))*NXL !for px = 2 and px = 3, ix_off = 1*(3+1)+(2-(1+1))*3 = 4, ix_off = 1*(3+1)+(3-(1+1))*3 = 7
END IF
! ALLOCATE MEMORY FOR VARIOUS ARRAYS.
ALLOCATE(X(0:NXL+1))
X(:) = (/(-1.D0+DBLE(J-1+IX_OFF)*H, J=1,EN)/)
LOCAL_SUM = SUM(X(1:NXL))
CALL MPI_REDUCE(LOCAL_SUM,SUM_OF_X,1,&
MPI_DOUBLE_PRECISION,MPI_SUM,&
0,MPI_COMM_WORLD,IERR)
U = T*Z*CMAX*INC*SUM(PARAM)*SUM(R)*SUM_OF_X
DEALLOCATE(X)
CALL MPI_COMM_FREE(SUBCOMM,IERR)
CALL MPI_BARRIER(MPI_COMM_WORLD,IERR)
END SUBROUTINE SIMPLE_ROUTINE
Ultimately, I want to be able to change the number of processors used in the subroutine, where I want nprocs to be calculated from the value of EN.