Topic: Matrix Multiply Speed Test
Jeff,
This is an informative message regarding 2D-matrix multiplication speed using various approaches.
I was a little surprised at the difference in speed between MatMMult and MatNMult routines, and how slow INTRINSIC DOT_PRODUCT appears to be compared to INTRINSIC MATMUL procedure.
I've attached the SF Speed Test Program in the hope that it may be informative for other users of SF.
Frank
!--- TEST SPEED OF 2-D MATRIX MULTIPLICATION
!
!--- TEST RESULTS (DELL 64-BIT WINDOWS 7)
!----------------------------------------
! MMult
! D(:,:) = MATMUL(A,B)
! ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
! start time: 0.00000000
! A[ 4096, 16 ]
! B[ 16, 4096 ]
! finish time: 1.76281000
! Run time = 0.35256200 seconds.
! Relative run time: 1.00000000
!
! MatMMult
! D(1:M,I) = D(1:M,I) + A(1:M,J)*B(J,I)
! ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
! start time: 1.76281000
! finish time: 5.05443200
! Run time = 0.65832440 seconds.
! Relative run time: 1.86725852
!
! MatDOTMult ( USES DOT_PRODUCT )
! D(I,J) = DOT_PRODUCT(A(I,1:N),B(1:N,J))
! ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
! start time: 5.05443200
! finish time: 10.71726800
! Run time = 1.13256720 seconds.
! Relative run time: 3.21239158
!
! MatMult
! D(I,J) = D(I,J) + A(I,K)*B(K,J)
! ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
! start time: 10.71726800
! finish time: 33.75861500
! Run time = 4.60826940 seconds.
! Relative run time: 13.07080570
!
! MatNMult
! D(I,1:M) = D(I,1:M) + A(I,J)*B(J,1:M)
! ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
! start time: 33.75861500
! finish time: 65.45801800
! Run time = 6.33988060 seconds.
! Relative run time: 17.98231403
!
PROGRAM MAIN
IMPLICIT NONE
INTEGER*4 :: I, J, K
INTEGER*4 :: N, NR, NC, ND
REAL*8 :: A, B, C, D, E
REAL*8 :: START, FINISH, BEST, R
ALLOCATABLE A(:,:), B(:,:)
ALLOCATABLE C(:,:), D(:,:)
ALLOCATABLE E(:,:)
!--- RANDOM INPUT MATRIX ROWS AND COLUMNS
!--- 4096 X 16 = 65536
!--- 256 X 256 = 65536
NR = 4096 !256 ! NUMBER OF ROWS
NC = 16 !256 ! NUMBER OF COLUMNS
ND = 1 ! START OF INPUT ARRAY
N = 5 ! NUMBER OF CYCLES
!--- DEFINE ARRAYS
ALLOCATE ( A(NR,NC), C(NR,NC) )
ALLOCATE ( B(NC,NR), D(NC,NR) )
ALLOCATE ( E(NR,NR) )
!--- RANDOMIZE INITIAL 2-D MATRICES A AND B
CALL cpu_time(R)
K = 0 !INT( 10000.D0 * R )
A(:,:) = 1.D0
B(:,:) = 1.D0
DO I = 1, NR
DO J = 1, NC
A(I,J) = A(I,J) * 1.4423D-01*(RAND(K) - 0.5D0)
B(J,I) = B(J,I) * 2.4423D-01*(RAND(K) - 0.5D0)
END DO
END DO
!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
!--- MMULT( A, B )
WRITE(*,'(1X,A)') 'MMult'
WRITE(*,*) REPEAT('~',30)
!--- COPY INITIAL MATRIX A[ , ]
C(:,:) = A(:,:)
!--- COPY INITIAL MATRIX B[ , ]
D(:,:) = B(:,:)
call cpu_time(START)
WRITE(*,'(1X,A,F12.8)') 'start time:', START
DO K = 1, N
!--- INTRINSIC MATRIX MULTIPLY
CALL MMult(C,D,NR,NC,E)
END DO
call cpu_time(FINISH)
WRITE(*,'(1X,A2,I5,A1,I5,A2)') 'A[',SIZE(A,1),',',SIZE(A,2),' ]'
WRITE(*,'(1X,A2,I5,A1,I5,A2)') 'B[',SIZE(B,1),',',SIZE(B,2),' ]'
WRITE(*,'(1X,A,F12.8)') 'finish time:', FINISH
WRITE(*,'(1X,A,F12.8,A)') 'Run time = ', (finish-start)/N, ' seconds.'
BEST = finish - start
WRITE(*,'(1X,A18,F12.8/)') 'Relative run time:', (finish-start)/BEST
!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
!--- MATMMULT( A, B )
WRITE(*,'(1X,A)') 'MatMMult'
WRITE(*,*) REPEAT('~',30)
!--- COPY INITIAL MATRIX A[ , ]
C(:,:) = A(:,:)
!--- COPY INITIAL MATRIX B[ , ]
D(:,:) = B(:,:)
call cpu_time(START)
WRITE(*,'(1X,A,F12.8)') 'start time:', START
DO K = 1, N
!--- CODED MATRIX MULTIPLY
E(:,:) = 0.D0
CALL MatMMult( C, D, NR, NC, E )
END DO
call cpu_time(FINISH)
WRITE(*,'(1X,A,F12.8)') 'finish time:', FINISH
WRITE(*,'(1X,A,F12.8,A)') 'Run time = ', (finish-start)/N, ' seconds.'
WRITE(*,'(1X,A18,F12.8/)') 'Relative run time:', (finish-start)/BEST
!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
!--- MATDOTMULT( A, B )
WRITE(*,'(1X,A)') 'MatDOTMult'
WRITE(*,*) REPEAT('~',30)
!--- COPY INITIAL MATRIX A[ , ]
C(:,:) = A(:,:)
!--- COPY INITIAL MATRIX B[ , ]
D(:,:) = B(:,:)
call cpu_time(START)
WRITE(*,'(1X,A,F12.8)') 'start time:', START
DO K = 1, N
!--- CODED MATRIX MULTIPLY
E(:,:) = 0.D0
CALL MatDotMult( C, D, NR, NC, E )
END DO
call cpu_time(FINISH)
WRITE(*,'(1X,A,F12.8)') 'finish time:', FINISH
WRITE(*,'(1X,A,F12.8,A)') 'Run time = ', (finish-start)/N, ' seconds.'
WRITE(*,'(1X,A18,F12.8/)') 'Relative run time:', (finish-start)/BEST
!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
!--- MATMULT( A, B )
WRITE(*,'(1X,A)') 'MatMult'
WRITE(*,*) REPEAT('~',30)
!--- COPY INITIAL MATRIX A[ , ]
C(:,:) = A(:,:)
!--- COPY INITIAL MATRIX B[ , ]
D(:,:) = B(:,:)
call cpu_time(START)
WRITE(*,'(1X,A,F12.8)') 'start time:', START
DO K = 1, N
!--- CODED MATRIX MULTIPLY
E(:,:) = 0.D0
CALL MatMult( C, D, NR, NC, E )
END DO
call cpu_time(FINISH)
WRITE(*,'(1X,A,F12.8)') 'finish time:', FINISH
WRITE(*,'(1X,A,F12.8,A)') 'Run time = ', (finish-start)/N, ' seconds.'
WRITE(*,'(1X,A18,F12.8/)') 'Relative run time:', (finish-start)/BEST
!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
!--- MATNMULT( A, B )
WRITE(*,'(1X,A)') 'MatNMult'
WRITE(*,*) REPEAT('~',30)
!--- COPY INITIAL MATRIX A[ , ]
C(:,:) = A(:,:)
!--- COPY INITIAL MATRIX B[ , ]
D(:,:) = B(:,:)
call cpu_time(START)
WRITE(*,'(1X,A,F12.8)') 'start time:', START
DO K = 1, N
!--- CODED MATRIX MULTIPLY
E(:,:) = 0.D0
CALL MatNMult( C, D, NR, NC, E )
END DO
call cpu_time(FINISH)
WRITE(*,'(1X,A,F12.8)') 'finish time:', FINISH
WRITE(*,'(1X,A,F12.8,A)') 'Run time = ', (finish-start)/N, ' seconds.'
WRITE(*,'(1X,A18,F12.8/)') 'Relative run time:', (finish-start)/BEST
!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
WRITE(*,*) REPEAT('~',30)
WRITE(*,'(A//)') 'End of Run...'
DEALLOCATE ( A, B, C, D, E )
END PROGRAM MAIN
!****************************************************************
! MATRIX MULTIPLY A x B SUBROUTINES
!
!****************************************************************
SUBROUTINE MMult(A,B,M,N,C)
!--- INTRINSIC MATRIX MULTIPLY
!****************************************************************
IMPLICIT NONE
INTEGER*4, INTENT(IN) :: M, N
REAL*8, INTENT(IN) :: A(M,N), B(N,M)
REAL*8, INTENT(OUT) :: C(M,M)
REAL*8 :: D
ALLOCATABLE :: D(:,:)
ALLOCATE ( D(M,M) )
D(:,:) = 0.0D0
D(:,:) = MATMUL( A, B )
!--- COPY ZERO MATRIX D[] TO C[]
C(:,:) = D(:,:)
DEALLOCATE ( D )
RETURN
END SUBROUTINE MMult
!****************************************************************
SUBROUTINE MatMult(A,B,M,N,C)
!--- Square matrix where m = n
!--- A[m,n] x B[n,m] = C[m,m] using index order I, J, and K.
!****************************************************************
IMPLICIT NONE
INTEGER*4, INTENT(IN) :: M, N
REAL*8, INTENT(IN) :: A(M,N), B(N,M)
REAL*8, INTENT(OUT) :: C(M,M)
REAL*8 :: D
INTEGER*4 :: I, J, K
ALLOCATABLE :: D(:,:)
ALLOCATE ( D(M,M) )
D(:,:) = 0.0D0
DO I = 1, M
DO J = 1, M
DO K = 1, N
D(I,J) = D(I,J) + A(I,K)*B(K,J)
END DO
END DO
END DO
!--- COPY ZERO MATRIX D[] TO C[]
C(:,:) = D(:,:)
DEALLOCATE ( D )
RETURN
END SUBROUTINE MATMult
!****************************************************************
SUBROUTINE MatNMult(A,B,M,N,C)
!--- Multiiplies A = B*C using implicit K index in 2nd dimension
!****************************************************************
IMPLICIT NONE
INTEGER*4, INTENT(IN) :: M, N
REAL*8, INTENT(IN) :: A(M,N), B(N,M)
REAL*8, INTENT(OUT) :: C(M,M)
REAL*8 :: D
INTEGER*4 :: I, J
ALLOCATABLE :: D(:,:)
ALLOCATE ( D(M,M) )
D(:,:) = 0.0D0
DO I = 1, M
DO J = 1, N
D(I,1:M) = D(I,1:M) + A(I,J)*B(J,1:M)
END DO
END DO
!--- COPY ZERO MATRIX D[] TO C[]
C(:,:) = D(:,:)
DEALLOCATE ( D )
RETURN
END SUBROUTINE MatNMult
!****************************************************************
SUBROUTINE MatMMult(A,B,M,N,C)
!--- Multiiplies A = B*C using implicit K index in 1st dimension
!****************************************************************
IMPLICIT NONE
INTEGER*4, INTENT(IN) :: M, N
REAL*8, INTENT(IN) :: A(M,N), B(N,M)
REAL*8, INTENT(OUT) :: C(M,M)
REAL*8 :: D
INTEGER*4 :: I, J
ALLOCATABLE :: D(:,:)
ALLOCATE ( D(M,M) )
D(:,:) = 0.0D0
DO I = 1, M
DO J = 1, N
D(1:M,I) = D(1:M,I) + A(1:M,J)*B(J,I)
END DO
END DO
!--- COPY ZERO MATRIX D[] TO C[]
C(:,:) = D(:,:)
DEALLOCATE ( D )
RETURN
END SUBROUTINE MatMMult
!****************************************************************
SUBROUTINE MatDotMult(A,B,M,N,C)
!****************************************************************
IMPLICIT NONE
INTEGER*4, INTENT(IN) :: M, N
REAL*8, INTENT(IN) :: A(M,N), B(N,M)
REAL*8, INTENT(OUT) :: C(M,M)
REAL*8 :: D
INTEGER*4 :: I, J
ALLOCATABLE :: D(:,:)
ALLOCATE ( D(M,M) )
D(:,:) = 0.0D0
DO I = 1, M
DO J = 1, M
D(I,J) = DOT_PRODUCT( A(I,1:N), B(1:N,J) )
END DO
END DO
!--- COPY ZERO MATRIX D[] TO C[]
C(:,:) = D(:,:)
DEALLOCATE ( D )
RETURN
END SUBROUTINE MatDotMult
!****************************************************************
! END OF MATRIX MULTIPLY SUBROUTINES
!****************************************************************