#### Topic: Matrix Multiply Speed Test

Jeff,
This is an informative message regarding 2D-matrix multiplication speed using various approaches.
I was a little surprised at the difference in speed between MatMMult and MatNMult routines, and how slow INTRINSIC DOT_PRODUCT appears to be compared to INTRINSIC MATMUL procedure.

I've attached the SF Speed Test Program in the hope that it may be informative for other users of SF.

Frank

!--- TEST SPEED OF 2-D MATRIX MULTIPLICATION
!
!--- TEST RESULTS (DELL 64-BIT WINDOWS 7)
!----------------------------------------
!    MMult
!    D(:,:) = MATMUL(A,B)
!    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
!    start time:  0.00000000
!    A[ 4096,   16 ]
!    B[   16, 4096 ]
!    finish time:  1.76281000
!    Run time =   0.35256200 seconds.
!    Relative run time:  1.00000000
!
!    MatMMult
!    D(1:M,I) = D(1:M,I) + A(1:M,J)*B(J,I)
!    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
!    start time:  1.76281000
!    finish time:  5.05443200
!    Run time =   0.65832440 seconds.
!    Relative run time:  1.86725852
!
!    MatDOTMult ( USES DOT_PRODUCT )
!    D(I,J) = DOT_PRODUCT(A(I,1:N),B(1:N,J))
!    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
!    start time:  5.05443200
!    finish time: 10.71726800
!    Run time =   1.13256720 seconds.
!    Relative run time:  3.21239158
!
!    MatMult
!    D(I,J) = D(I,J) + A(I,K)*B(K,J)
!    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
!    start time: 10.71726800
!    finish time: 33.75861500
!    Run time =   4.60826940 seconds.
!    Relative run time: 13.07080570
!
!    MatNMult
!    D(I,1:M) = D(I,1:M) + A(I,J)*B(J,1:M)
!    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
!    start time: 33.75861500
!    finish time: 65.45801800
!    Run time =   6.33988060 seconds.
!    Relative run time: 17.98231403
!
PROGRAM MAIN

IMPLICIT NONE

INTEGER*4 :: I, J, K
INTEGER*4 :: N, NR, NC, ND
REAL*8 :: A, B, C, D, E
REAL*8 :: START, FINISH, BEST, R

ALLOCATABLE A(:,:), B(:,:)
ALLOCATABLE C(:,:), D(:,:)
ALLOCATABLE E(:,:)

!--- RANDOM INPUT MATRIX ROWS AND COLUMNS
!--- 4096 X  16 = 65536
!---  256 X 256 = 65536
NR = 4096    !256     ! NUMBER OF ROWS
NC = 16      !256     ! NUMBER OF COLUMNS
ND = 1         ! START OF INPUT ARRAY
N = 5          ! NUMBER OF CYCLES

!--- DEFINE ARRAYS
ALLOCATE ( A(NR,NC), C(NR,NC) )
ALLOCATE ( B(NC,NR), D(NC,NR) )
ALLOCATE ( E(NR,NR) )

!--- RANDOMIZE INITIAL 2-D MATRICES A AND B
CALL cpu_time(R)
K = 0   !INT( 10000.D0 * R )
A(:,:) = 1.D0
B(:,:) = 1.D0
DO I = 1, NR
DO J = 1, NC
A(I,J) = A(I,J)  * 1.4423D-01*(RAND(K) - 0.5D0)
B(J,I) = B(J,I)  * 2.4423D-01*(RAND(K) - 0.5D0)
END DO
END DO

!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
!--- MMULT( A, B )
WRITE(*,'(1X,A)') 'MMult'
WRITE(*,*) REPEAT('~',30)
!--- COPY INITIAL MATRIX A[ , ]
C(:,:) = A(:,:)
!--- COPY INITIAL MATRIX B[ , ]
D(:,:) = B(:,:)
call cpu_time(START)
WRITE(*,'(1X,A,F12.8)') 'start time:', START
DO K = 1, N
!--- INTRINSIC MATRIX MULTIPLY
CALL MMult(C,D,NR,NC,E)
END DO
call cpu_time(FINISH)
WRITE(*,'(1X,A2,I5,A1,I5,A2)') 'A[',SIZE(A,1),',',SIZE(A,2),' ]'
WRITE(*,'(1X,A2,I5,A1,I5,A2)') 'B[',SIZE(B,1),',',SIZE(B,2),' ]'
WRITE(*,'(1X,A,F12.8)') 'finish time:', FINISH
WRITE(*,'(1X,A,F12.8,A)') 'Run time = ', (finish-start)/N, ' seconds.'
BEST = finish - start
WRITE(*,'(1X,A18,F12.8/)') 'Relative run time:', (finish-start)/BEST

!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
!--- MATMMULT( A, B )
WRITE(*,'(1X,A)') 'MatMMult'
WRITE(*,*) REPEAT('~',30)
!--- COPY INITIAL MATRIX A[ , ]
C(:,:) = A(:,:)
!--- COPY INITIAL MATRIX B[ , ]
D(:,:) = B(:,:)
call cpu_time(START)
WRITE(*,'(1X,A,F12.8)') 'start time:', START
DO K = 1, N
!--- CODED MATRIX MULTIPLY
E(:,:) = 0.D0
CALL MatMMult( C, D, NR, NC, E )
END DO
call cpu_time(FINISH)
WRITE(*,'(1X,A,F12.8)') 'finish time:', FINISH
WRITE(*,'(1X,A,F12.8,A)') 'Run time = ', (finish-start)/N, ' seconds.'
WRITE(*,'(1X,A18,F12.8/)') 'Relative run time:', (finish-start)/BEST

!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
!--- MATDOTMULT( A, B )
WRITE(*,'(1X,A)') 'MatDOTMult'
WRITE(*,*) REPEAT('~',30)

!--- COPY INITIAL MATRIX A[ , ]
C(:,:) = A(:,:)
!--- COPY INITIAL MATRIX B[ , ]
D(:,:) = B(:,:)
call cpu_time(START)
WRITE(*,'(1X,A,F12.8)') 'start time:', START
DO K = 1, N
!--- CODED MATRIX MULTIPLY
E(:,:) = 0.D0
CALL MatDotMult( C, D, NR, NC, E )
END DO
call cpu_time(FINISH)
WRITE(*,'(1X,A,F12.8)') 'finish time:', FINISH
WRITE(*,'(1X,A,F12.8,A)') 'Run time = ', (finish-start)/N, ' seconds.'
WRITE(*,'(1X,A18,F12.8/)') 'Relative run time:', (finish-start)/BEST

!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
!--- MATMULT( A, B )
WRITE(*,'(1X,A)') 'MatMult'
WRITE(*,*) REPEAT('~',30)
!--- COPY INITIAL MATRIX A[ , ]
C(:,:) = A(:,:)
!--- COPY INITIAL MATRIX B[ , ]
D(:,:) = B(:,:)
call cpu_time(START)
WRITE(*,'(1X,A,F12.8)') 'start time:', START
DO K = 1, N
!--- CODED MATRIX MULTIPLY
E(:,:) = 0.D0
CALL MatMult( C, D, NR, NC, E )
END DO
call cpu_time(FINISH)
WRITE(*,'(1X,A,F12.8)') 'finish time:', FINISH
WRITE(*,'(1X,A,F12.8,A)') 'Run time = ', (finish-start)/N, ' seconds.'
WRITE(*,'(1X,A18,F12.8/)') 'Relative run time:', (finish-start)/BEST

!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
!--- MATNMULT( A, B )
WRITE(*,'(1X,A)') 'MatNMult'
WRITE(*,*) REPEAT('~',30)
!--- COPY INITIAL MATRIX A[ , ]
C(:,:) = A(:,:)
!--- COPY INITIAL MATRIX B[ , ]
D(:,:) = B(:,:)
call cpu_time(START)
WRITE(*,'(1X,A,F12.8)') 'start time:', START
DO K = 1, N
!--- CODED MATRIX MULTIPLY
E(:,:) = 0.D0
CALL MatNMult( C, D, NR, NC, E )
END DO
call cpu_time(FINISH)
WRITE(*,'(1X,A,F12.8)') 'finish time:', FINISH
WRITE(*,'(1X,A,F12.8,A)') 'Run time = ', (finish-start)/N, ' seconds.'
WRITE(*,'(1X,A18,F12.8/)') 'Relative run time:', (finish-start)/BEST

!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
WRITE(*,*) REPEAT('~',30)
WRITE(*,'(A//)') 'End of Run...'

DEALLOCATE ( A, B, C, D, E )

END PROGRAM MAIN

!****************************************************************
!       MATRIX MULTIPLY A x B SUBROUTINES
!
!****************************************************************
SUBROUTINE MMult(A,B,M,N,C)
!--- INTRINSIC MATRIX MULTIPLY
!****************************************************************
IMPLICIT NONE
INTEGER*4, INTENT(IN)   :: M, N
REAL*8, INTENT(IN)      :: A(M,N), B(N,M)
REAL*8, INTENT(OUT)     :: C(M,M)
REAL*8 :: D
ALLOCATABLE :: D(:,:)
ALLOCATE ( D(M,M) )
D(:,:) = 0.0D0
D(:,:) = MATMUL( A, B )
!--- COPY ZERO MATRIX D[] TO C[]
C(:,:) = D(:,:)
DEALLOCATE ( D )
RETURN
END SUBROUTINE MMult

!****************************************************************
SUBROUTINE MatMult(A,B,M,N,C)
!--- Square matrix where m = n
!--- A[m,n] x B[n,m] = C[m,m] using index order I, J, and K.
!****************************************************************
IMPLICIT NONE
INTEGER*4, INTENT(IN)   :: M, N
REAL*8, INTENT(IN)      :: A(M,N), B(N,M)
REAL*8, INTENT(OUT)     :: C(M,M)
REAL*8 :: D
INTEGER*4 :: I, J, K
ALLOCATABLE :: D(:,:)
ALLOCATE ( D(M,M) )
D(:,:) = 0.0D0
DO I = 1, M
DO J = 1, M
DO K = 1, N
D(I,J) = D(I,J) + A(I,K)*B(K,J)
END DO
END DO
END DO
!--- COPY ZERO MATRIX D[] TO C[]
C(:,:) = D(:,:)
DEALLOCATE ( D )
RETURN
END SUBROUTINE MATMult

!****************************************************************
SUBROUTINE MatNMult(A,B,M,N,C)
!--- Multiiplies A = B*C using implicit K index in 2nd dimension
!****************************************************************
IMPLICIT NONE
INTEGER*4, INTENT(IN)   :: M, N
REAL*8, INTENT(IN)      :: A(M,N), B(N,M)
REAL*8, INTENT(OUT)     :: C(M,M)
REAL*8 :: D
INTEGER*4 :: I, J
ALLOCATABLE :: D(:,:)
ALLOCATE ( D(M,M) )
D(:,:) = 0.0D0
DO I = 1, M
DO J = 1, N
D(I,1:M) = D(I,1:M) + A(I,J)*B(J,1:M)
END DO
END DO
!--- COPY ZERO MATRIX D[] TO C[]
C(:,:) = D(:,:)
DEALLOCATE ( D )
RETURN
END SUBROUTINE MatNMult

!****************************************************************
SUBROUTINE MatMMult(A,B,M,N,C)
!--- Multiiplies A = B*C using implicit K index in 1st dimension
!****************************************************************
IMPLICIT NONE
INTEGER*4, INTENT(IN)   :: M, N
REAL*8, INTENT(IN)      :: A(M,N), B(N,M)
REAL*8, INTENT(OUT)     :: C(M,M)
REAL*8 :: D
INTEGER*4 :: I, J
ALLOCATABLE :: D(:,:)
ALLOCATE ( D(M,M) )
D(:,:) = 0.0D0
DO I = 1, M
DO J = 1, N
D(1:M,I) = D(1:M,I) + A(1:M,J)*B(J,I)
END DO
END DO
!--- COPY ZERO MATRIX D[] TO C[]
C(:,:) = D(:,:)
DEALLOCATE ( D )
RETURN
END SUBROUTINE MatMMult

!****************************************************************
SUBROUTINE MatDotMult(A,B,M,N,C)
!****************************************************************
IMPLICIT NONE
INTEGER*4, INTENT(IN)   :: M, N
REAL*8, INTENT(IN)      :: A(M,N), B(N,M)
REAL*8, INTENT(OUT)     :: C(M,M)
REAL*8 :: D
INTEGER*4 :: I, J
ALLOCATABLE :: D(:,:)
ALLOCATE ( D(M,M) )
D(:,:) = 0.0D0
DO I = 1, M
DO J = 1, M
D(I,J) = DOT_PRODUCT( A(I,1:N), B(1:N,J) )
END DO
END DO
!--- COPY ZERO MATRIX D[] TO C[]
C(:,:) = D(:,:)
DEALLOCATE ( D )
RETURN
END SUBROUTINE MatDotMult
!****************************************************************
!       END OF MATRIX MULTIPLY SUBROUTINES
!****************************************************************

#### Re: Matrix Multiply Speed Test

Frank,

Thank you for the interesting program!  I'll have to try it myself.  Our runtime library that is currently included is not particularly optimized to ensure portability, though we're looking at changing that.

You might get an additional speed boost by specifying the compiler flag

``-fexternal-blas``

and enabling LAPACK and BLAS in Project Options.  However, I'm not sure the gains would be particularly great.

Jeff Armstrong
Approximatrix, LLC