Topic: Matrix Multiply Speed Test

Jeff,
This is an informative message regarding 2D-matrix multiplication speed using various approaches.
I was a little surprised at the difference in speed between MatMMult and MatNMult routines, and how slow INTRINSIC DOT_PRODUCT appears to be compared to INTRINSIC MATMUL procedure.

I've attached the SF Speed Test Program in the hope that it may be informative for other users of SF.

Frank

        !--- TEST SPEED OF 2-D MATRIX MULTIPLICATION
        !
        !--- TEST RESULTS (DELL 64-BIT WINDOWS 7)
        !----------------------------------------
        !    MMult
        !    D(:,:) = MATMUL(A,B)
        !    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        !    start time:  0.00000000
        !    A[ 4096,   16 ]
        !    B[   16, 4096 ]
        !    finish time:  1.76281000
        !    Run time =   0.35256200 seconds.
        !    Relative run time:  1.00000000
        !
        !    MatMMult
        !    D(1:M,I) = D(1:M,I) + A(1:M,J)*B(J,I)
        !    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        !    start time:  1.76281000
        !    finish time:  5.05443200
        !    Run time =   0.65832440 seconds.
        !    Relative run time:  1.86725852
        !
        !    MatDOTMult ( USES DOT_PRODUCT )
        !    D(I,J) = DOT_PRODUCT(A(I,1:N),B(1:N,J))
        !    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        !    start time:  5.05443200
        !    finish time: 10.71726800
        !    Run time =   1.13256720 seconds.
        !    Relative run time:  3.21239158
        !
        !    MatMult
        !    D(I,J) = D(I,J) + A(I,K)*B(K,J)
        !    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        !    start time: 10.71726800
        !    finish time: 33.75861500
        !    Run time =   4.60826940 seconds.
        !    Relative run time: 13.07080570
        !
        !    MatNMult
        !    D(I,1:M) = D(I,1:M) + A(I,J)*B(J,1:M)
        !    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        !    start time: 33.75861500
        !    finish time: 65.45801800
        !    Run time =   6.33988060 seconds.
        !    Relative run time: 17.98231403
        !
        PROGRAM MAIN

        IMPLICIT NONE

        INTEGER*4 :: I, J, K
        INTEGER*4 :: N, NR, NC, ND
        REAL*8 :: A, B, C, D, E
        REAL*8 :: START, FINISH, BEST, R

        ALLOCATABLE A(:,:), B(:,:)
        ALLOCATABLE C(:,:), D(:,:)
        ALLOCATABLE E(:,:)

        !--- RANDOM INPUT MATRIX ROWS AND COLUMNS
        !--- 4096 X  16 = 65536
        !---  256 X 256 = 65536
        NR = 4096    !256     ! NUMBER OF ROWS
        NC = 16      !256     ! NUMBER OF COLUMNS
        ND = 1         ! START OF INPUT ARRAY
        N = 5          ! NUMBER OF CYCLES

        !--- DEFINE ARRAYS
        ALLOCATE ( A(NR,NC), C(NR,NC) )
        ALLOCATE ( B(NC,NR), D(NC,NR) )
        ALLOCATE ( E(NR,NR) )

        !--- RANDOMIZE INITIAL 2-D MATRICES A AND B
        CALL cpu_time(R)
        K = 0   !INT( 10000.D0 * R )
        A(:,:) = 1.D0
        B(:,:) = 1.D0
        DO I = 1, NR
            DO J = 1, NC
                A(I,J) = A(I,J)  * 1.4423D-01*(RAND(K) - 0.5D0)
                B(J,I) = B(J,I)  * 2.4423D-01*(RAND(K) - 0.5D0)
            END DO
        END DO

        !~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        !--- MMULT( A, B )
        WRITE(*,'(1X,A)') 'MMult'
        WRITE(*,*) REPEAT('~',30)
        !--- COPY INITIAL MATRIX A[ , ]
        C(:,:) = A(:,:)
        !--- COPY INITIAL MATRIX B[ , ]
        D(:,:) = B(:,:)
        call cpu_time(START)
        WRITE(*,'(1X,A,F12.8)') 'start time:', START
        DO K = 1, N
            !--- INTRINSIC MATRIX MULTIPLY
            CALL MMult(C,D,NR,NC,E)
        END DO
        call cpu_time(FINISH)
        WRITE(*,'(1X,A2,I5,A1,I5,A2)') 'A[',SIZE(A,1),',',SIZE(A,2),' ]'
        WRITE(*,'(1X,A2,I5,A1,I5,A2)') 'B[',SIZE(B,1),',',SIZE(B,2),' ]'
        WRITE(*,'(1X,A,F12.8)') 'finish time:', FINISH
        WRITE(*,'(1X,A,F12.8,A)') 'Run time = ', (finish-start)/N, ' seconds.'
        BEST = finish - start
        WRITE(*,'(1X,A18,F12.8/)') 'Relative run time:', (finish-start)/BEST

        !~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        !--- MATMMULT( A, B )
        WRITE(*,'(1X,A)') 'MatMMult'
        WRITE(*,*) REPEAT('~',30)
        !--- COPY INITIAL MATRIX A[ , ]
        C(:,:) = A(:,:)
        !--- COPY INITIAL MATRIX B[ , ]
        D(:,:) = B(:,:)
        call cpu_time(START)
        WRITE(*,'(1X,A,F12.8)') 'start time:', START
        DO K = 1, N
            !--- CODED MATRIX MULTIPLY
            E(:,:) = 0.D0
            CALL MatMMult( C, D, NR, NC, E )
        END DO
        call cpu_time(FINISH)
        WRITE(*,'(1X,A,F12.8)') 'finish time:', FINISH
        WRITE(*,'(1X,A,F12.8,A)') 'Run time = ', (finish-start)/N, ' seconds.'
        WRITE(*,'(1X,A18,F12.8/)') 'Relative run time:', (finish-start)/BEST

        !~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        !--- MATDOTMULT( A, B )
        WRITE(*,'(1X,A)') 'MatDOTMult'
        WRITE(*,*) REPEAT('~',30)

        !--- COPY INITIAL MATRIX A[ , ]
        C(:,:) = A(:,:)
        !--- COPY INITIAL MATRIX B[ , ]
        D(:,:) = B(:,:)
        call cpu_time(START)
        WRITE(*,'(1X,A,F12.8)') 'start time:', START
        DO K = 1, N
            !--- CODED MATRIX MULTIPLY
            E(:,:) = 0.D0
            CALL MatDotMult( C, D, NR, NC, E )
        END DO
        call cpu_time(FINISH)
        WRITE(*,'(1X,A,F12.8)') 'finish time:', FINISH
        WRITE(*,'(1X,A,F12.8,A)') 'Run time = ', (finish-start)/N, ' seconds.'
        WRITE(*,'(1X,A18,F12.8/)') 'Relative run time:', (finish-start)/BEST

        !~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        !--- MATMULT( A, B )
        WRITE(*,'(1X,A)') 'MatMult'
        WRITE(*,*) REPEAT('~',30)
        !--- COPY INITIAL MATRIX A[ , ]
        C(:,:) = A(:,:)
        !--- COPY INITIAL MATRIX B[ , ]
        D(:,:) = B(:,:)
        call cpu_time(START)
        WRITE(*,'(1X,A,F12.8)') 'start time:', START
        DO K = 1, N
            !--- CODED MATRIX MULTIPLY
            E(:,:) = 0.D0
            CALL MatMult( C, D, NR, NC, E )
        END DO
        call cpu_time(FINISH)
        WRITE(*,'(1X,A,F12.8)') 'finish time:', FINISH
        WRITE(*,'(1X,A,F12.8,A)') 'Run time = ', (finish-start)/N, ' seconds.'
        WRITE(*,'(1X,A18,F12.8/)') 'Relative run time:', (finish-start)/BEST

        !~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        !--- MATNMULT( A, B )
        WRITE(*,'(1X,A)') 'MatNMult'
        WRITE(*,*) REPEAT('~',30)
        !--- COPY INITIAL MATRIX A[ , ]
        C(:,:) = A(:,:)
        !--- COPY INITIAL MATRIX B[ , ]
        D(:,:) = B(:,:)
        call cpu_time(START)
        WRITE(*,'(1X,A,F12.8)') 'start time:', START
        DO K = 1, N
            !--- CODED MATRIX MULTIPLY
            E(:,:) = 0.D0
            CALL MatNMult( C, D, NR, NC, E )
        END DO
        call cpu_time(FINISH)
        WRITE(*,'(1X,A,F12.8)') 'finish time:', FINISH
        WRITE(*,'(1X,A,F12.8,A)') 'Run time = ', (finish-start)/N, ' seconds.'
        WRITE(*,'(1X,A18,F12.8/)') 'Relative run time:', (finish-start)/BEST

        !~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        WRITE(*,*) REPEAT('~',30)
        WRITE(*,'(A//)') 'End of Run...'

        DEALLOCATE ( A, B, C, D, E )

        END PROGRAM MAIN

        !****************************************************************
        !       MATRIX MULTIPLY A x B SUBROUTINES
        !
        !****************************************************************
        SUBROUTINE MMult(A,B,M,N,C)
        !--- INTRINSIC MATRIX MULTIPLY
        !****************************************************************
        IMPLICIT NONE
        INTEGER*4, INTENT(IN)   :: M, N
        REAL*8, INTENT(IN)      :: A(M,N), B(N,M)
        REAL*8, INTENT(OUT)     :: C(M,M)
        REAL*8 :: D
        ALLOCATABLE :: D(:,:)
        ALLOCATE ( D(M,M) )
        D(:,:) = 0.0D0
        D(:,:) = MATMUL( A, B )
        !--- COPY ZERO MATRIX D[] TO C[]
        C(:,:) = D(:,:)
        DEALLOCATE ( D )
        RETURN
        END SUBROUTINE MMult

        !****************************************************************
        SUBROUTINE MatMult(A,B,M,N,C)
        !--- Square matrix where m = n
        !--- A[m,n] x B[n,m] = C[m,m] using index order I, J, and K.
        !****************************************************************
        IMPLICIT NONE
        INTEGER*4, INTENT(IN)   :: M, N
        REAL*8, INTENT(IN)      :: A(M,N), B(N,M)
        REAL*8, INTENT(OUT)     :: C(M,M)
        REAL*8 :: D
        INTEGER*4 :: I, J, K
        ALLOCATABLE :: D(:,:)
        ALLOCATE ( D(M,M) )
        D(:,:) = 0.0D0
        DO I = 1, M
            DO J = 1, M
                DO K = 1, N
                   D(I,J) = D(I,J) + A(I,K)*B(K,J)
                END DO
            END DO
        END DO
        !--- COPY ZERO MATRIX D[] TO C[]
        C(:,:) = D(:,:)
        DEALLOCATE ( D )
        RETURN
        END SUBROUTINE MATMult

        !****************************************************************
        SUBROUTINE MatNMult(A,B,M,N,C)
        !--- Multiiplies A = B*C using implicit K index in 2nd dimension
        !****************************************************************
        IMPLICIT NONE
        INTEGER*4, INTENT(IN)   :: M, N
        REAL*8, INTENT(IN)      :: A(M,N), B(N,M)
        REAL*8, INTENT(OUT)     :: C(M,M)
        REAL*8 :: D
        INTEGER*4 :: I, J
        ALLOCATABLE :: D(:,:)
        ALLOCATE ( D(M,M) )
        D(:,:) = 0.0D0
        DO I = 1, M
            DO J = 1, N
                D(I,1:M) = D(I,1:M) + A(I,J)*B(J,1:M)
            END DO
        END DO
        !--- COPY ZERO MATRIX D[] TO C[]
        C(:,:) = D(:,:)
        DEALLOCATE ( D )
        RETURN
        END SUBROUTINE MatNMult

        !****************************************************************
        SUBROUTINE MatMMult(A,B,M,N,C)
        !--- Multiiplies A = B*C using implicit K index in 1st dimension
        !****************************************************************
        IMPLICIT NONE
        INTEGER*4, INTENT(IN)   :: M, N
        REAL*8, INTENT(IN)      :: A(M,N), B(N,M)
        REAL*8, INTENT(OUT)     :: C(M,M)
        REAL*8 :: D
        INTEGER*4 :: I, J
        ALLOCATABLE :: D(:,:)
        ALLOCATE ( D(M,M) )
        D(:,:) = 0.0D0
        DO I = 1, M
            DO J = 1, N
                D(1:M,I) = D(1:M,I) + A(1:M,J)*B(J,I)
            END DO
        END DO
        !--- COPY ZERO MATRIX D[] TO C[]
        C(:,:) = D(:,:)
        DEALLOCATE ( D )
        RETURN
        END SUBROUTINE MatMMult

        !****************************************************************
        SUBROUTINE MatDotMult(A,B,M,N,C)
        !****************************************************************
        IMPLICIT NONE
        INTEGER*4, INTENT(IN)   :: M, N
        REAL*8, INTENT(IN)      :: A(M,N), B(N,M)
        REAL*8, INTENT(OUT)     :: C(M,M)
        REAL*8 :: D
        INTEGER*4 :: I, J
        ALLOCATABLE :: D(:,:)
        ALLOCATE ( D(M,M) )
        D(:,:) = 0.0D0
        DO I = 1, M
            DO J = 1, M
                D(I,J) = DOT_PRODUCT( A(I,1:N), B(1:N,J) )
            END DO
        END DO
        !--- COPY ZERO MATRIX D[] TO C[]
        C(:,:) = D(:,:)
        DEALLOCATE ( D )
        RETURN
        END SUBROUTINE MatDotMult
        !****************************************************************
        !       END OF MATRIX MULTIPLY SUBROUTINES
        !****************************************************************

Re: Matrix Multiply Speed Test

Frank,

Thank you for the interesting program!  I'll have to try it myself.  Our runtime library that is currently included is not particularly optimized to ensure portability, though we're looking at changing that.

You might get an additional speed boost by specifying the compiler flag

-fexternal-blas

and enabling LAPACK and BLAS in Project Options.  However, I'm not sure the gains would be particularly great.

Jeff Armstrong
Approximatrix, LLC