Square of Hilbert's Matrix


The aim of solving this problem is to learn how to use the XMM registers for multiplication of floating point numbers. Matrix multiplication is a slow calculation especially if the floating point unit is used, and hence doing packed floating point calculations (if double precision is not required) might just be much faster. So this program will test that.

The Hilbert's matrix is a simple matrix whose each element B(i, j) = 1/(i+j-1) . For more details refer the Wikipedia link .

The Hilbert's matrix is a square matrix, so first we will take the size of the matrix as input from the user and then allocate the memory block for the matrix and store the elements of the matrix. Then, we shall calculate the square of the matrix and print it on screen. The floating point precision level that will be used is single or 4 bytes per number.

%macro prologue 0
        push    rbp
        mov     rbp,rsp
        push    rbx
        push    r12
        push    r13
        push    r14
        push    r15
%endmacro
%macro epilogue 0
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     rbx
        leave
        ret
%endmacro
section .rodata
    prompt1 db  "Enter the size of the Hilbert matrix:",0
    prompt2 db  "The square of the matrix is:",10,0 
    int_prompt db "%d",0
    float_prompt db "%f",10,0
section .text
    global  main, create_hilbert_matrix, print_float, square_float_matrix, print_float_matrix
    extern  printf, scanf, malloc, free
    extern  print_int,print_nl
    main:
        push    rbp
        mov     rbp, rsp
        sub     rsp, 4
        push    rbx
        push    r12
        push    r13
        push    r14
        push    r15
        ; Enter the size of the matrix
        xor     rax, rax
        mov     rdi, dword prompt1
        call    printf
        xor     rax, rax
        lea     rsi, [rbp-4]
        mov     rdi, dword int_prompt
        call    scanf

        ; Create a Hilbert Matrix
        xor     rax, rax
        xor     rdi, rdi
        mov     edi, [rbp-4]
        call    create_hilbert_matrix
        mov     rbx, rax              ; store a pointer to the Hilbert matrix      
        ; Calculate the Square of a floating point matrix
        ; Arguments are the size of the matrix and the pointer to the input matrix
        ; Return value is the pointer to the square of the input matrix
        mov     rsi, rax
        mov     edi, [rbp-4]
        call    square_float_matrix
        mov     r12, rax              ; store a pointer to the square of the Hilbert matrix
        
        ; Print the Hilbert Matrix and its square on the screen.
        mov     rsi, rbx
        mov     edi, [rbp-4]
        call    print_float_matrix
        mov     rsi, r12
        mov     edi, [rbp-4]
        call    print_float_matrix

        ; End the program and free memory
        mov     rdi, rbx
        xor     rax, rax
        call    free
        mov     rdi, r12
        xor     rax, rax
        call    free
        epilogue

    create_hilbert_matrix:
        prologue
        ; size of the matrix is in RDI
        mov     r15, rdi
        imul    rdi, rdi
        imul    rdi, 4          ; sizeof(float)
        ; size of the memory to be allocated in RDI now
        xor     rax, rax
        call    malloc
        ; store a copy of the memory location pointing to the matrix.
        mov     rbx, rax
        mov     r12, rax
        xor     rcx, rcx
outer_loop_start:
        cmp     rcx, r15
        jge     outer_loop_end
        xor     rdx, rdx
inner_loop_start:
        cmp     rdx, r15
        jge     inner_loop_end

        ; each element is RCX+RDX+1
        mov     rdi, 1
        add     rdi, rcx
        adc     rdi, rdx
      
        ; convert the integer in RDI to a single precision floating point in XMM0
        cvtsi2ss xmm0, edi
        ; calculate the reciprocal of the single precision floating point in XMM0 and place it in XMM1
        rcpss   xmm1, xmm0
        ; place the value in XMM1 in memory allocated for the matrix
        movd    dword[r12], xmm1
        add     r12, 4          ; sizeof(float) 

        inc     rdx
        jmp     inner_loop_start
inner_loop_end:
        inc     rcx
        jmp     outer_loop_start
outer_loop_end:
        mov     rax, rbx
        epilogue

square_float_matrix:
         prologue
        ; the Hilbert matrix pointer is in RSI
        mov     r14, rsi
        ; size of the matrix is in RDI
        mov     r15, rdi
        imul    rdi, rdi
        imul    rdi, 4          ; sizeof(float)
        ; size of the memory to be allocated in RDI now
        xor     rax, rax
        call    malloc
        ; store a copy of the memory location pointing to the matrix.
        mov     rbx, rax
        mov     r12, rax
        ; start squaring the matrix
        xor     rcx, rcx
forloop_1:
        cmp     rcx, r15
        jge     end_forloop_1
        xor     rsi, rsi
forloop_2:
        cmp     rsi, r15
        jge     end_forloop_2
        xor     rdi, rdi
        ; B[i][j] = 0.0 in XMM0
        cvtsi2ss xmm0, rdi
forloop_3:
        cmp     rdi, r15
        jge     end_forloop_3
        ; B[i][j]+= A[i][k]*A[k][j]
        mov     rax, r15
        imul    rax, rcx
        add     rax, rdi
        mov     r8, r15
        imul    r8, rdi
        add     r8, rsi
        movd    xmm1, dword[r14 + rax*4]
        movd    xmm2, dword[r14 + r8*4]
        mulss   xmm1, xmm2
        addss   xmm0, xmm1
        inc     rdi
        jmp     forloop_3
end_forloop_3:
        ; B[i][j] = XMM0
        movd    dword [r12], xmm0
        add     r12, 4 ; sizeof(float)
        inc     rsi
        jmp     forloop_2
end_forloop_2:
        inc     rcx
        jmp     forloop_1
end_forloop_1:
        mov     rax, rbx
        epilogue


print_float_matrix:
        prologue
        ; size of the matrix is in RDI and pointer is in RSI
        mov     r15, rdi
        mov     r14, rsi
        xor     r12, r12
floop_1:
        cmp     r12, r15
        jge     end_floop_1
        xor     r13, r13
floop_2:
        cmp     r13, r15
        jge     end_floop_2
        ; Loading value to be printed in XMM0
        movd    xmm0, dword[r14]
        add     r14,4
        call    print_float
        inc     r13
        jmp     floop_2
end_floop_2:
        call    print_nl
        inc     r12
        jmp     floop_1
end_floop_1:
        xor     rax, rax
        epilogue

print_float:
        prologue
        ; The float is in XMM0
        mov     rdi, dword float_prompt
        mov     rax,1
        call    printf
        epilogue
				



Tweet


Follow @_vicash_