Square of Hilbert's Matrix

The aim of solving this problem is to learn how to use the `XMM` registers for multiplication of floating point numbers. Matrix multiplication is a slow calculation especially if the floating point unit is used, and hence doing packed floating point calculations (if double precision is not required) might just be much faster. So this program will test that.

The Hilbert's matrix is a simple matrix whose each element ```B(i, j) = 1/(i+j-1)``` . For more details refer the Wikipedia link .

The Hilbert's matrix is a square matrix, so first we will take the size of the matrix as input from the user and then allocate the memory block for the matrix and store the elements of the matrix. Then, we shall calculate the square of the matrix and print it on screen. The floating point precision level that will be used is single or 4 bytes per number.

```%macro prologue 0
push    rbp
mov     rbp,rsp
push    rbx
push    r12
push    r13
push    r14
push    r15
%endmacro
%macro epilogue 0
pop     r15
pop     r14
pop     r13
pop     r12
pop     rbx
leave
ret
%endmacro
section .rodata
prompt1 db  "Enter the size of the Hilbert matrix:",0
prompt2 db  "The square of the matrix is:",10,0
int_prompt db "%d",0
float_prompt db "%f",10,0
section .text
global  main, create_hilbert_matrix, print_float, square_float_matrix, print_float_matrix
extern  printf, scanf, malloc, free
extern  print_int,print_nl
main:
push    rbp
mov     rbp, rsp
sub     rsp, 4
push    rbx
push    r12
push    r13
push    r14
push    r15
; Enter the size of the matrix
xor     rax, rax
mov     rdi, dword prompt1
call    printf
xor     rax, rax
lea     rsi, [rbp-4]
mov     rdi, dword int_prompt
call    scanf

; Create a Hilbert Matrix
xor     rax, rax
xor     rdi, rdi
mov     edi, [rbp-4]
call    create_hilbert_matrix
mov     rbx, rax              ; store a pointer to the Hilbert matrix
; Calculate the Square of a floating point matrix
; Arguments are the size of the matrix and the pointer to the input matrix
; Return value is the pointer to the square of the input matrix
mov     rsi, rax
mov     edi, [rbp-4]
call    square_float_matrix
mov     r12, rax              ; store a pointer to the square of the Hilbert matrix

; Print the Hilbert Matrix and its square on the screen.
mov     rsi, rbx
mov     edi, [rbp-4]
call    print_float_matrix
mov     rsi, r12
mov     edi, [rbp-4]
call    print_float_matrix

; End the program and free memory
mov     rdi, rbx
xor     rax, rax
call    free
mov     rdi, r12
xor     rax, rax
call    free
epilogue

create_hilbert_matrix:
prologue
; size of the matrix is in RDI
mov     r15, rdi
imul    rdi, rdi
imul    rdi, 4          ; sizeof(float)
; size of the memory to be allocated in RDI now
xor     rax, rax
call    malloc
; store a copy of the memory location pointing to the matrix.
mov     rbx, rax
mov     r12, rax
xor     rcx, rcx
outer_loop_start:
cmp     rcx, r15
jge     outer_loop_end
xor     rdx, rdx
inner_loop_start:
cmp     rdx, r15
jge     inner_loop_end

; each element is RCX+RDX+1
mov     rdi, 1

; convert the integer in RDI to a single precision floating point in XMM0
cvtsi2ss xmm0, edi
; calculate the reciprocal of the single precision floating point in XMM0 and place it in XMM1
rcpss   xmm1, xmm0
; place the value in XMM1 in memory allocated for the matrix
movd    dword[r12], xmm1
add     r12, 4          ; sizeof(float)

inc     rdx
jmp     inner_loop_start
inner_loop_end:
inc     rcx
jmp     outer_loop_start
outer_loop_end:
mov     rax, rbx
epilogue

square_float_matrix:
prologue
; the Hilbert matrix pointer is in RSI
mov     r14, rsi
; size of the matrix is in RDI
mov     r15, rdi
imul    rdi, rdi
imul    rdi, 4          ; sizeof(float)
; size of the memory to be allocated in RDI now
xor     rax, rax
call    malloc
; store a copy of the memory location pointing to the matrix.
mov     rbx, rax
mov     r12, rax
; start squaring the matrix
xor     rcx, rcx
forloop_1:
cmp     rcx, r15
jge     end_forloop_1
xor     rsi, rsi
forloop_2:
cmp     rsi, r15
jge     end_forloop_2
xor     rdi, rdi
; B[i][j] = 0.0 in XMM0
cvtsi2ss xmm0, rdi
forloop_3:
cmp     rdi, r15
jge     end_forloop_3
; B[i][j]+= A[i][k]*A[k][j]
mov     rax, r15
imul    rax, rcx
mov     r8, r15
imul    r8, rdi
movd    xmm1, dword[r14 + rax*4]
movd    xmm2, dword[r14 + r8*4]
mulss   xmm1, xmm2
inc     rdi
jmp     forloop_3
end_forloop_3:
; B[i][j] = XMM0
movd    dword [r12], xmm0
add     r12, 4 ; sizeof(float)
inc     rsi
jmp     forloop_2
end_forloop_2:
inc     rcx
jmp     forloop_1
end_forloop_1:
mov     rax, rbx
epilogue

print_float_matrix:
prologue
; size of the matrix is in RDI and pointer is in RSI
mov     r15, rdi
mov     r14, rsi
xor     r12, r12
floop_1:
cmp     r12, r15
jge     end_floop_1
xor     r13, r13
floop_2:
cmp     r13, r15
jge     end_floop_2
; Loading value to be printed in XMM0
movd    xmm0, dword[r14]
call    print_float
inc     r13
jmp     floop_2
end_floop_2:
call    print_nl
inc     r12
jmp     floop_1
end_floop_1:
xor     rax, rax
epilogue

print_float:
prologue
; The float is in XMM0
mov     rdi, dword float_prompt
mov     rax,1
call    printf
epilogue
``` 