The regular JMP
and conditional Jcc
jump instructions change the course of
working code, the latter based on the runtime status of certain bits in the RFLAGS
register. The x86 and x86-64 processors pipeline their
instructions: they prefetch a certain number of instructions and begin evaluating
them ahead of time. The number of instructions prefetched depends on the size of the
prefetch input queue (PIQ).
The processor predicts the branch it has to
take and executes instructions along that branch. Sometimes the prediction is
correct, and sometimes it is not. Whenever the prediction fails, the processor
has to re-evaluate the instructions on the other branch. This slows down the
processing by a few clock cycles. This penalty can be avoided by replacing the
jump instructions in some cases with set byte on condition SETcc
type instructions.
The SETcc
instructions set the value in the register or memory operand to either
0
or 1
based on the values in the ZF
, OF
, SF
, CF
and PF
bits in the RFLAGS
register. They can be used after CMP
instructions, because CMP
sets the aforementioned bits in the RFLAGS
register. They can also be used to check overflow caused in arithmetic operations.
A jump instruction leads to smaller machine code, but if the processor
predicts the wrong branch direction, it can lead to slower execution. On
the other hand, using SETcc
type instructions might lead to more machine code, but
at least the processor can pipeline most of the instructions and avoid processing
time delays due to branch misprediction. The branch instructions are intuitively
easier to understand and use for a programmer and to imitate the branch
instructions using SETcc
type instructions is not a trivial task.
Below is a program, called setcc.asm
, that demonstrates how to select the greater
of two long integers without using a conditional jump.
section .rodata
; NUL-terminated C strings that main passes to printf and scanf.
prompt1:        db      "Enter a number:", 0
prompt2:        db      "Enter another number:", 0
prompt3:        db      "The greater of %ld and %ld is %ld.", 10, 0     ; 10 = line feed
num_format:     db      "%ld", 0                                        ; one signed long
section .text
global main
extern printf, scanf

;-----------------------------------------------------------------------
; int main(void)
; Prompts for two signed 64-bit integers and prints the greater one.
; The maximum is selected with SETG and a bit mask instead of a
; conditional jump.
;
; ABI:    System V AMD64
; Out:    eax = 0 on success, 1 if either scanf call failed to convert
;         exactly one number
; Locals: [rbp-8]  = input1
;         [rbp-16] = input2
; Only caller-saved registers (rax, rcx, rdx, rsi, rdi) are used as
; scratch, so nothing besides RBP has to be saved and restored.
;-----------------------------------------------------------------------
main:
        ; prologue: RSP is 16-byte aligned after pushing RBP, and the
        ; 16 bytes of locals keep it aligned at every call below
        push    rbp
        mov     rbp, rsp
        sub     rsp, 16

        ; read input1
        lea     rdi, [rel prompt1]      ; RIP-relative, so the code is position-independent
        xor     eax, eax                ; AL = 0: no vector registers used by this variadic call
        call    printf
        lea     rsi, [rbp-8]            ; scanf stores input1 here
        lea     rdi, [rel num_format]
        xor     eax, eax
        call    scanf
        cmp     eax, 1                  ; scanf returns the number of converted items
        jne     .bad_input

        ; read input2
        lea     rdi, [rel prompt2]
        xor     eax, eax
        call    printf
        lea     rsi, [rbp-16]           ; scanf stores input2 here
        lea     rdi, [rel num_format]
        xor     eax, eax
        call    scanf
        cmp     eax, 1
        jne     .bad_input

        ; branch-free max(input1, input2)
        mov     rsi, [rbp-8]            ; RSI = input1 (2nd printf argument)
        mov     rdx, [rbp-16]           ; RDX = input2 (3rd printf argument)
        xor     eax, eax                ; clear RAX before CMP: SETG writes only AL, and XOR would destroy the flags
        cmp     rdx, rsi                ; signed compare: is input2 > input1 ?
        setg    al                      ; RAX = input2 > input1 ? 1 : 0
        neg     rax                     ; RAX = input2 > input1 ? 0xFFFFFFFFFFFFFFFF : 0 (mask)
        mov     rcx, rdx
        and     rcx, rax                ; RCX = input2 > input1 ? input2 : 0
        not     rax                     ; RAX = input2 > input1 ? 0 : 0xFFFFFFFFFFFFFFFF
        and     rax, rsi                ; RAX = input2 > input1 ? 0 : input1
        or      rcx, rax                ; RCX = greater value (4th printf argument)

        ; print the result
        lea     rdi, [rel prompt3]
        xor     eax, eax
        call    printf

        ; epilogue: report success rather than leaking printf's return value
        xor     eax, eax
        leave
        ret

.bad_input:
        ; a number could not be read; exit with a failure status
        mov     eax, 1
        leave
        ret
To assemble and link the above program, run the following:
$ yasm -f elf64 setcc.asm
$ ld -m elf_x86_64 -dynamic-linker /lib64/ld-linux-x86-64.so.2 \
/usr/lib/x86_64-linux-gnu/crt1.o /usr/lib/x86_64-linux-gnu/crti.o \
setcc.o /usr/lib/x86_64-linux-gnu/crtn.o -lc -o setcc.out
Download setcc.asm.