Related
I am trying to link 2 files. There are 3 symbols (Assembly procedures) which I am trying to link. It worked perfectly with static linking, but when trying to do it dynamically - I receive an error.
/usr/bin/ld: warning: type and size of dynamic symbol `parse_intro' are not defined
/usr/bin/ld: warning: type and size of dynamic symbol `time_to_print' are not defined
/usr/bin/ld: warning: type and size of dynamic symbol `optimizing' are not defined
First, I am making a file into a shared library, from which I want to export 3 symbols. Code of this file:
# Data and exported symbols of the hex-printing library.
.data
premsg:
.ascii "0x" #prefix written before the digits so the number reads as hexadecimal
msg: #output buffer; time_to_print stores the ASCII digits here and then writes them out
.space 16 #16 bytes: one ASCII character for each of the 16 hex digits of a 64-bit value
nextline:
.ascii "\n" #newline character (not referenced by the routines shown below)
var1:
.quad 0x00000000000ef12b #sample number (not referenced by the routines shown below)
.global parse_intro
.global optimizing
.global time_to_print
.text
# parse_intro: convert the low %rcx hex digits of %r8 into packed ASCII characters.
# In:  %r8 = number to convert, %rcx = digit count (optimizing sets it to 16 or 8).
# Out: ASCII digits packed into %r12 (and %r14 once %r12 has filled up), with the
#      least significant digit in the lowest byte.
# Overwrites %rax, %rbx, %rcx, %rdx (through mul), %r8, %r9, %r12 and %r14.
parse_intro:
#binary mask that keeps only the lowest 4 bits (one hex digit) of the 8 byte number
mov $0b0000000000000000000000000000000000000000000000000000000000001111, %r9
mov $0x1, %rbx #position multiplier: starts at 1 and is shifted left by 4 after every digit
mov $0x0, %r12 #accumulator for the ASCII characters produced so far
mov $0x0, %r14 #receives the first 8 characters once %r12 is full
jmp parse_start
parse_start: #loop body: turns one hex digit into one ASCII byte per pass
mov %r8, %rax #copy the number before shifting it
shr $4, %r8 #drop the lowest hex digit (4 bits) so the next pass sees the next digit
and %r9, %rax #keep only the lowest 4 bits of the copy
cmp $0xa, %rax #is the digit below 0xa?
jl zero_to_9 #digits 0-9 take the +0x30 path
add $0x57, %rax #digits a-f: 0xa + 0x57 = 0x61, the ASCII code of 'a'
mul %rbx
mul %rbx #multiplying by %rbx twice scales by 0x100 per digit, i.e. one byte per ASCII character
add %rax, %r12 #place the character into its byte of the accumulator
shl $4, %rbx #multiply %rbx by 0x10 for the next digit
jmp check_for_overflow #see whether %r12 is full now
zero_to_9: #same as above for digits 0-9
add $0x30, %rax #0 + 0x30 = 0x30, the ASCII code of '0'
mul %rbx
mul %rbx #multiplying by %rbx twice scales by 0x100 per digit, i.e. one byte per ASCII character
add %rax, %r12 #place the character into its byte of the accumulator
shl $4, %rbx #multiply %rbx by 0x10 for the next digit
jmp check_for_overflow #see whether %r12 is full now
check_for_overflow: #8 hex digits become 8 ASCII bytes, which fill %r12 completely; the next 8 need a second register
mov $0x1000000000000000, %rax #threshold: %r12 only exceeds it once its top byte holds a character
cmp %rax, %r12 #has the top byte of %r12 been written?
jle looping #no: carry on with the next digit
cmp $0x1, %rcx #was this the last digit?
je finishing #yes: return without moving %r12 to %r14 again
mov %r12, %r14 #keep the first 8 characters in %r14
xor %r12, %r12 #start a fresh accumulator
mov $0x1, %rbx #restart the position multiplier at the lowest byte
jmp looping #carry on with the next digit
looping:
loop parse_start #decrement %rcx and repeat while it is non-zero
retq #all digits converted: return to the caller
# optimizing: choose how many hex digits parse_intro has to convert.
# In:  %r8 = number.
# Out: %rcx = 16, or 8 when the upper 32 bits of %r8 are all zero.
# Overwrites %rax and %rbx.
optimizing:
#binary mask selecting the upper 4 bytes of the number
mov $0b1111111111111111111111111111111100000000000000000000000000000000, %rbx
mov $16, %rcx #digit counter, assuming all 16 hex digits are needed
movq %r8, %rax #copy of the number to test
andq %rbx, %rax #keep only the upper 32 bits
cmp $0, %rax #are the upper 32 bits all zero?
jne finishing #no: keep the counter at 16
subq $8, %rcx #yes: only the lower 8 hex digits need converting
retq #return to the caller
finishing:
retq #shared return point; the overflow check in parse_intro also jumps here
# time_to_print: byte-reverse the packed ASCII digits in %r12/%r14, store them in msg
# and write the 16 bytes of msg to stdout.
# Overwrites %rax, %rbx, %rcx, %rdx, %rsi, %rdi, %r9, %r12, %r14 and, through byte_fun, %rbp;
# the syscall instruction additionally clobbers %rcx and %r11.
time_to_print:
mov $0b1111111100000000000000000000000000000000000000000000000000000000, %r9 #mask selecting the most significant byte, used by byte_reverse
cmp $0, %r14 #%r14 is still zero when parse_intro only converted 8 digits
je skipped_bytes #then skipped_bytes prints "0x", moves the digits to %r14 and fills %r12 with ASCII zeroes
mov $56, %rcx #starting shift count for byte_fun; it steps down by 8 until it reaches 0
call byte_fun #byte-reverse %r12, result in %rax
mov %r14, %r12 #next value to reverse
mov %rax, %r14 #keep the first reversed half in %r14
mov $56, %rcx #same starting shift count for the second call
call byte_fun #byte-reverse the second half
mov %rax, %r12 #reversed second half
lea msg(%rip), %rax #%rax = address of msg
mov %r14, (%rax) #first 8 characters of msg
add $8, %rax #advance to the second 8 bytes of msg
mov %r12, (%rax) #last 8 characters of msg
mov $1, %rax #syscall number 1 = write
mov $1, %rdi #file descriptor 1 = stdout
lea msg(%rip), %rsi #position-independent address of the buffer to write
mov $16, %rdx #length: 16 bytes, one per hex digit (no newline is written)
syscall #write the digits
retq #return to the caller
# byte_fun: return in %rax the value of %r12 with its 8 bytes in reverse order.
# In:  %r12 = value, %rcx = 56, %r9 = mask selecting the most significant byte.
# Overwrites %rbx, %rcx, %r12 and %rbp (used to hold the return address).
byte_fun: #entry point
pop %rbp #take the return address off the stack so the loops below can use the stack freely
jmp byte_reverse #continue with the splitting loop
byte_reverse: #each pass isolates the current top byte of %r12, shifts it to its mirrored position and pushes it
mov %r12, %rax #copy of the remaining value
shl $8, %r12 #bring the next byte to the top for the following pass
and %r9, %rax #keep only the most significant byte (8 bits, i.e. one ASCII character)
shr %cl, %rax #shift it right by the current count to its mirrored position
push %rax #save this piece on the stack
sub $8, %rcx #the next piece is shifted 8 bits less
jne byte_reverse #repeat until the count reaches zero (7 passes when starting at 56)
push %r12 #after the shifts the remaining byte already sits in the top position; push it as the 8th piece
mov $8, %rcx #8 pieces to add together in byte_back
xor %rax, %rax #clear the accumulator
jmp byte_back #continue with the summing loop
byte_back: #adds the 8 pushed pieces into %rax
pop %rbx #next piece
add %rbx, %rax #merge it into the result
loop byte_back #decrement %rcx and repeat until all 8 pieces are added
push %rbp #put the saved return address back on top of the stack
retq #return to the caller
skipped_bytes: #reached from time_to_print when %r14 is zero, i.e. only 8 digits were parsed
#the "0x" prefix is printed here rather than in _start so this file can be used as a library
mov $1, %rax #syscall number 1 = write
mov $1, %rdi #file descriptor 1 = stdout
lea premsg(%rip), %rsi #address of the "0x" prefix
mov $2, %rdx #length: 2 bytes
syscall #write "0x"
mov %r12, %r14 #move the 8 parsed characters into %r14
mov $0x3030303030303030, %r12 #fill %r12 with eight ASCII '0' characters
jmp time_to_print #re-enter time_to_print with both halves populated
I make this program into .so file by using a command:
gcc printing.s -shared -o libprint.so
Then, I use these 3 procedures from this .so (parse_intro, time_to_print, and optimizing)in the main file, which has this code
.data
linked_space: #storage for the list nodes; the code treats each node as 16 bytes (8-byte value, then 8-byte link)
.space 0x3000
list_head: #holds the address of the current head node; written by list_initialization
.quad 0x0
.quad 0x0
opening_bracket: #"[" printed before a node's value
.ascii "["
straight_line: #"|" printed between a node's value and its link
.ascii "|"
closing_part: #"] -> " printed after a node's link (5 bytes)
.ascii "] -> "
last_part: #text printed for the first (empty) node; ends the output line (13 bytes)
.ascii "[empty|node]\n"
cut_error_text: #message written by cut_error (29 bytes)
.ascii "Error: can't cut a core node\n"
.global _start
.text
# add_head: append a new node after the current head and make it the new head.
# In:  %rbx = value to store in the new node.
# Out: list_head advanced by one node (16 bytes).
# Overwrites %rax.
# Node layout: 8-byte value at offset 0, 8-byte link to the previous node at offset 8.
# NOTE: there is no check that the new node still fits inside linked_space.
add_head:
        mov     list_head(%rip), %rax   # rax = current head node
        mov     %rbx, 16(%rax)          # value field of the new node (16 bytes past the old head)
        mov     %rax, 24(%rax)          # link field of the new node = old head
        addq    $16, list_head(%rip)    # explicit 64-bit add: list_head is a pointer, so the
                                        # operand size must not be left to the assembler default
        retq
# cut_head: drop the head node by moving list_head back one node (16 bytes).
# The first (core) node, whose link field is zero, is never removed; an error
# message is written to stdout instead.
# Overwrites %rax; the error path also overwrites %rdi, %rsi, %rdx and the
# registers clobbered by syscall (%rcx, %r11).
cut_head:
        mov     list_head(%rip), %rax   # rax = current head node
        cmpq    $0, 8(%rax)             # explicit 64-bit compare of the whole link field
        je      cut_error               # zero link: this is the core node, refuse to cut it
        sub     $16, %rax               # rax = previous node
        mov     %rax, list_head(%rip)   # previous node becomes the head
        retq
cut_error:                              # report the refusal and return to cut_head's caller
        mov     $1, %rax                # syscall number 1 = write
        mov     $1, %rdi                # file descriptor 1 = stdout
        lea     cut_error_text(%rip), %rsi
        mov     $29, %rdx               # length of cut_error_text in bytes
        syscall
        retq
# pre_print: print the whole list, from the head node back to the first (empty) node.
# The address of the node being printed is kept on the stack, on top of the return
# address, because the printing routines overwrite %rbx.
pre_print: #entry point: pushes the head node address for the loop below
push list_head(%rip)
jmp print_node #enter the loop
print_node: #prints one node as "[value|link] -> " using the three routines of the printing library
pop %rbx #node address saved on the stack
push %rbx #keep a copy on the stack because %rbx is modified below
add $8, %rbx #address of the node's link field
mov (%rbx), %rax #link to the previous node
cmp $0, %rax #a zero link marks the first (empty) node
je return_printing #reached the first node: print the closing text and return
mov $1, %rax #write(stdout, "[", 1)
mov $1, %rdi
lea opening_bracket(%rip), %rsi
mov $1, %rdx
syscall
pop %rbx #node address, which is also the address of its value field
mov (%rbx), %r8 #%r8 = node value, the input of the printing routines
push %rbx #save the node address again; the printing routines overwrite %rbx
call optimizing #the three calls together print %r8 in hexadecimal
call parse_intro
call time_to_print
mov $1, %rax #write(stdout, "|", 1)
mov $1, %rdi
lea straight_line(%rip), %rsi
mov $1, %rdx
syscall
pop %rbx #node address from the stack
push %rbx #keep the copy on the stack before modifying %rbx
add $8, %rbx #address of the node's link field
mov (%rbx), %r8 #%r8 = link to the previous node, printed as a number
call optimizing #print the link in hexadecimal
call parse_intro
call time_to_print
mov $1, %rax #write(stdout, "] -> ", 5)
mov $1, %rdi
lea closing_part(%rip), %rsi
mov $5, %rdx
syscall
pop %rbx #address of the node just printed
sub $16, %rbx #step back one node (16 bytes) to the previous node
push %rbx #store it on the stack for the next pass
jmp print_node #next node
return_printing: #prints the text for the first (empty) node, which ends the line
mov $1, %rax
mov $1, %rdi
lea last_part(%rip), %rsi
mov $13, %rdx
syscall
pop %rbx #discard the saved node address so the return address is on top of the stack again
retq
# list_initialization: make list_head point at the start of linked_space.
# Overwrites %rax.
list_initialization:
lea linked_space(%rip), %rax #address of the first node; linked_space is zero-filled, so its value and link are both 0
mov %rax, list_head(%rip) #the first node is the initial head
retq
# _start: program entry point. Initialises the list and exits with status 0;
# the list operations that exercise the printing library are currently commented out.
_start:
call list_initialization
#call pre_print #Function which prints the linked list
#mov $1, %rbx #%rbx will hold value which will be put inside new linked list node
#call add_head #Function which adds new node as head element
#call pre_print
#mov $2, %rbx
#call add_head
#call pre_print
#mov $3, %rbx
#call add_head
#call pre_print
#call cut_head #Function which cuts head element and moves link to a previous element
#call pre_print
#call cut_head
#call pre_print
#call cut_head
#call pre_print
#call cut_head
#call pre_print
mov $60, %rax #syscall number 60 = exit
xor %rdi, %rdi #exit status 0
syscall
And compile it into the binary file, which should be able to run, by command:
gcc -L /home/*path_to_folder_with_so_file* -g -nostdlib -o output linkedlist.s -lprint
I also tried to compile the same file but commented out 6 lines where the program tries to access files from a dynamically shared library. Here is the "readelf" contents of this file.
https://pastebin.com/WUQz09K2
And readelf of .so file:
https://pastebin.com/1F74euqP
What am I potentially missing, why the original file can't find imported symbols from .so?
Since I can't put a comment as a solution, so, I will put an answer myself, which was given by fuz.
The important thing to do here was to pay attention to the error message (I know, how obvious). Every dynamically linked symbol in GAS assembly, even without C libraries, requires to have a type and size assigned inside the .so file, because GCC can't get this information about exported symbols on its own, and the programmer needs to explicitly give it. For example, for a function foo
foo:
...
ret
You need to give a function a type by putting
.type foo, @function
somewhere in the code, I did it on the next line after the
.global foo
line where I make the same function global, allowing it to be exported.
And, to give a size to the dynamic symbol, you need to put the
.size foo, .-foo
right after the last instruction (like, right after the "jmp" or "ret"). "Dot" is the current address and the foo is the address of the first instruction inside the "foo" function. So, by subtracting an address of "foo" out of the current address you are getting the size of a "foo".
I've written some assembly code on making a counter to count the length of a string.
The string is -123.
I'm having just the one issue:
My negative check (cmp %r15, %r14 / je Negative_counter) is being bypassed even if I have a negative integer
# Attempted conversion of the decimal string S to an integer accumulated in %rax.
.data
S: .string "123" # NUL-terminated input string
Result: .quad # NOTE(review): no value follows .quad — confirm that any storage is reserved here
.text
.globl main
main:
mov $S,%rdx #rdx = address of S, used by the conversion loop
mov $S,%rbx #rbx = address of S, used by the length loop
mov Result, %rax #rax = quadword read from memory at Result (a load, not the address of Result)
mov $10, %r8 #r8 = 10, the decimal base
mov $1, %r11 #build -1 in r11: start from 1
not %r11 #r11 = ~1 = -2
add $1, %r11 #r11 = -1, start value of the character counter
mov $1, %r12 #build -1 in r12 the same way
not %r12 #r12 = -2
add $1, %r12 #r12 = -1, the value r11 is compared against to end the loops
#R[rbx] is used here.
Loop1: #scans the string from its start; r11 ends up as (number of characters - 1)
cmp $0, (%rbx) #compare the data at rbx with 0 (NOTE(review): no operand-size suffix, so more than one byte may be compared — confirm)
je Counter_Made #terminator found
add $1, %r11 #one more character counted
add $1, %rbx #advance to the next character
jmp Loop1 #next character
#Counter of string made -149, would given counter value of 3
#R[rdx] and r14 is used here.
Counter_Made:
cmp $0,%r11 #is the counter zero?
je Output #yes: return without converting anything
mov $S, %r14 #r14 = address of S
sub $7, %r14 #r14 = address of S minus 7 (pointer arithmetic; no character is loaded)
mov $13, %r15
and $15, %r15 #r15 = 13 & 15 = 13
cmp %r15, %r14 #compares the address S-7 with the constant 13
je Negative_counter #taken only if the address S-7 equals 13
jmp Positive_loop
Positive_loop:
cmp %r12,%r11 #has r11 counted down to -1?
je Output #yes: conversion finished
mov (%rdx), %r10 #loads 8 bytes starting at the current character into r10
sub $30,(%rdx) #subtracts decimal 30 from the data in memory at rdx; r10 is not affected (NOTE(review): ASCII '0' is 0x30 = 48)
and $15, %r10 #keep the low 4 bits of r10, which is the digit value of the current character
Positive_inner_loop:
mov %r11, %r9 #r9 = remaining count
cmp $0, %r9 #is this the last digit?
je InnerLoopDone #yes: no scaling
imul %r8, %r10 #r10 *= 10 (runs at most once per digit; nothing jumps back to Positive_inner_loop)
InnerLoopDone:
add %r10,%rax #accumulate into the result
sub $1, %r11 #one digit consumed
mov 1(%rdx), %rdx #loads the 8 bytes at rdx+1 into rdx (NOTE(review): this replaces the pointer with string data instead of advancing it)
jmp Positive_loop
Negative_counter:
add $1,%rdx #skip one character (presumably the '-' sign)
jmp Negative_loop
Negative_loop:
cmp %r12,%r11 #is r11 equal to -1?
je Negative_Complement
jmp Negative_loop #NOTE(review): nothing inside this loop changes r11 or r12, so it cannot end once entered with r11 != r12
Negative_Complement:
not %rdx #bitwise NOT of rdx
add %r14,%rdx #rdx += r14
jmp Output
Output:
ret
I think you're talking about this block of code. I've re-commented it with less useless comments. e.g. move into register 14 doesn't tell you anything you can't tell from the mov $S, %r14 instruction itself. Comments should explain what's going on in the algorithm. Assume that the person reading the comments has a copy of the instruction reference manual available, so only comment on the mechanical details if you're doing something non-obvious. (Like using a flag that's still set from a few instructions ago).
mov $S, %r14 # r14 = pointer to the start of the string
sub $7, %r14 # r14 = pointer to 7 bytes before the beginning of the string
mov $13, %r15
and $15, %r15 # r15 = 13 & 0xF = 13
cmp %r15, %r14 #
je Negative_counter # jump if (S-7) == 13
# jmp Positive_loop # this is totally redundant, you don't need a jmp to jump over the blank line before the next block of code.
Positive_loop:
Clearly S-7 (i.e. &S[-7] in C syntax) is never going to equal 13, because addresses of things in the .data or .rodata section will never be that close to 0 on Linux.
You could have easily seen this with a debugger, by setting a breakpoint or single-stepping until you got to the cmp/je and looking at the contents of those regs.
See the bottom of the x86 tag wiki for a quick explanation of putting gdb into layout reg mode where it shows the register values as you single-step.
There are probably a lot of other things wrong with your code, too, but it's long and I didn't read it all.
I want to print the value in %RCX directly to the console, let's say an ASCII value. I've searched through some wise books and tutorials, but all use buffers to pass anything. Is it possible to print anything without creating special buffer for that purpose?
Let's say I am here (all these answers are far too complicated for me and use a different syntax):
movq $5, %rax
...???(print %rax)
Output on console:
\>5
in example, to print buffer i use code:
# Write buff to stdout through the 32-bit int $0x80 system-call interface
# (eax = call number, ebx/ecx/edx = arguments).
SYSWRITE = 4                            # write() in the int $0x80 call table
STDOUT = 1
EXIT_SUCCESS = 0
.text
buff: .ascii "Anything to print\n"
buff_len = . - buff                     # length computed by the assembler
        movl    $SYSWRITE, %eax         # movl, not movq: %eax is a 32-bit register
        mov     $STDOUT, %ebx           # file descriptor
        mov     $buff, %ecx             # address of the text
        mov     $buff_len, %edx         # number of bytes
        int     $0x80                   # perform the call; without this nothing is printed
NO C CODE OR DIFFERENT ASS SYNTAX ALLOWED!!!
In order to print a register (in hex representation or numeric) the routine (write to stdout, stderr, etc.) expects ASCII characters. Just writing a register will cause the routine to try an display the ascii equivalent of the value in the register. You may get lucky sometimes if each of the bytes in the register happen to fall into the printable character range.
You will need to convert it vis-a-vis routines that convert to decimal or hex. Here is an example of converting a 64 bit register to the hex representation (using intel syntax w/nasm):
section .rodata
hex_xlat: db "0123456789abcdef"
section .text
; register_to_hex: write the 16 hex digits of a 64-bit value into a buffer.
; In:    rdi = value to convert
;        rsi = destination buffer (16 bytes, no terminator is written)
; Out:   rax = start of the buffer
; Clobb: rcx, rdx, rsi, flags (rdi is rotated a full 64 bits, so it ends unchanged)
register_to_hex:
        push    rsi                     ; remember the buffer start for the return value
        lea     rdx, [rel hex_xlat]     ; rdx = digit table (RIP-relative, so PIC-safe)
        mov     ecx, 16                 ; 16 nibbles in a 64-bit value
ALIGN 16
.loop:
        rol     rdi, 4                  ; bring the next most significant nibble to the bottom
        mov     eax, edi                ; 32-bit move zero-extends, so rax is a clean index
        and     eax, 0x0f               ; rax = nibble value 0..15
        mov     al, byte [rdx+rax]      ; al = ASCII hex digit for that nibble
        mov     byte [rsi], al          ; store it in the output buffer
        lea     rsi, [rsi+1]            ; advance the output pointer
        dec     ecx
        jnz     .loop                   ; until all 16 digits are written
        pop     rax                     ; rax = original buffer pointer
        ret
This answer is an addendum to the answer given by Frank, and utilizes the mechanism used there to do the conversion.
You mention the register %RCX in your question. This suggests you are looking at 64-bit code and that your environment is likely GCC/GAS (GNU Assembler) based since % is usually the AT&T style prefix for registers.
With that in mind I've created a quick and dirty macro that can be used inline anywhere you need to print a 64-bit register, 64-bit memory operand, or a 32-bit immediate value in GNU Assembly. This version was a proof of concept and could be amended to support 64 bit immediate values. All the registers that are used are preserved, and the code will also account for the Linux 64-bit System V ABI red zone.
The code below is commented to point out what is occurring at each step.
printmac.inc:
.macro memreg_to_hex src # Macro takes one input
# src = memory operand, register,
# or 32 bit constant to print
# Writes the 16 hex digits of src to file descriptor 1 with the write system call.
# All general-purpose registers used are saved and restored; flags are not.
# NOTE: src is read after %rsp has been lowered by 128 bytes and 7 registers
# have been pushed, so an operand that refers to %rsp sees the adjusted value.
# Define the translation table only once for the current object
.ifndef MEMREG_TO_HEX_NOT_FIRST
.set MEMREG_TO_HEX_NOT_FIRST, 1
.PushSection .rodata
hex_xlat: .ascii "0123456789abcdef"
.PopSection
.endif
add $-128,%rsp # Avoid 128 byte red zone
push %rsi # Save all registers that will be used
push %rdi
push %rdx
push %rcx
push %rbx
push %rax
push %r11 # R11 is destroyed by SYSCALL
mov \src, %rdi # Move src value to RDI for processing
# Output buffer on stack at RSP-16 to RSP-1
lea -16(%rsp),%rsi # RSI = output buffer on stack
lea hex_xlat(%rip), %rdx # RDX = translation buffer address
xor %eax,%eax # RAX = Index into translation array
mov $16,%ecx # 16 nibbles to print
.align 16
1:
rol $4,%rdi # rotate high nibble to low nibble
mov %dil,%al # dil now has previous high nibble
and $0xf,%al # mask off all but low nibble
mov (%rdx,%rax,1),%al # Lookup in translation table
mov %al,(%rsi) # Store in output buffer
inc %rsi # Update output buffer address
dec %ecx
jne 1b # Loop until counter is 0
mov $1,%eax # Syscall 1 = sys_write
mov %eax,%edi # EDI = 1 = STDOUT
mov $16,%edx # EDX = Number of chars to print
sub %rdx,%rsi # RSI = beginning of output buffer
syscall
pop %r11 # Restore all registers used
pop %rax
pop %rbx
pop %rcx
pop %rdx
pop %rdi
pop %rsi
sub $-128,%rsp # Restore stack
.endm
printtest.s
.include "printmac.inc"
.global main
.text
main:
mov $0x123456789abcdef,%rcx
memreg_to_hex %rcx # Print the 64-bit value 0x123456789abcdef
memreg_to_hex %rsp # Prints %rsp as the macro sees it: after it has lowered %rsp by 128 bytes and pushed 7 registers
memreg_to_hex (%rsp) # Prints the quadword at that lowered %rsp (the last register the macro pushed), not main's return address
memreg_to_hex $0x402 # Doesn't support 64-bit immediates
# but can print anything that fits a DWORD
retq
This can be compiled and linked with:
gcc -m64 printtest.s -o printtest
The macro doesn't print an end of line character so the output of the test program looks like:
0123456789abcdef00007fff5283d74000007f5c4a080a500000000000000402
The memory addresses will be different.
Since the macros are inlined, each time you invoke the macro the entire code will be emitted. The code is space inefficient. The bulk of the code could be moved to an object file you can include at link time. Then a stub macro could wrap a CALL to the main printing function.
The code doesn't use printf because at some point I thought I saw a comment that you couldn't use the C library. If that's not the case this can be simplified greatly by calling printf to format the output to print a 64-bit hexadecimal value.
Just for fun, here are a couple other sequences for storing a hex string from a register. Printing the buffer is not the interesting part, IMO; copy that part from Michael's excellent answer if needed.
I tested some of these. I've included a main that calls one of these functions and then uses printf("%s\n%lx\n", result, test_value); to make it easy to spot problems.
Test main():
; Test driver: converts a constant with register_to_hex_ssse3 and prints the
; resulting string next to printf's own %lx rendering of the same value.
extern printf
global main
main:
        push    rbx                     ; save callee-saved rbx; also makes rsp 16-byte aligned
        mov     rdi, 0x1230ff56dcba9911 ; test value
        mov     rbx, rdi                ; keep a copy across the call
        sub     rsp, 32                 ; 16-byte buffer + terminator; keeps rsp 16-byte aligned
        mov     rsi, rsp                ; rsi = output buffer
        mov     byte [rsi+16], 0        ; NUL-terminate so printf can use %s
        call    register_to_hex_ssse3
        mov     rdx, rbx                ; 3rd arg: the value, for %lx
        lea     rdi, [rel fmt]          ; 1st arg: format string (RIP-relative, so this also links as PIE)
        mov     rsi, rsp                ; 2nd arg: the converted string
        xor     eax,eax                 ; variadic call: no vector registers used
        call    printf
        add     rsp, 32
        pop     rbx
        ret
section .rodata
fmt: db `%s\n%lx\n`, 0 ; YASM doesn't support `string with escapes`, so this only assembles with NASM.
; NASM needs
; %use smartalign
; ALIGNMODE p6, 32
; or similar, to stop it using braindead repeated single-byte NOPs for ALIGN
SSSE3 pshufb for the LUT
This version doesn't need a loop, but the code size is much larger than the rotate-loop versions because SSE instructions are longer.
section .rodata
ALIGN 16
hex_digits:
hex_xlat: db "0123456789abcdef"
section .text
;; rdi = val rsi = buffer
;; Writes 16 ASCII hex digits (most significant first) to [rsi]; no terminator is stored.
;; Uses xmm0, xmm1, xmm4, xmm5 and modifies rdi.
ALIGN 16
global register_to_hex_ssse3
register_to_hex_ssse3: ;;;; 0x39 bytes of code
;; use PSHUFB to do 16 nibble->ASCII LUT lookups in parallel
movaps xmm5, [rel hex_digits]
;; x86 is little-endian, but we want the hex digit for the high nibble to be the first character in the string
;; so reverse the bytes first; the unpack below then interleaves the nibbles as HI, LO, HI, LO, ... in memory order
bswap rdi
movq xmm1, rdi
;; generate the 0x0f mask on the fly, rather than loading it
;; this is a bit silly: we already load the LUT, so the mask could come from the same cache line as a memory operand for PAND, which is only used once
pcmpeqw xmm4,xmm4
psrlw xmm4, 12
packuswb xmm4,xmm4 ; [ 0x0f 0x0f 0x0f ... ] mask for low-nibble of each byte
movdqa xmm0, xmm1 ; xmm0 = low nibbles at the bottom of each byte (high nibble still present above them)
psrlw xmm1, 4 ; xmm1 = high nibbles at the bottom of each byte (with garbage from next byte)
punpcklbw xmm1, xmm0 ; unpacked nibbles (with garbage in the high 4b of some bytes)
pand xmm1, xmm4 ; mask off the garbage bits because pshufb reacts to the MSB of each element. Delaying until after interleaving the hi and lo nibbles means we only need one
pshufb xmm5, xmm1 ; xmm5 = the hex digit for the corresponding nibble index in xmm1
movups [rsi], xmm5
ret
AVX2: you can do two integers at once, with something like
int64x2_to_hex_avx2: ; (const char buf[32], uint64_t first, uint64_t second)
bswap rsi ; We could replace the two bswaps with one 256b vpshufb, but that would require a mask
vmovq xmm1, rsi
bswap rdx
vpinsrq xmm1, xmm1, rdx, 1
vpmovzxbw ymm1, xmm1 ; upper lane = rdx, lower lane = rsi, with each byte zero-extended to a word element
vpsllw ymm1, ymm1, 12 ; shift the high nibbles out, leaving the low nibbles at the top of each word
vpor ymm0, ymm0, ymm1 ; merge while hi and lo elements both need the same shift
vpsrlw ymm1, ymm1, 4 ; low nibbles in elems 1, 3, 5, ...
; high nibbles in elems 0, 2, 4, ...
pshufb / store ymm0 / ret
Using pmovzx and shifts to avoid pand is a win compared to generating the constant on the fly, I think, but probably not otherwise. It takes 2 extra shifts and a por. It's an option for the 16B non-AVX version, but it's SSE4.1.
Optimized for code-size (fits in 32 (0x20) bytes)
(Derived from Frank's loop)
Using cmov instead of the LUT to handle 0-9 vs. a-f might take fewer than 16B of extra code size. That might be fun: edits welcome.
The ways to get a nibble from the bottom of rsi into an otherwise-zeroed rax include:
mov al, sil (3B (REX required for sil)) / and al, 0x0f (2B special encoding for and al, imm8).
mov eax, esi (2B) / and eax, 0x0f (3B): same size and doesn't require an xor beforehand to zero the upper bytes of rax.
Would be smaller if the args were reversed, so the dest buffer was already in rdi. stosb is a tiny instruction (but slower than mov [rdi], al / inc rdi), so it actually saved overall bytes to use xchg rdi, rsi to set up for it. changing the function signature could save 5 bytes: void reg_to_hex(char buf[16], uint64_t val) would save two bytes from not having to return buf in rax, and 3 bytes from dropping the xchg. The caller will probably use 16B of stack, and having the caller do a mov rdx, rsp instead of mov rdx, rax before calling another function / syscall on the buffer doesn't save anything.
The next function is probably going to ALIGN 16, though, so shrinking the function to even smaller than 32B isn't as useful as getting it inside half a cache-line.
Absolute addressing for the LUT (hex_xlat) would save a few bytes
(use mov al, byte [hex_xlat + rax] instead of needing the lea).
global register_to_hex_size
; register_to_hex_size: code-size-optimised conversion of a 64-bit value to 16 hex digits.
; In:  rdi = value, rsi = destination buffer (16 bytes)
; Out: rax = start of the buffer
; Modifies rcx (only cl is used as the counter), rdx, rsi, rdi and flags.
register_to_hex_size:
push rsi ; pushing/popping return value (instead of mov rax, rsi) frees up rax for stosb
xchg rdi, rsi ; allows stosb. Better: remove this and change the function signature
mov cl, 16 ; 3B shorter than mov ecx, 16
lea rdx, [rel hex_xlat]
;ALIGN 16
.loop:
rol rsi, 4 ; after the xchg the value lives in rsi; bring its next most significant nibble to the bottom
mov eax, esi ; mov al, sil to allow 2B AND AL,0xf requires a 2B xor eax,eax
and eax, 0x0f ; rax = nibble value 0..15
mov al, byte [rdx+rax] ; al = ASCII hex digit
stosb ; store al at [rdi] and advance rdi
;; loop .loop ; setting up ecx instead of cl takes more bytes than loop saves
dec cl
jne .loop
pop rax ; get the return value back off the stack
ret
Using xlat costs 2B (to save/restore rbx), but saves 3B, for a net savings of 1B. It's a 3-uop instruction, with 7c latency, one per 2c throughput (Intel Skylake). The latency and throughput aren't a problem here, since each iteration is a separate dependency chain, and there's too much overhead for this to run at one clock per iteration anyway. So the main problem is that it's 3 uops, making it less uop-cache-friendly. With xlat, the loop becomes 10 uops instead of 8 (using stosb), so that sucks.
112: 89 f0 mov eax,esi
114: 24 0f and al,0xf
116: d7 xlat BYTE PTR ds:[rbx]
117: aa stos BYTE PTR es:[rdi],al
vs.
f1: 89 f0 mov eax,esi
f3: 83 e0 0f and eax,0xf
f6: 8a 04 02 mov al,BYTE PTR [rdx+rax*1]
f9: aa stos BYTE PTR es:[rdi],al
Interestingly, this still has no partial-register stalls, because we never read a wide register after writing only part of it. mov eax, esi is write-only, so it cleans up the partial-reg-ness from the load into al. So there would be no advantage to using movzx eax, byte [rdx+rax]. Even when we return to the caller, the pop rax doesn't leave the caller susceptible to partial-reg problems.
(If we don't bother returning the input pointer in rax, then the caller could have a problem. Except in that case it shouldn't be reading rax at all. Usually it only matters if you call with call-preserved registers in a partial-reg state, because the called function might push them. Or more obviously, with arg-passing / return-value registers.)
Efficient version (uop-cache friendly)
Looping backwards didn't turn out to save any instructions or bytes, but I've included this version because it's more different from the version in Frank's answer.
ALIGN 16
global register_to_hex_countdown
; In:  rdi = value, rsi = destination buffer (16 bytes)
; Out: rax = start of the buffer
; Modifies rcx, rdx, rsi, rdi and flags.
register_to_hex_countdown:
;;; work backwards in the buffer, starting with the least-significant nibble as the last char
mov rax, rsi ; return value, and loop bound
add rsi, 15 ; last char of the buffer
lea rcx, [rel hex_xlat] ; position-independent code
ALIGN 16
.loop:
mov edx, edi
and edx, 0x0f ; isolate low nibble
mov dl, byte [rcx+rdx] ; look up the ascii encoding for the hex digit
; rdx is an 'index' with range 0x0 - 0xf
; non-PIC version: mov dl, [hex_digits + rdx]
mov byte [rsi], dl ; store the digit at the current position
shr rdi, 4 ; next nibble
dec rsi ; move one character towards the start of the buffer
cmp rsi, rax
jae .loop ; rsi counts backwards down to its initial value; the loop ends once rsi is below the buffer start
ret
The whole thing is only 12 insns (11 uops with macro-fusion, or 12 including the NOP for alignment). Some CPUs can fuse cmp/jcc but not dec/jcc (e.g. AMD, and Nehalem)
Another option for looping backwards was mov ecx, 15, and store with mov [rsi+rcx], dl, but two-register addressing modes can't micro-fuse. Still, that would only bring the loop up to 8 uops, so it would be fine.
Instead of always storing 16 digits, this version could use rdi becoming zero as the loop condition to avoid printing leading zeros. i.e.
add rsi, 16
...
.loop:
...
dec rsi
mov byte [rsi], dl
shr rdi, 4
jnz .loop
; lea rax, [rsi+1] ; correction not needed because of adjustments to how rsi is managed
mov rax, rsi
ret
printing from rax to the end of the buffer gives just the significant digits of the integer.
I am making an addition program using x64 assembly, but it does not display a value when run (compiled with nasm, elf64).
; Adds two quadwords and attempts to print the sum.
section .text
global _start
_start:
mov rax, 0
add rax, [num1B] ; rax = 0x0A
add rax, [num2B] ; rax = 0x0A + 0x0A = 0x14
mov [result], rax ; store the sum as a binary quadword (it is not converted to ASCII digits)
mov rsi, [result] ; rsi = the stored value itself, not the address of result
;mov rdx, 8
mov rax, 4 ; call number 4 (NOTE(review): with int 80h this selects the 32-bit call table, whose arguments are taken from ebx/ecx/edx — confirm)
mov rdi, 1
int 80h
mov rax, 1 ; call number 1 through int 80h
mov rdi, 0
int 080h
section .data
num1B: dq 0Ah
num2B: dq 0Ah
result: dq 00h
Does anyone know why this is not displaying anything?
1. Later on, consider calling printf instead of issuing raw system calls; it is much more convenient.
2. Why are you loading the values of num1B and num2B instead of their addresses?
To load an address, use: mov rax, num1B.
3. In 64-bit NASM assembly on Linux, system-call arguments go in the registers
rdi, rsi, rdx, r10, r8 and r9, with the call number in rax.
For example:
; NOTE(review): illustrative fragment only - rax (the system-call number)
; is not set here, so this is not a complete call as written.
mov rdi, 01 ; first argument register
mov rsi, 00 ; second argument register
syscall
Don't use int 0x80 in 64-bit code! Use syscall, the native 64-bit interface, instead; besides, int 0x80 didn't work on my system.
Hope it helps. Correct me if I'm wrong.
Looks like you want to print 'result' to stdout and you are using the 32-bit
system call values.
In 64-bit Linux the system call number for write is 1, and you would write to stdout like this (AT&T syntax).
First split the value in %rax into decimal digits and push them one at a time on the stack. Say %rax
holds the value 0x7ffffff8:
# Print the unsigned 64-bit value held in %rax as decimal text on stdout.
#
# Phase 1 divides repeatedly by 10 and pushes each remainder, so the
# digits end up on the stack with the most significant one on top.
# Phase 2 pops them and writes them one character at a time through the
# one-byte buffer `result` (declared elsewhere).
#
# In:    %rax = value to print
# Clobb: %rax, %rbx, %rcx, %rdx, %rsi, %rdi, %r8, %r11, flags
#        (%rbx is callee-saved under the SysV ABI: save it first if this
#        is ever moved out of _start into a called function)
        .equ    SYS_WRITE, 1
        .equ    STDOUT_FILENO, 1

        mov     $10, %ebx               # divisor: base 10 (32-bit write zero-extends)
        xor     %r8d, %r8d              # digit count must start at 0
nibble:                                 # label name kept; each pass yields one DECIMAL digit
        xor     %edx, %edx              # div divides rdx:rax, so the high half must be 0
        div     %rbx                    # rax = quotient, rdx = remainder 0..9
        push    %rdx                    # save the digit
        inc     %r8                     # counted in r8 because syscall clobbers rcx and r11
        test    %rax, %rax              # anything left to divide?
        jnz     nibble                  # runs at least once, so 0 prints as "0"

        # set up the arguments that stay constant across the writes
        mov     $STDOUT_FILENO, %edi    # fd
        lea     result(%rip), %rsi      # buffer address, position-independent
decimal:
        pop     %rdx                    # next digit, most significant first
        add     $'0', %dl               # 0..9 -> '0'..'9'
        movb    %dl, (%rsi)             # place the character in the buffer
        mov     $SYS_WRITE, %eax        # reload every pass: syscall returns its result in rax
        mov     $1, %edx                # length: one byte
        syscall
        dec     %r8
        jnz     decimal                 # until every pushed digit has been written
You just need to set up a 1-byte data holder for the digits, either in the data section:
# One-byte scratch cell for the digit being printed, initialised to 0.
        .data
result: .byte   0
or the uninitialized data area:
# One-byte scratch cell for the digit being printed, left uninitialised.
        .bss
        .lcomm  result, 1
I'm sure there are better ways to do this, should give you some ideas though.
Get a 64-bit system call list, they have changed quite a lot from the 32-bit calls.
I am using 64-bit Linux and programming in assembler using gas. The issue I am having is that I let the user enter, let's say, "1 + 12" using the read system call, and save it as follows.
My read function:
#-----------------------------------------------------------------------
# _read
#   Reads up to EQUATION_MAX bytes from standard input into the buffer
#   `equation` (declared elsewhere) with the Linux read system call.
# In:    nothing
# Out:   %rax = value returned by the system call
# Clobb: %rcx, %r11 (by syscall), %rdx, %rsi, %rdi
# Note:  the buffer is not NUL-terminated here; a full EQUATION_MAX-byte
#        read leaves no room for a terminator.
#-----------------------------------------------------------------------
        .equ    SYS_READ, 0
        .equ    STDIN_FILENO, 0
        .equ    EQUATION_MAX, 200       # must match the size reserved for `equation`

        .type   _read, @function        # '@', not '#': '#' begins a comment in x86 gas
_read:
        mov     $EQUATION_MAX, %edx     # maximum number of bytes to read
        lea     equation(%rip), %rsi    # destination buffer, position-independent
        mov     $STDIN_FILENO, %edi
        mov     $SYS_READ, %eax
        syscall
        ret                             # leaf function: no frame pointer needed
        .size   _read, . - _read
equation is declared as:
# 200-byte uninitialised input buffer filled by the read function.
        .bss
        .lcomm  equation, 200
So I parse through each byte of equation trying to save the numbers, but if they enter "12" then I would first get 1 and then 2. I need to somehow save 12 on the stack and be able to just popq %rax and have 12 in there. I am not sure how to go about this. Any input would be greatly appreciated.
You'll have to write some sort of parser. Here's an example (I'm using 16-bit assembly in Intel syntax, but you get the gist of it):
;-----------------------------------------------------------------------
; parse_equation
;   Walks the zero-terminated string 'equation' and pushes every run of
;   decimal digits on the stack as a 16-bit integer, in order of
;   appearance. All other characters are skipped.
;
;   The results stay on the caller's stack, so the return address is
;   popped into di on entry and the routine leaves through 'jmp di'
;   rather than 'ret'.
;
; Uses:  ax, bx, cx, dx (written by mul), si, di, flags
; Limits: no overflow check and no sign handling.
;-----------------------------------------------------------------------
parse_equation:
        pop     di                      ; keep the return address out of the way
        cld                             ; lodsb must walk forwards
        lea     si, [equation]
        xor     cl, cl                  ; cl = digits seen in the current number

pe_scan:                                ; skip characters until a digit shows up
        lodsb
        test    al, al
        jz      pe_finish               ; terminator reached
        sub     al, '0'                 ; '0'..'9' -> 0..9, anything else wraps above 9
        cmp     al, 9
        ja      pe_scan                 ; unsigned test rejects both sides of the range
        movzx   bx, al                  ; bx = value of the first digit

pe_digits:                              ; accumulate the remaining digits
        inc     cl
        lodsb
        test    al, al
        jz      pe_finish
        sub     al, '0'
        cmp     al, 9
        ja      pe_flush                ; a non-digit ends the number
        mov     ch, al                  ; park the digit: mul overwrites ax
        mov     ax, 10
        mul     bx                      ; dx:ax = bx * 10
        movzx   bx, ch
        add     bx, ax                  ; bx = old bx * 10 + digit
        jmp     pe_digits

pe_flush:
        push    bx                      ; hand the finished number to the caller
        xor     cl, cl
        jmp     pe_scan                 ; look for the next number

pe_finish:
        test    cl, cl
        jz      pe_return
        push    bx                      ; string ended in the middle of a number
pe_return:
        jmp     di                      ; return to the saved address
My code ignores anything that isn't a number (like arithmetic operators), and doesn't handle signed numbers. I'll leave it to you to figure out how to handle such things.