movdqa segfault in custom asm script [duplicate] - linux

This question already has answers here:
After entering _start, is rsp aligned?
(1 answer)
Should %rsp be aligned to 16-byte boundary before calling a function in NASM?
(1 answer)
Why does the x86-64 / AMD64 System V ABI mandate a 16 byte stack alignment?
(1 answer)
Closed 9 months ago.
I have the following code snippet (https://godbolt.org/z/cE1qE9fvv) which contains a naive & vectorized version of a dot product.
I decided to build the vectorized version as a standalone asm file, as follows:
; Standalone build of the vectorized 8-element int32 dot product.
; Syntax: NASM (Intel operand order), x86-64 Linux, libc provides exit.
; The two inputs are materialized on the stack as packed dword pairs:
;   a[0..7] = 1, 2, ..., 8          at [rsp-72] .. [rsp-41]
;   b[0..7] = 10, 20, ..., 80       at [rsp-40] .. [rsp-9]
; rsp itself is never adjusted, so all of this data lives below the stack pointer.
; NOTE(review): the x86-64 System V ABI documents rsp as 16-byte aligned at
; process entry. If that holds here, rsp-72, rsp-56, rsp-40 and rsp-24 are all
; 8 (mod 16), and movdqa / pmulld with a memory operand fault on addresses
; that are not 16-byte aligned.
extern exit
section .text
global _start
_start:
mov rax, 8589934593 ; dwords 1, 2
mov QWORD [rsp-72], rax
mov rax, 17179869187 ; dwords 3, 4
mov QWORD [rsp-64], rax
mov rax, 25769803781 ; dwords 5, 6
mov QWORD [rsp-56], rax
mov rax, 34359738375 ; dwords 7, 8
mov QWORD [rsp-48], rax
mov rax, 85899345930 ; dwords 10, 20
mov QWORD [rsp-40], rax
mov rax, 171798691870 ; dwords 30, 40
mov QWORD [rsp-32], rax
mov rax, 257698037810 ; dwords 50, 60
mov QWORD [rsp-24], rax
mov rax, 343597383750 ; dwords 70, 80
mov QWORD [rsp-16], rax
movdqa xmm1, [rsp-72] ; xmm1 = a[0..3]
movdqa xmm0, [rsp-24] ; xmm0 = b[4..7]
pmulld xmm1, [rsp-40] ; xmm1 = a[0..3] * b[0..3], lane by lane
pmulld xmm0, [rsp-56] ; xmm0 = b[4..7] * a[4..7], lane by lane
paddd xmm0, xmm1 ; four partial sums
; horizontal reduction: fold the upper 8 bytes onto the lower 8, then lane 1 onto lane 0
movdqa xmm1, xmm0
psrldq xmm1, 8
paddd xmm0, xmm1
movdqa xmm1, xmm0
psrldq xmm1, 4
paddd xmm0, xmm1
movd eax, xmm0 ; eax = dot product (2040 for the data above)
.exit:
; NOTE(review): the result is left in eax and never copied to edi, the
; register that carries the first integer argument, so exit does not receive it.
call exit
I use the following to build: nasm -f elf64 dot_product.asm && gcc -g -no-pie -nostartfiles -o dot_product dot_product.o
The above code segfaults at movdqa xmm0, XMMWORD PTR [rsp-72], which probably means that the data is not 16-byte aligned. However, the following screenshot seems to indicate the opposite:
Am I misunderstanding something ?

Related

Intel x86 (IA32) assembly decoder stub for custom encoder not working as expected

I have written a custom encoder which encodes my shellcode in this way:
First it reverses (swaps) each pair of adjacent bytes in the original shellcode, and then it XORs each byte with the value 0xaa. I did a sanity check to ensure my original shellcode doesn't contain this value, which might break my shellcode (by producing bad characters as a result of the encoding). Output of my encoder:
Original Shellcode( 25 Bytes) :
0x31,0xc0,0x50,0x68,0x2f,0x2f,0x6c,0x73,0x68,0x2f,0x62,0x69,0x6e,0x89,0xe3,0x50,0x89,0xe2,0x53,0x89,0xe1,0xb0,0xb,0xcd,0x80,
Step1(Reverse adjacent Bytes)-Encoded Shellcode( 25 Bytes) :
0xc0,0x31,0x68,0x50,0x2f,0x2f,0x73,0x6c,0x2f,0x68,0x69,0x62,0x89,0x6e,0x50,0xe3,0xe2,0x89,0x89,0x53,0xb0,0xe1,0xcd,0xb,0x80,
Step2(XOR-each-BYTE-with-0xaa)-Encoded Shellcode( 25 Bytes) :
0x6a,0x9b,0xc2,0xfa,0x85,0x85,0xd9,0xc6,0x85,0xc2,0xc3,0xc8,0x23,0xc4,0xfa,0x49,0x48,0x23,0x23,0xf9,0x1a,0x4b,0x67,0xa1,0x2a,
My original shellcode's purpose: it just executes /bin/ls on Linux systems using the "execve" syscall. Full code:
; Unencoded reference payload: one execve call through the 32-bit int 0x80 interface.
; Register roles at the int 0x80: eax = 11 (execve), ebx = path, ecx = argv, edx = envp.
global _start
section .text
_start:
; push a zero dword; it terminates the path string built by the next two pushes
xor eax, eax
push eax
; push the 8 path bytes: in memory order 0x6e69622f is "/bin" and 0x68732f2f is "//sh"
; NOTE(review): the 25-byte dump quoted earlier encodes 0x736c2f2f ("//ls") for
; the first push, so this listing and that dump do not describe the same path.
push 0x68732f2f
push 0x6e69622f
mov ebx, esp ; ebx = address of the NUL-terminated path
push eax ; zero dword: argv terminator, also reused as the envp array
mov edx, esp ; edx = address of that zero dword
push ebx
mov ecx, esp ; ecx = address of the two-entry array {path, 0}
mov al, 11 ; eax was zeroed above, so eax = 11
int 0x80
In order to execute the shellcode I'm practicing how to write a decoder stub, which will decode my custom encoded shellcode, and then execute it on a target machine.
This is my decoder stub assembly code:
; Decoder stub, first attempt (the version reported to crash).
; Technique: jmp/call/pop. `call decode1` pushes the address of the byte that
; follows it (Shellcode), and the first `pop esi` retrieves that address.
; NOTE(review): both passes write into bytes assembled into section .text, so
; this presumably needs .text to be writable at run time. TODO confirm how it is run.
global _start
section .text
_start:
xor eax, eax ; eax doubles as the byte offset in decode2
xor ebx, ebx
xor ecx, ecx
xor edx, edx
mov cl, 12 ; ecx = 12 pair swaps, covering 24 of the 25 payload bytes
jmp short call_decoder
; first : decode by XOR again with same value 0xaa
decode1:
; NOTE(review): this pop is inside the loop. Every iteration after the first
; pops another dword off the stack into esi and throws away the pointer that
; `inc esi` just advanced.
pop esi
xor byte [esi], 0xaa
jz decode2 ; zero result means the byte was the 0xaa end marker
inc esi
jmp short decode1
; second: rearrange the reversed adjacent BYTES, as part of encoding
decode2:
; NOTE(review): also inside its loop, and nothing was pushed for it to pop,
; so esi no longer holds the address of Shellcode here.
pop esi
mov bl, byte [esi + eax]
mov dl, byte [esi + eax + 1]
xchg bl, dl
mov byte [esi + eax], bl
mov byte [esi + eax + 1], dl
add al, 2 ; advance to the next byte pair
loop decode2 ; decrements ecx and repeats while it is non-zero
; execute Shellcode
jmp short Shellcode
call_decoder:
call decode1
; an extra byte 0xaa added at the end of encoded shellcode, as a marker to end of shellcode bytes.
Shellcode: db 0x6a,0x9b,0xc2,0xfa,0x85,0x85,0xd9,0xc6,0x85,0xc2,0xc3,0xc8,0x23,0xc4,0xfa,0x49,0x48,0x23,0x23,0xf9,0x1a,0x4b,0x67,0xa1,0x2a,0xaa
But the above code gives me a segmentation fault. I'm unable to find the point of failure in the gdb debugger. I need some help figuring out what I'm doing wrong.
Based on comments made by @prl, these are the changes I made to my decoder stub, and now it works as expected:
; Decoder stub, second version: the return address is popped exactly once.
; Register roles after `decoder`: esi = start of Shellcode (kept fixed),
; edi = cursor for the XOR pass, eax = byte offset for the swap pass,
; ecx = remaining pair swaps.
; NOTE(review): both passes modify bytes assembled into section .text, so this
; presumably needs .text to be writable at run time. TODO confirm how it is run.
global _start
section .text
; initialize registers
_start:
xor eax, eax
xor ebx, ebx
xor ecx, ecx
xor edx, edx
mov cl, 12 ; 12 pair swaps cover 24 bytes; the 25th byte has no partner
jmp short call_decoder
; set starting address of Shellcode in esi register
decoder:
pop esi ; address pushed by `call decoder`, i.e. Shellcode
mov edi, esi
; first: decode by XOR again with same value 0xaa
decode1:
xor byte [edi], 0xaa
jz decode2 ; zero result: this was the 0xaa end marker, which is now 0x00
inc edi
jmp short decode1
; second: rearrange the reversed adjacent BYTES, as part of encoding
decode2:
mov bl, byte [esi + eax]
mov dl, byte [esi + eax + 1]
xchg bl, dl
mov byte [esi + eax], bl
mov byte [esi + eax + 1], dl
add al, 2 ; advance to the next byte pair
loop decode2 ; decrements ecx and repeats while it is non-zero
jmp short Shellcode ; run the decoded payload
call_decoder:
call decoder
; 25 encoded payload bytes followed by the 0xaa end marker
Shellcode: db 0x6a,0x9b,0xc2,0xfa,0x85,0x85,0xd9,0xc6,0x85,0xc2,0xc3,0xc8,0x23,0xc4,0xfa,0x49,0x48,0x23,0x23,0xf9,0x1a,0x4b,0x67,0xa1,0x2a,0xaa
EDIT2 :
A much more concise and better-looking version, which also removes the need to hardcode the length of the shellcode:
; Decoder stub, third version: XOR and pair swap are fused into a single pass
; and the payload length comes from the assembler (codeLen) instead of an end marker.
; Register roles: esi = start of Shellcode, al = current byte offset,
; cl = codeLen - 1 (offset of the last byte), bl = scratch byte.
; NOTE(review): the pass modifies bytes assembled into section .text, so this
; presumably needs .text to be writable at run time. TODO confirm how it is run.
global _start
section .text
_start:
xor eax, eax
xor ebx, ebx
xor ecx, ecx
jmp short call_decoder
decoder:
pop esi ; address pushed by `call decoder`, i.e. Shellcode
mov cl, codeLen
dec cl ; cl = offset of the last payload byte
decode:
cmp al, cl
jz last_byte_odd ; only one byte left: it has no partner to swap with
xor byte [esi + eax], 0xaa
mov bl, byte [esi + eax] ; bl = decoded byte at offset i
xor byte [esi + eax + 1], 0xaa
xchg byte [esi + eax + 1], bl ; slot i+1 receives decoded byte i, bl receives decoded byte i+1
mov byte [esi + eax], bl ; slot i receives decoded byte i+1: the pair is swapped back
add al, 1
cmp al, cl
jz Shellcode ; the pair just handled ended on the last byte (even length)
add al, 1
jmp short decode
last_byte_odd:
xor byte [esi + eax], 0xaa ; odd length: the final byte is only XOR-decoded
jmp short Shellcode
call_decoder:
call decoder
; 25 encoded payload bytes, no end marker in this version
Shellcode: db 0x6a,0x9b,0xc2,0xfa,0x85,0x85,0xd9,0xc6,0x85,0xc2,0xc3,0xc8,0x23,0xc4,0xfa,0x49,0x48,0x23,0x23,0xf9,0x1a,0x4b,0x67,0xa1,0x2a
codeLen equ $-Shellcode
I leave it up to the low level and shell-coding enthusiasts, to decipher the logic.

How to load constant value to xmm register? [duplicate]

This question already has answers here:
Add a constant value to a xmm register in x86
(2 answers)
How to move a floating-point constant value into an xmm register?
(2 answers)
Closed 1 year ago.
This program reads values from a text file and checks whether each value is greater than a threshold. The problem is that I can't load a constant value to use as the criterion for the ucomisd instruction: the value in the register is always 0. Can you suggest how to solve this problem? I also tried other methods of loading a constant value in yasm, but they didn't work either.
global st_did_suma
section .text
;-----------------------------------------------------------------------------
; double st_did_suma(double* matrica, uint64_t N)
; In:  rdi = matrica, rsi = N
; Out: xmm0
;      N == 0 returns 0.0, N == 1 returns matrica[0]; otherwise elements
;      from matrica[1] onward are compared against the constant in xmm2.
; NOTE(review): open issues visible in this listing:
;   - the threshold is moved through eax with movd (32 bits), while ucomisd
;     compares 64-bit doubles;
;   - the .next loop decrements rsi but never tests it, so the loop is not
;     bounded by N;
;   - r11 and r8 are popped in the same order they were pushed, which swaps
;     their values on return.
;-----------------------------------------------------------------------------
st_did_suma:
push rbx
push r11
push r8
push rcx
push rdx
push rbp
mov rax, rsi
mul rsi ; rdx:rax = N * N; the product is not read again in this listing
xorpd xmm0, xmm0 ; result = 0.0
cmp rsi, 0
je .end
cmp rsi, 1
jnz .next
movsd xmm0, qword [rdi]
jmp .end
.next:
add rdi, 8 ; step to the next double
dec rsi
movsd xmm1, qword [rdi]
mov eax, 100.0 ; intended threshold; the asker reports xmm2 ends up 0
movd xmm2, eax ; copies only 32 bits from eax, the rest of xmm2 is zeroed
ucomisd xmm1, xmm2 ; compares the low 64 bits of each register as doubles
jb .else ; taken when xmm1 is below xmm2 (or the compare is unordered)
mov r10, 1
cvtsi2sd xmm0, r10 ; result = 1.0
jnz .next ; flags are still those of ucomisd: loop while xmm1 != xmm2
.else:
subsd xmm1, xmm1 ; xmm1 = xmm1 - xmm1
addsd xmm0, xmm1 ; result += xmm1
jmp .end
.end:
pop rbp
pop rdx
pop rcx
pop r11 ; NOTE(review): receives the value saved from r8
pop r8 ; NOTE(review): receives the value saved from r11
pop rbx
ret

Printing binary string in assembly

I'm writing a program to print binary string of a hardcoded word. Here is how it looks like currently:
main.asm
; main.asm: passes one 16-bit value on the stack to _print_binary_content, then exits.
section .text
global _start
extern _print_binary_content
_start:
; 16-bit push: rsp moves down by 2 bytes; the callee removes the value with `pop bx`
push word [word_to_print] ; pushing word. Can we push just one byte?
call _print_binary_content
; exit(0) through the raw Linux syscall interface
mov rax, 60
mov rdi, 0
syscall
section .data
word_to_print: dw 0xAB0F ; the value whose 16 bits are printed
printer.asm
; printer.asm: prints the 16 bits of a word as ASCII '0'/'1', most significant bit first.
SYS_BRK_NUM equ 0x0C ; brk syscall number
BITS_IN_WORD equ 0x10 ; 16 output characters, one per bit
SYS_WRITE_NUM equ 0x01 ; write syscall number
STD_OUT_FD equ 0x01
FIRST_BIT_BIT_MASK equ 0x01 ; isolates bit 0
ASCII_NUMBER_OFFSET equ 0x30 ; ASCII '0'
section .text
global _print_binary_content
;-----------------------------------------------------------------------------
; _print_binary_content
; In:    one 16-bit word on the stack, pushed by the caller just before the call
; Out:   16 characters written to fd STD_OUT_FD; the word is removed from the stack
; Uses:  rax, rbx, rcx, rdx, rsi, rdi, rbp, r12 (none are saved or restored)
; NOTE(review): rbx, rbp and r12 are callee-saved under the System V AMD64 ABI;
; this routine uses its own convention. Return values of brk and write are not checked.
;-----------------------------------------------------------------------------
_print_binary_content:
pop rbp ; take the return address off the stack so the argument can be reached
xor ecx, ecx ;zeroing rcx
xor ebx, ebx ;zeroing rbx
pop bx ;the word to print the binary content of
;sys_brk for current location
mov rax, SYS_BRK_NUM
mov rdi, 0
syscall
;end sys_brk
mov r12, rax ;save the current break location
;sys_brk for memory allocation 16 bytes
lea rdi, [rax + BITS_IN_WORD]
mov rax, SYS_BRK_NUM
syscall
;end sys_brk
; the syscall instruction overwrites rcx, so the counter is cleared again here
xor ecx, ecx
mov cl, byte BITS_IN_WORD - 1; used as a counter in the loop below
loop:
mov dx, bx
and dx, FIRST_BIT_BIT_MASK ; dx = lowest remaining bit
add dx, ASCII_NUMBER_OFFSET ; dx = '0' or '1'
mov [r12 + rcx], dl ; fill the buffer from its last byte towards the first
shr bx, 0x01
dec cl
cmp cl, 0
jge loop ; signed compare: stops once cl has gone from 0 to -1
mov rsi, r12
mov rax, SYS_WRITE_NUM
mov rdi, STD_OUT_FD
mov rdx, BITS_IN_WORD
syscall
push rbp ; pushing return address back
ret
If I compile, link and run this program, it works. But my question is about performance and perhaps the conventions of writing assembly programs. In the file printer.asm I clear ecx twice, which does not look optimal. Maybe some registers are not used for their intended purpose (I consulted the Intel manual).
Can you please help me to improve this very simple program?

How to compare the count of command line arguments correctly in NASM?

I am learning x86_64 NASM assembly on Ubuntu 16.10 on Docker for Mac.
The following program takes two command line arguments and sums them.
If the number of command line arguments is not two, it prints an error message (jumps to argcError).
When I run this program, it jumps to the argcError section even though I pass two command line arguments.
Why does this program jump to argcError?
; Constants and messages for the argument-summing program.
section .data
SYS_WRITE equ 1 ; write syscall number
STD_IN equ 1 ; NOTE(review): used as the fd for write; fd 1 is conventionally stdout despite the name
SYS_EXIT equ 60 ; exit syscall number
EXIT_CODE equ 0
NEW_LINE db 0xa
WRONG_ARGC db "Must be two command line arguments", 0xa ; 34 characters plus newline = 35 bytes
; Entry point: expects argc == 3 (program name plus two decimal operands),
; converts both operands with str_to_int and adds them into r10.
; Assumes the usual Linux process-entry stack: [rsp] = argc, followed by the argv pointers.
section .text
global _start
_start:
pop rcx ; rcx = argc
cmp rcx, 3
jne argcError
add rsp, 8 ; skip argv[0]
pop rsi ; rsi = argv[1]
call str_to_int
mov r10, rax
pop rsi ; rsi = argv[2]
call str_to_int
mov r11, rax
add r10, r11 ; r10 = sum of the two operands
; NOTE(review): there is no jump here, so execution falls through into
; argcError and the error message is printed even when argc == 3;
; int_to_str and print are never reached from this path.
argcError:
mov rax, 1 ; write
mov rdi, 1 ; fd 1
mov rsi, WRONG_ARGC
mov rdx, 35 ; byte length of WRONG_ARGC including its newline
syscall
jmp exit
;-----------------------------------------------------------------------------
; str_to_int
; In:    rsi = address of a NUL-terminated string of decimal digits
; Out:   rax = accumulated value (0 for an empty string)
; Uses:  rcx (set to 10), rdx (high half of each mul), bl, rsi (left at the NUL)
; Characters are not validated: every byte has 48 subtracted and is added in.
; NOTE(review): only bl is written but all of rbx is added, so the result
; depends on the upper 56 bits of rbx being zero on entry.
;-----------------------------------------------------------------------------
str_to_int:
xor rax, rax
mov rcx, 10
next:
cmp [rsi], byte 0
je return_str
mov bl, [rsi]
sub bl, 48 ; ASCII digit to its numeric value
mul rcx ; rax = rax * rcx
add rax, rbx
inc rsi
jmp next
return_str:
ret
;-----------------------------------------------------------------------------
; int_to_str, print, printNewline, exit: output tail of the program.
; In:    rax = unsigned value to print, r12 = digit counter
; Each decimal digit is pushed as a full 8-byte stack slot, least significant
; digit first, so the most significant digit ends up at the lowest address.
; NOTE(review): nothing in the visible code jumps to int_to_str, and r12 is
; never initialised before being incremented.
;-----------------------------------------------------------------------------
int_to_str:
mov rdx, 0 ; clear the high half of the dividend rdx:rax
mov rbx, 10
div rbx ; rax = quotient, rdx = remainder (next digit)
add rdx, 48 ; digit to ASCII
add rdx, 0x0 ; adds zero: no effect on the value
push rdx
inc r12
cmp rax, 0x0
jne int_to_str
jmp print
print:
; calculate byte length of number string
mov rax, 1
mul r12 ; rax = digit count
mov r12, 8
mul r12 ; rax = digit count * 8, since every digit occupies one 8-byte slot
mov rdx, rax
; print sum
mov rax, SYS_WRITE
mov rdi, STD_IN
mov rsi, rsp ; the pushed digits start at the current top of the stack
syscall
jmp printNewline
printNewline:
mov rax, SYS_WRITE
mov rdi, STD_IN
mov rsi, NEW_LINE
mov rdx, 1
syscall
jmp exit
exit:
mov rax, SYS_EXIT
mov rdi, EXIT_CODE
syscall
There are probably other errors in your code, as pointed out by Michael Petch, but the way you've initialized RSI is incorrect. Yes, RSP does point to the number of arguments passed, but popping it off the stack and then adding 8 to RSP again is functionally equivalent to:
mov rcx, [rsp] ; reads the qword at the top of the stack without moving rsp
Then, by popping into RSI, it only becomes a copy of RCX. If you want to do that, it should look like this:
; NOTE(review): with the usual entry layout ([rsp] = argc, then argv[0],
; argv[1], ...), `pop rcx` already leaves rsp at argv[0]. Unless the elided
; lines move rsp, the two adjustments below skip past argv[1] and argv[2].
; Verify these offsets before relying on them.
pop rcx
.......
add rsp, 24 ; Now RSP is pointing to proper place in array of pointers
pop rsi
add rsp, 16 ; Now point to pointer to second argument
pop rsi
An alternative would be this next example; my personal preference is not to use the stack pointer for anything other than what it was intended for.
; Walks the entry stack with lodsq instead of pop. Each lodsq loads the qword
; at [rsi] into rax and advances rsi by 8 (assuming the direction flag is clear).
; Convert and argError are not defined in this snippet.
mov rsi, rsp
lodsq ; Read # of arguments passed by OS
add rsi, 8 ; bounce over application name
cmp al, 3 ; compares only the low byte of the count
jnz argError
push rsi ; saves the address of the argv[1] slot
lodsq
mov rsi, rax ; RSI points to first argument
call Convert
; NOTE(review): the value restored here was saved before the lodsq above
; advanced rsi, so the next lodsq appears to load argv[1] a second time
; rather than argv[2]. Confirm against a debugger.
pop rsi
lodsq
mov rsi, rax
call Convert

Equivalent assembly code works incorrectly (x86_64 Linux)

I'm trying to write a simple compiler. My language can evaluate arithmetic expressions on floating-point numbers, store a variable, and print a variable. In the first version of my compiler, all calculations used only push and pop. Then I switched to addressing the stack slots directly. I did not allocate memory with sub rsp; instead I used [rsp-8*x], where x is the index of the value on the stack, mimicking push and pop. But the issue is that this does not work with the pow function from libc. I can't understand what I'm doing wrong.
I refactored the code a bit, for your convenience.
My first version in assembly (nasm syntax):
; Version 1: operands are pushed, so they sit at or above rsp whenever pow is called.
; Evaluates pow(2.0, pow(2.0, 3.0)) and prints the result with printf.
; NOTE(review): 0x20 bytes plus three 8-byte pushes moves rsp by 56 bytes, so
; if rsp is 16-byte aligned at entry it is 8 (mod 16) at the first `call pow`;
; after `add rsp, 8` it is 16-byte aligned again for the remaining calls.
[bits 64]
global _start
extern printf
extern pow
section .data
printf_format db '%lf', 10, 0
section .text
_start:
mov rbp, rsp
sub rsp, 0x20
mov rax, 0x4000000000000000 ; bit pattern of the double 2.0
push rax
mov rax, 0x4000000000000000 ; 2.0
push rax
mov rax, 0x4008000000000000 ; 3.0
push rax
movsd xmm0, qword [rsp+8] ; base = 2.0
movsd xmm1, qword [rsp] ; exponent = 3.0
call pow
movsd qword [rsp+8], xmm0 ; store the result over the consumed base slot
add rsp, 8 ; drop the exponent slot; the result is now at [rsp]
movsd xmm0, qword [rsp+8] ; base = the first 2.0 pushed
movsd xmm1, qword [rsp] ; exponent = previous result
call pow
mov rdi, printf_format
mov rax, 1 ; al = number of vector registers carrying variadic arguments (xmm0)
call printf
; exit(0) through the raw syscall
; NOTE(review): presumably this bypasses libc's exit-time stdio flushing. Confirm if output is redirected.
mov rax, 60
mov rdi, 0
syscall
My second version:
; Version 2: the same expression, but operands are stored at negative offsets
; from rsp instead of being pushed.
; NOTE(review): all three slots lie below rsp. Each `call` pushes its 8-byte
; return address at [rsp-8], which is the slot holding the first operand, and
; the callee is free to use the memory below that for its own frame. Values
; kept there are therefore not preserved across the calls.
[bits 64]
global _start
extern printf
extern pow
section .data
printf_format db '%lf', 10, 0
section .text
_start:
mov rbp, rsp
sub rsp, 0x20 ; reserves 32 bytes that the stores below never use
mov rax, 0x4000000000000000 ; bit pattern of the double 2.0
mov qword [rsp-8*1], rax
mov rax, 0x4000000000000000 ; 2.0
mov qword [rsp-8*2], rax
mov rax, 0x4008000000000000 ; 3.0
mov qword [rsp-8*3], rax
movsd xmm0, qword [rsp-8*2] ; base = 2.0
movsd xmm1, qword [rsp-8*3] ; exponent = 3.0
call pow ; the return address lands on [rsp-8*1]
movsd qword [rsp-8*2], xmm0
movsd xmm0, qword [rsp-8*1] ; reads the slot the call just overwrote
movsd xmm1, qword [rsp-8*2]
call pow
mov rdi, printf_format
mov rax, 1 ; al = number of vector registers carrying variadic arguments (xmm0)
call printf
; exit(0) through the raw syscall
mov rax, 60
mov rdi, 0
syscall
I compile and link this with:
nasm -f elf64 ex.asm
ld -lc -lm -m elf_x86_64 -I/lib/ld-linux-x86-64.so.2 ex.o -o ex
In the latest version of my compiler, I addressed the values as [rsp+8*x] and allocated the space with sub rsp, and the problem was solved.
My question is: why did that change solve the problem?

Resources