The following 64-bit NASM code uses lock cmpxchg to take each core in core order, execute some code, then reset the core number variable using xchg so the next core can execute the code. The core number for each core is stored in rbx -- the four cores are numbered 0, 8, 16 and 24. The variable [spin_lock_core] starts at zero and when each core is finished it updates the core number by 8 at the final line xchg [spin_lock_core],rax.
Spin_lock:
xor rax,rax
lock cmpxchg [spin_lock_core],rbx
jnz Spin_lock
; Test
mov rbp,extra_test_array
mov [rbp+rbx],rbx
; Execute some code before looping out
mov rax,1234
mov rdx,23435
add rax,rbx
mov rcx,rax
;jmp label_899
mov rax,rbx
add rax,8
xchg [spin_lock_core],rax
But before the code reaches xchg [spin_lock_core],rax the first core loops out of the program (jmp label_899), which should cause the other threads to freeze because they would be waiting for the [spin_lock_core] var to be updated, which never happens. But instead all four cores are written to the output array extra_test_array, which is displayed on the terminal when the program exits. In other words, this fails to stop the cores until the core number is updated.
The full, minimal code is below (as minimal as NASM can be in this case). The code is written for a shared object, and it's reproducible if it gets an input array (as written it doesn't matter if the input array is int or float):
; Header Section
[BITS 64]
[default rel]
global Main_Entry_fn
extern pthread_create, pthread_join, pthread_exit, pthread_self, sched_getcpu
global FreeMem_fn
extern malloc, realloc, free
extern sprintf
section .data align=16
X_ctr: dq 0
data_master_ptr: dq 0
initial_dynamic_length: dq 0
XMM_Stack: dq 0, 0, 0, 0, 0, 0, 0
ThreadID: dq 0
X_ptr: dq 0
X_length: dq 0
X: dq 0
collect_ptr: dq 0
collect_length: dq 0
collect_ctr: dq 0
even_squares_list_ptrs: dq 0, 0, 0, 0
even_squares_list_ctr: dq 0
even_squares_list_length: dq 0
Number_Of_Cores: dq 32
pthread_attr_t: dq 0
pthread_arg: dq 0
Join_Ret_Val: dq 0
tcounter: dq 0
sched_getcpu_array: times 4 dq 0
ThreadIDLocked: dq 0
spin_lock_core: dq 0
extra_test_array: dq 0
; __________
section .text
Init_Cores_fn:
; _____
; Create Threads
label_0:
mov rdi,ThreadID ; ThreadCount
mov rsi,pthread_attr_t ; Thread Attributes
mov rdx,Test_fn ; Function Pointer
mov rcx,pthread_arg
call pthread_create wrt ..plt
mov rdi,[ThreadID] ; id to wait on
mov rsi,Join_Ret_Val ; return value
call pthread_join wrt ..plt
mov rax,[tcounter]
add rax,8
mov [tcounter],rax
mov rbx,[Number_Of_Cores]
cmp rax,rbx
jl label_0
; _____
jmp label_900 ; All threads return here, and exit
; ______________________________________
Test_fn:
; Get the core number
call sched_getcpu wrt ..plt
mov rbx,8 ; multiply by 8
mul rbx
push rax
pop rax
mov rbx,rax
push rax
Spin_lock:
lock cmpxchg [spin_lock_core],rbx
jnz Spin_lock
; Test
mov rbp,extra_test_array
mov [rbp+rbx],rbx
; Execute some code before looping out
mov rax,1234
mov rdx,23435
add rax,rbx
mov rcx,rax
jmp label_899
mov rax,rbx
add rax,8
xchg [spin_lock_core],rax
;__________
label_899:
pop rax
ret
; __________
label_900:
mov rdi,extra_test_array ;audit_array
mov rax,rdi
ret
;__________
;Free the memory
FreeMem_fn:
;The pointer is passed back in rcx (of course)
sub rsp,40
call free wrt ..plt
add rsp,40
ret
; __________
; Main Entry
Main_Entry_fn:
push rdi
push rbp
push rbx
push r15
xor r15,r15
push r14
xor r14,r14
push r13
xor r13,r13
push r12
xor r12,r12
push r11
xor r11,r11
push r10
xor r10,r10
push r9
xor r9,r9
push r8
xor r8,r8
movsd [XMM_Stack+0],xmm13
movsd [XMM_Stack+8],xmm12
movsd [XMM_Stack+16],xmm11
movsd [XMM_Stack+24],xmm15
movsd [XMM_Stack+32],xmm14
movsd [XMM_Stack+40],xmm10
mov [X_ptr],rdi
mov [data_master_ptr],rsi
; Now assign lengths
lea rdi,[data_master_ptr]
mov rbp,[rdi]
xor rcx,rcx
movsd xmm0,qword[rbp+rcx]
cvttsd2si rax,xmm0
mov [X_length],rax
add rcx,8
; __________
; Write variables to assigned registers
mov r15,0
lea rdi,[rel collect_ptr]
mov r14,qword[rdi]
mov r13,[collect_ctr]
mov r12,[collect_length]
lea rdi,[rel X_ptr]
mov r11,qword[rdi]
mov r10,[X_length]
; __________
call Init_Cores_fn
movsd xmm10,[XMM_Stack+0]
movsd xmm14,[XMM_Stack+8]
movsd xmm15,[XMM_Stack+16]
movsd xmm11,[XMM_Stack+24]
movsd xmm12,[XMM_Stack+32]
movsd xmm13,[XMM_Stack+40]
pop r8
pop r9
pop r10
pop r11
pop r12
pop r13
pop r14
pop r15
pop rbx
pop rbp
pop rdi
ret
The instruction "lock cmpxchg" should fail until the [spin_lock_core] variable is updated, but it doesn't do that.
Thanks for any help in understanding why lock cmpxchg doesn't prevent the cores after core zero from firing in this area of code.
UPDATE: other research shows that xor rax,rax is needed at the top of the Spin_lock: section. When I insert that line, it reads like this:
Spin_lock:
xor rax,rax
lock cmpxchg [spin_lock_core],rbx
jnz Spin_lock
With that change it freezes, as expected. But when I remove the line jmp label_899 it still freezes, but it shouldn't do that.
EDIT 122219:
Based on the comments on this question yesterday, I revised the spinlock code to (1) eliminate atomic operations in favor of faster mov and cmp instructions, (2) assign a unique memory location to each core, and (3) separate the memory locations by > 256 bytes to avoid memory on the same cache line.
Each core's memory location will be changed to 1 when the previous core is finished. When each core finishes, it sets its own memory location back to 0.
The code successfully executes core 0 IF I have all other cores loop out before the spinlock. When I let all four cores run through the spinlock, the program again hangs.
I've verified that each separate memory location is set to 1 when the previous core is finished.
Here's the updated spinlock section:
section .data
spin_lock_core: times 140 dq 0
spin_lock_core_offsets: dq 0,264,528,792
section .text
; Calculate the offset to spin_lock_core
mov rbp,spin_lock_core
mov rdi,spin_lock_core_offsets
mov rax,[rdi+rbx]
add rbp,rax
; ________
Spin_lock:
pause
cmp byte[rbp],1
jnz Spin_lock
xor rax,rax
mov [rbp],rax ; Set current memory location to zero
; Execute some code before looping out
mov rax,1234
mov rdx,23435
add rax,rdx
mov rcx,rax
; Loop out if this is the last core
mov rax,rbx
add rax,8
cmp rax,[Number_Of_Cores]
jge label_899
; Set next core to 1 by adding 264 to the base address
add rbp,264
mov rax,1
mov [rbp],rax
Why does this code still hang?
I don't think you should use cmpxchg for this at all. Try this:
Spin_lock:
pause
cmp [spin_lock_core],rbx
jnz Spin_lock
; Test
mov rbp,extra_test_array
mov [rbp+rbx],rbx
; Execute some code before looping out
mov rax,1234
mov rdx,23435
add rax,rbx
mov rcx,rax
;jmp label_899
lea rax,[rbx+8]
mov [spin_lock_core],rax
I solved this spinlock problem, but after Peter Cordes' comment below I see that it is not correct. I won't delete this answer because I hope it can lead to the solution.
I use lock cmpxchg [rbp+rbx],rbx, which assembles without error, but the NASM assembler should return a "invalid combination of operands" error because the source operand can only be rax, so it shouldn't assemble with any other register. I also note that the online resources (for example, https://www.felixcloutier.com/x86/cmpxchg) show the format as CMPXCHG r/m64,r64, but the source operand can't be any r64 -- it must be rax, as that entry goes on to say.
Without the "mov rax,rbx" line it works because on the first iteration the rax register is set to 0 which matches the memory location. On the second iteration it succeeds by default.
When I add "mov rax,rbx" -- which resets rax -- the program once again hangs. I would really appreciate any ideas on why this program should hang as written.
At the start of this block rbx is the core number:
section .data
spin_lock_core: times 4 dq 0
section .text
[ Code leading up to this spinlock section shown above ]
mov rbp,spin_lock_core
Spin_lock:
pause
mov rax,rbx
lock cmpxchg [rbp+rbx],rax
jnz Spin_lock
mov rax,rbx
add rax,8
cmp rax,[Number_Of_Cores]
jge spin_lock_out
xchg [rbp+rax],rax
spin_lock_out:
The differences from my original post are:
Each core spins on (and reads from) its own unique memory location.
I use the "pause" instruction on the spinlock.
Each unique memory location is updated in core order.
But it does not work when I include mov rax,rbx. Intuitively that should work, so I will really appreciate any ideas on why it doesn't in this case.
This question already has answers here:
Nasm segmentation fault on RET in _start
(1 answer)
Assembly segmentation fault after making a system call, at the end of my code
(1 answer)
Closed 4 years ago.
Program received signal SIGSEGV, Segmentation fault.
I am using this assembly program:
and for some reason I'm getting a segmentation fault, is it related to the stack? What does this program even supposed to output? somebody told me to run it and see what it does, any ideas?
I thought it would be problematic and made a post about it, but it wasn't a problem, I tried debugging it with GDB and that didn't really help. If you could tell me what it outputs it would probably even be better, that's what I'm really looking for
.code32
.intel_syntax noprefix
.global _start
_start:
mov eax, 0x04
push eax
mov eax, 0x2d
push eax
call asm2
asm2:
push ebp
mov ebp, esp
sub esp, 0x10
mov eax, DWORD PTR [ebp + 0xc] --- seg fault here
mov DWORD PTR [ebp-0x4], eax
mov eax, DWORD PTR[ebp+0x8]
mov DWORD PTR [ebp-0x8], eax
jmp part_b
part_a:
add DWORD PTR [ebp - 0x4], 0x1
add DWORD PTR [ebp + 0x8], 0x64
part_b:
cmp DWORD PTR [ebp+0x8], 0x1d89
jle part_a
mov eax, DWORD PTR [ebp-0x4]
mov esp, ebp
pop ebp
ret
I was experimenting and have the following assembly code, which works very well, except that I get a "Segmentation fault (core dumped)" message right before my program ends:
GLOBAL _start
%define ___STDIN 0
%define ___STDOUT 1
%define ___SYSCALL_WRITE 0x04
segment .data
segment .rodata
L1 db "hello World", 10, 0
segment .bss
segment .text
_start:
mov eax, ___SYSCALL_WRITE
mov ebx, ___STDOUT
mov ecx, L1
mov edx, 13
int 0x80
It doesn't matter whether or not I have ret at the end; I still get the message.
What's the problem?
I'm using x86 and nasm.
You can't ret from start; it isn't a function and there's no return address on the stack. The stack pointer points at argc on process entry.
As n.m. said in the comments, the issue is that you aren't exiting the program, so execution runs off into garbage code and you get a segfault.
What you need is:
;; Linux 32-bit x86
%define ___SYSCALL_EXIT 1
// ... at the end of _start:
mov eax, ___SYSCALL_EXIT
mov ebx, 0
int 0x80
(The above is 32-bit code. In 64-bit code you want mov eax, 231 (exit_group) / syscall, with the exit status in EDI. For example:
;; Linux x86-64
xor edi, edi ; or mov edi, eax if you have a ret val in EAX
mov eax, 231 ; __NR_exit_group
syscall
Following this thread, How do i read single character input from keyboard using nasm (assembly) under ubuntu? ,I'm trying to compile a program that echoes the input in NASM.
I've made following files:
my_load2.asm:
%include "testio.inc"
global _start
section .text
_start: mov eax, 0
call canonical_off
call canonical_on
testio.inc:
termios: times 36 db 0
stdin: equ 0
ICANON: equ 1<<1
ECHO: equ 1<<3
canonical_off:
call read_stdin_termios
; clear canonical bit in local mode flags
push rax
mov eax, ICANON
not eax
and [termios+12], eax
pop rax
call write_stdin_termios
ret
echo_off:
call read_stdin_termios
; clear echo bit in local mode flags
push rax
mov eax, ECHO
not eax
and [termios+12], eax
pop rax
call write_stdin_termios
ret
canonical_on:
call read_stdin_termios
; set canonical bit in local mode flags
or dword [termios+12], ICANON
call write_stdin_termios
ret
echo_on:
call read_stdin_termios
; set echo bit in local mode flags
or dword [termios+12], ECHO
call write_stdin_termios
ret
read_stdin_termios:
push rax
push rbx
push rcx
push rdx
mov eax, 36h
mov ebx, stdin
mov ecx, 5401h
mov edx, termios
int 80h
pop rdx
pop rcx
pop rbx
pop rax
ret
write_stdin_termios:
push rax
push rbx
push rcx
push rdx
mov eax, 36h
mov ebx, stdin
mov ecx, 5402h
mov edx, termios
int 80h
pop rdx
pop rcx
pop rbx
pop rax
ret
Then I run:
[root#localhost asm]# nasm -f elf64 my_load2.asm
[root#localhost asm]# ld -m elfx86_64 my_load2.o -o my_load2
When I try to run it i get:
[root#localhost asm]# ./my_load2
Segmentation fault
Debugger says:
(gdb) run
Starting program: /root/asm/my_load2
Program received signal SIGSEGV, Segmentation fault.
0x00000000004000b1 in canonical_off ()
Can someone explain why is it crashing without on "import" step?
Also, I am running RHEL in Virtualbox under Win7 64 bit. Can this cause problems with compilation?
Firstly, let's address the issue of not exiting, as mentioned by Daniel. Let's comment out the two call instructions, so the program essentially does nothing:
%include "testio.inc"
global _start
section .text
_start: mov eax, 0
;call canonical_off
;call canonical_on
When we run this:
$ ./my_load2
Segmentation fault (core dumped)
It still dies! Daniel is right - you need to exit:
%include "testio.inc"
global _start
section .text
_start: mov eax, 0
;call canonical_off
;call canonical_on
mov eax, 1
mov ebx, 0
int 0x80
This time:
$ ./my_load2
$
No segfault. So let's uncomment the calls:
%include "testio.inc"
global _start
section .text
_start: mov eax, 0
call canonical_off
call canonical_on
mov eax, 1
mov ebx, 0
int 0x80
And run it again:
$ ./my_load2
Segmentation fault (core dumped)
We get a segfault again. But at least we can be (fairly) sure that's coming from inside one of the called routines.
Running the executable with strace is also quite informative:
$ strace ./my_load2
execve("./my_load2", ["./my_load2"], [/* 57 vars */]) = 0
setsockopt(0, SOL_IP, 0x400080 /* IP_??? */, NULL, 0) = -1 EFAULT (Bad address)
--- SIGSEGV {si_signo=SIGSEGV, si_code=SEGV_ACCERR, si_addr=0x40008c} ---
+++ killed by SIGSEGV (core dumped) +++
Segmentation fault (core dumped)
The setsockopt line is due to the ioctl request that happens in read_stdin_termios. strace tells us the return value was EFAULT. The setsockopt(2) man page tells us this happens when:
The address pointed to by optval is not in a valid part of the process address space.
Actually this is telling us that the block of memory into which the termios structure is written is read-only. Frank is correct; everything in the program - including the termios space, and all the code - is in the read-only .text section. You can see this with:
$ objdump -h my_load2.o
my_load2.o: file format elf64-x86-64
Sections:
Idx Name Size VMA LMA File off Algn
0 .text 000000cd 0000000000000000 0000000000000000 000001c0 2**4
CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
i.e. there's only one section, .text, and it's READONLY.
The line that actually causes the segfault, however, is this one:
and [termios+12], eax
because it also tries to write to the (read-only) termios memory.
The quickest way to fix this is to put the termios memory in the .data section, and everything else in the .text section:
section .data
termios: times 36 db 0
section .text
stdin: equ 0
ICANON: equ 1<<1
ECHO: equ 1<<3
canonical_off:
call read_stdin_termios
[...]
(stdin, ICANON, and ECHO can be in the read-only .text section, because they're just used as constants - i.e. we don't write to those bits of memory.)
Having made these changes:
$ ./my_load2
$
The program runs and exits normally.
I was experimenting and have the following assembly code, which works very well, except that I get a "Segmentation fault (core dumped)" message right before my program ends:
GLOBAL _start
%define ___STDIN 0
%define ___STDOUT 1
%define ___SYSCALL_WRITE 0x04
segment .data
segment .rodata
L1 db "hello World", 10, 0
segment .bss
segment .text
_start:
mov eax, ___SYSCALL_WRITE
mov ebx, ___STDOUT
mov ecx, L1
mov edx, 13
int 0x80
It doesn't matter whether or not I have ret at the end; I still get the message.
What's the problem?
I'm using x86 and nasm.
You can't ret from start; it isn't a function and there's no return address on the stack. The stack pointer points at argc on process entry.
As n.m. said in the comments, the issue is that you aren't exiting the program, so execution runs off into garbage code and you get a segfault.
What you need is:
;; Linux 32-bit x86
%define ___SYSCALL_EXIT 1
// ... at the end of _start:
mov eax, ___SYSCALL_EXIT
mov ebx, 0
int 0x80
(The above is 32-bit code. In 64-bit code you want mov eax, 231 (exit_group) / syscall, with the exit status in EDI. For example:
;; Linux x86-64
xor edi, edi ; or mov edi, eax if you have a ret val in EAX
mov eax, 231 ; __NR_exit_group
syscall