Windows multicore program worked before but suddenly threads execute randomly - multithreading

Below is a multithreaded (4-core) 64-bit NASM program. It's not minimal, but it is complete; I posted the complete code because a minimal example might not reveal the problem in this code.
This is my first multithreaded multicore program in NASM, and I was happy to see yesterday afternoon that it worked correctly (all four cores) and returned the values I was expecting. I ran it successfully several times yesterday and again several times this morning.
Suddenly, about 10 minutes after my last run, without making any changes to the code and without re-assembling the DLL, I ran it again, and this time the threads execute randomly -- not all threads execute, and the set of threads that execute varies from run to run: sometimes only thread 1; sometimes threads 2-4 but not thread 1; sometimes only threads 2 and 4; etc.
My code did not have ExitThread or CloseHandle calls, so I added them (see label_899:), rebooted, and ran it again. The first run after a fresh reboot still shows random execution of threads.
I thought the problem might be that CreateThread was failing, so I also returned the thread handles. Every time, all four thread handles were created even though not all of the threads executed, so that's not it.
I have done a lot of research but haven't seen this issue discussed anywhere. Even though this is NASM, the same thing could apply to C or C++ multithreading. I use WaitForMultipleObjects to wait for all threads to complete.
This is a dll, and the entry point is Main_Entry_fn. The dll is called from Python using ctypes, but I doubt that makes any difference. I use ctypes frequently and have never had a problem with it.
Thanks for any ideas.
P.S. Ignore the realloc code; the buffers are large enough that realloc is never called.
UPDATE: Here is how I constructed the program flow. The entry point is Main_Entry_fn, toward the bottom of the program listing (it must be exported for a DLL call). Main_Entry_fn takes the input data from the calling function, preserves registers (because the whole program is register-optimized), and allocates output buffers. Main_Entry_fn then calls Init_Cores_fn (top of the program listing), where we create four arrays holding the data for the registers on entry to each core (remember, all vars are stored in registers -- see the call to DupThreadInfo), and create four threads in a loop. The threads all call PrimeNumber_fn, and the threads are set to start immediately upon creation.
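For reference, here is a minimal sketch of the CreateThread / WaitForMultipleObjects pattern under the Win64 calling convention (my illustration only -- Worker, Handles, and Make_Threads are hypothetical names, not part of the program below):
[BITS 64]
[default rel]
extern CreateThread, WaitForMultipleObjects
section .bss
Handles: resq 4
section .text
Worker: ; DWORD WINAPI Worker(LPVOID param)
; lpParameter arrives in rcx; the return value goes in rax
xor eax,eax
ret
Make_Threads:
push rbx ; rbx is nonvolatile under Win64
sub rsp,48 ; 32 bytes shadow space + 16 for the two stack args (keeps rsp 16-aligned)
xor rbx,rbx ; thread index 0..3
next_thread:
xor ecx,ecx ; lpThreadAttributes = NULL
xor edx,edx ; dwStackSize = 0 (default)
lea r8,[Worker] ; lpStartAddress
mov r9,rbx ; lpParameter (a per-thread value, not shared storage)
mov qword [rsp+32],0 ; dwCreationFlags = 0 (start immediately)
mov qword [rsp+40],0 ; lpThreadId = NULL
call CreateThread
lea r10,[Handles]
mov [r10+rbx*8],rax ; save the returned handle
inc rbx
cmp rbx,4
jl next_thread
mov ecx,4 ; nCount
lea rdx,[Handles] ; lpHandles
mov r8d,1 ; bWaitAll = TRUE
mov r9d,-1 ; dwMilliseconds = INFINITE
call WaitForMultipleObjects
add rsp,48
pop rbx
ret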
Register assignments:
r15   list_of_results_ptr (pointer to list_of_results)
r14   list_of_results_ctr (counter for list_of_results)
r13   list_of_results_length
r12   numbers_ptr (pointer to the "numbers" input array)
r11   numbers_ctr
r10   numbers_length
r9    num
r8    i
xmm15 num_float
xmm14 result
; Header Section
[BITS 64]
[default rel]
extern malloc, calloc, realloc, free
global Main_Entry_fn
export Main_Entry_fn
global FreeMem_fn
export FreeMem_fn
extern CreateThread, CloseHandle, ExitThread
extern WaitForMultipleObjects, GetCurrentThread
section .data align=16
Return_Pointer_Array: dq 0, 0, 0
Input_Length_Array: dq 0, 0
list_of_results_ptr: dq 0
list_of_results_ctr: dq 0
list_of_results_length: dq 0
data_master_ptr: dq 0
initial_dynamic_length: dq 0
internal_dynamic_length: dq 5000
XMM_Stack: dq 0, 0, 0
numbers_ptr: dq 0
numbers_length: dq 0
numbers: dq 0
list_of_results: dq 0
num_float: dq 0.0
loop_counter_401: dq 0
numbers_ctr: dq 0
num: dq 0
result: dq 0
const_1: dq 1
i: dq 0
range_loop_start_i: dq 0
range_loop_end_i: dq 0
const_0: dq 0
; New vars for threads:
ThreadCount: times 4 dq 0
ThreadInfo: times 10 dq 0
ThreadInfo2: times 10 dq 0
ThreadInfo3: times 10 dq 0
ThreadInfo4: times 10 dq 0
TestInfo: times 4 dq 0
ThreadHandles: times 4 dq 0
Return_Data_Array: times 8 dq 0 ; ptr,ptr,ptr,ptr,length,length,length,length
StartByte: dq 0
stride: dq 8
output_buffer_pointers: times 4 dq 0 ; for malloc
Division_Size: dq 0
Division_Start: dq 0
section .text
; ______________________________________
Init_Cores_fn:
; Calculate the data divisions
mov rax,r10
mov rbx,4 ;cores
xor rdx,rdx
div rbx
mov [Division_Size],rax
mov [Division_Start],rax
; Populate the ThreadInfo array with vars to pass
; ThreadInfo: length, startbyte, stride, vars into registers on entry to each core
mov rdi,ThreadInfo
mov rax,0 ;ThreadInfoLength
mov [rdi],rax ; length (number of vars into registers plus 3 elements)
mov rax,[stride]
mov [rdi+16],rax ; 8 x number of cores (32 in this example)
; Vars (these registers are populated on main entry)
mov rbp,output_buffer_pointers
mov rax,[rbp]
mov [rdi+24],rax ;for r15
mov [rdi+32],r14
mov [rdi+40],r13
mov [rdi+48],r12
mov rax,[Division_Start]
mov [rdi+56],rax
mov rax,0
mov [rdi+64],rax
call DupThreadInfo
mov rbp,rsp ; preserve caller's stack frame
sub rsp,56 ; Shadow space (was 32)
; _____
label_0:
mov rax,[StartByte]
cmp rax,0
jne sb2
mov rdi,ThreadInfo
jmp sb5
sb2:cmp rax,8
jne sb3
mov rdi,ThreadInfo2
jmp sb5
sb3:cmp rax,16
jne sb4
mov rdi,ThreadInfo3
jmp sb5
sb4:cmp rax,24
jne sb5
mov rdi,ThreadInfo4
sb5:
; _____
; Create Threads
mov rcx,0 ; lpThreadAttributes (Security Attributes)
mov rdx,0 ; dwStackSize
mov r8,PrimeNumber_fn ; lpStartAddress (function pointer)
mov r9,rdi ; lpParameter (array of data passed to each core)
mov rax,0
mov [rsp+32],rax ; use default creation flags
mov rdi,ThreadCount
mov [rsp+40],rdi ; ThreadID
call CreateThread
; Move the handle into ThreadHandles array (returned in rax)
mov rdi,ThreadHandles
mov rcx,[StartByte]
mov [rdi+rcx],rax
mov rax,[StartByte]
add rax,8
mov [StartByte],rax
mov rbx,32 ; Four cores
cmp rax,rbx
jl label_0
; _____
; Wait
mov rcx,4 ; number of handles
mov rdx,ThreadHandles ; pointer to handles array
mov r8,0 ; wait for all threads to complete
mov r9,5000 ; milliseconds to wait
call WaitForMultipleObjects
; _____
mov rsp,rbp
jmp label_900
; ______________________________________
PrimeNumber_fn:
; Populate registers
;(note: the ThreadProc parameter, lpParameter, arrives in rcx; the return value goes in rax)
mov rdi,rcx
mov rax,[rdi]
mov r15,[rdi+24]
mov r13,[rdi+40]
mov r12,[rdi+48]
mov r10,[rdi+56]
xor r11,r11
xor r9,r9
mov r8,[rdi+8] ; start byte
pxor xmm15,xmm15
pxor xmm15,xmm14
pxor xmm15,xmm13
; Get the ThreadID based on startbyte
mov rax,[rdi+72] ; 0, 8, 16, 24 for 4-core
push rax
;______
label_401:
mov rdi,r12 ; Pointer
cmp r8,r10
jge label_899
movsd xmm0,qword[rdi+r8]
movsd xmm15,xmm0
add r8,8
;______
cvttsd2si rax,xmm15
mov r9,rax
;______
label_8010:
label_801:
cmp r9,[const_1]
jle label_401
;______
label_12010:
mov rax,2
sub rax,1
movq xmm13,rax
label_1201:
movq rcx,xmm13
movq xmm1,[const_1]
addsd xmm13,xmm1
movq rcx,xmm13
mov rdx,r9
cmp rcx,rdx
jge label_12020
;______
label_16010:
label_1601:
mov rax,r9
movq rbx,xmm13
xor rdx,rdx
div rbx
mov rax,rdx
mov rdx,0
cmp rax,rdx
jne label_16020
;______
movq xmm14,[const_0]
;______
movq rax,xmm14
cvtsi2sd xmm0,rax
movsd [r15+r14],xmm14
add r14,8
mov rax,r13
cmp r14,rax
jl next_17
;[Irrelevant code omitted]
next_17:
;______
jmp label_401
;______
label_1602:
label_16020:
;______
mov rax,r9
cvtsi2sd xmm14,rax
;______
movq rax,xmm14
cvtsi2sd xmm0,rax
movsd [r15+r14],xmm14
add r14,8
mov rax,r13
cmp r14,rax
jl next_21
;[Irrelevant code omitted]
next_21:
jmp label_1201
;______
label_1202:
label_12020:
;______
movq xmm14,[const_0]
;______
movq rax,xmm14
cvtsi2sd xmm0,rax
movsd [r15+r14],xmm14
add r14,8
mov rax,r13
cmp r14,rax
jl next_25
;[Irrelevant code omitted]
next_25:
jmp label_401
;______
label_899:
pop rax
mov rdi,Return_Data_Array
mov [rdi+rax],r15
mov [rdi+rax+32],r14 ; 32 = four cores
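; Note: ExitThread takes the thread's exit code in rcx and never returns,
; so the GetCurrentThread/CloseHandle sequence below is unreachable.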
call ExitThread
call GetCurrentThread
mov rcx,rax
call CloseHandle
ret
; __________
label_900:
exit_label_for_PrimeNumber_fn:
mov rdi,Return_Data_Array ; Final return to the calling process
mov rax,rdi
ret
;__________
;Free the memory
FreeMem_fn:
sub rsp,40
call free
add rsp,40
ret
; __________
; Main Entry
Main_Entry_fn:
push rdi
push rbp
push rbx
push r15
xor r15,r15
push r14
xor r14,r14
push r13
xor r13,r13
push r12
xor r12,r12
push r11
xor r11,r11
push r10
xor r10,r10
push r9
xor r9,r9
movsd [XMM_Stack+0],xmm15
movsd [XMM_Stack+8],xmm14
movsd [XMM_Stack+16],xmm13
push r8
xor r8,r8
mov [numbers_ptr],rcx
mov [data_master_ptr],rdx
; Now assign lengths
lea rdi,[data_master_ptr]
mov rbp,[rdi]
xor rcx,rcx
movsd xmm0,qword[rbp+rcx]
cvttsd2si rax,xmm0
mov [numbers_length],rax
add rcx,8
; __________
; malloc for dynamic arrays
lea rdi,[data_master_ptr]
mov rbp,[rdi]
movsd xmm0,qword[rbp]
cvttsd2si rax,xmm0
mov r8,rax
mov rdx,10
mul rdx
mov rdx,10000000
cmp rax,rdx
jl malloc_next
mov rax,r8
malloc_next:
mov rax,50000000
mov [initial_dynamic_length],rax
;__________
; Output buffer #1
mov rcx,qword[initial_dynamic_length] ; Initial size
xor rax,rax
sub rsp,40
call malloc
mov rdi,output_buffer_pointers
mov [rdi],rax
add rsp,40
mov rax,qword[initial_dynamic_length]
mov [list_of_results_length],rax
;__________
; Output buffer #2
mov rcx,qword[initial_dynamic_length] ; Initial size
xor rax,rax
sub rsp,40
call malloc
mov rdi,output_buffer_pointers
mov [rdi+8],rax
add rsp,40
mov rax,qword[initial_dynamic_length]
mov [list_of_results_length],rax
;__________
; Output buffer #3
mov rcx,qword[initial_dynamic_length] ; Initial size
xor rax,rax
sub rsp,40
call malloc
mov rdi,output_buffer_pointers
mov [rdi+16],rax
add rsp,40
mov rax,qword[initial_dynamic_length]
mov [list_of_results_length],rax
;__________
; Output buffer #4
mov rcx,qword[initial_dynamic_length] ; Initial size
xor rax,rax
sub rsp,40
call malloc
mov rdi,output_buffer_pointers
mov [rdi+24],rax
add rsp,40
mov rax,qword[initial_dynamic_length]
mov [list_of_results_length],rax
; __________
; Write variables to assigned registers
lea rdi,[rel list_of_results_ptr]
mov r15,qword[rdi]
mov r14,[list_of_results_ctr]
mov r13,[list_of_results_length]
lea rdi,[rel numbers_ptr]
mov r12,qword[rdi]
mov r11,[numbers_ctr]
mov r10,[numbers_length]
mov r9,[num]
movsd xmm15,[num_float]
movsd xmm14,[result]
movsd xmm13,[i]
mov r8,[loop_counter_401]
; __________
call Init_Cores_fn
exit_label_for_Main_Entry_fn:
pop r8
movsd xmm13,[XMM_Stack+0]
movsd xmm14,[XMM_Stack+8]
movsd xmm15,[XMM_Stack+16]
pop r9
pop r10
pop r11
pop r12
pop r13
pop r14
pop r15
pop rbx
pop rbp
pop rdi
ret
;_____________
DupThreadInfo:
mov rdi,ThreadInfo2
; StartByte and EndByte
mov rax,[Division_Start]
mov [rdi+8],rax
add rax,[Division_Size]
mov [Division_Start],rax
sub rax,8
mov [rdi+56],rax
mov rax,[stride]
mov [rdi+16],rax ; 8 x number of cores (32 in this example)
; Vars (these registers are populated on main entry)
mov rbp,output_buffer_pointers
mov rax,[rbp+8]
mov [rdi+24],rax
mov rax,8
mov [rdi+32],rax
mov [rdi+40],r13
mov [rdi+48],r12
; 56 is above
mov rax,8
mov [rdi+64],rax
mov rax,8 ; Thread number based on startbyte, for Return_Data_Array
mov [rdi+72],rax
; _____
mov rdi,ThreadInfo3
; StartByte and EndByte
mov rax,[Division_Start]
mov [rdi+8],rax
mov rbx,[Division_Size]
add rax,rbx
mov [Division_Start],rax
sub rax,8
mov [rdi+56],rax
mov rax,[stride]
mov [rdi+16],rax ; 8 x number of cores (32 in this example)
; Vars (these registers are populated on main entry)
mov rbp,output_buffer_pointers
mov rax,[rbp+16]
mov [rdi+24],rax
mov rax,16
mov [rdi+32],rax
mov [rdi+40],r13
mov [rdi+48],r12
; [rdi+56] is above
mov rax,16
mov [rdi+64],rax
mov rax,16 ; Thread number based on startbyte, for Return_Data_Array
mov [rdi+72],rax
mov rdi,ThreadInfo4
; StartByte and EndByte
mov rax,[Division_Start]
mov [rdi+8],rax
mov rax,[numbers_length] ; final segment goes to the end
mov [rdi+56],rax
mov rax,[stride]
mov [rdi+16],rax ; 8 x number of cores (32 in this example)
; Vars (these registers are populated on main entry)
mov rbp,output_buffer_pointers
mov rax,[rbp+24]
mov [rdi+24],rax
mov rax,24
mov [rdi+32],rax
mov [rdi+40],r13
mov [rdi+48],r12
; [rdi+56] is above
mov rax,24
mov [rdi+64],rax
mov rax,24 ; Thread number based on startbyte, for Return_Data_Array
mov [rdi+72],rax
ret

Related

nasm x86-64, problem with word comparison when exceeding value

I have a counter stored in a word, used for a loop; to end the loop I use:
cmp word [counter], nbIter
jge end
But for some reason it only works for nbIter less than 0x8000. When nbIter exceeds that value, it jumps to the end on the first iteration.
I tried splitting the counter into two bytes and comparing the high byte with 0x80, but got the same result. (I still treated the counter as a word when incrementing it, so perhaps that change made no difference at all.)
Here is my full code:
SECTION .data
message: db 13, '0', '0', '.', '0', '0', '0'
msglen: equ $-message
compteur: dw 0
timeval: ; struct needed to call nanosleep system call
dq 0 ; seconds, dq means "define quadword" = an 8-byte integer
dq 1000000 ; nanoseconds
SECTION .text
GLOBAL _start
_start:
jmp loop
loop:
call resetreg
mov cx, 10 ; divisor
mov rbx, 7
mov ax, [compteur]
offset:
dec rbx
cmp rbx, 3
je offset
xor rdx, rdx
div cx ; quotient in ax, remainder in dx
add dl, '0'
mov [rbx+message], dl
cmp ax, 0
jg offset
; push the registers we might modify
call print
call pause
; pop the registers we pushed, to restore their values
inc word [compteur]
cmp word [compteur], 0x7fff
jge end
jmp loop
end:
call skipline
mov rax, 60 ; system call for exit
mov rdi, 0 ; exit code 0, equiv to xor rdi, rdi
syscall ; invoke operating system to exit
pause:
mov rax, 35 ; syscall nanosleep for x86_64, see man nanosleep
mov rdi, timeval ; pointing to struct encoding duration of sleep
mov rsi, 0 ; null means 2nd parameter not used
syscall
ret
print:
mov rax, 1 ; system call for write
mov rdi, 1 ; file handle 1 is stdout
mov rsi, message ; address of string to output
mov rdx, msglen ; number of bytes
syscall ; invoke operating system to do the write
ret
resetreg:
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
ret
skipline:
mov word [message+1], 0x0a00
mov dword [message+3], 0x00000000
call print
ret
jge is a jump based on a signed comparison; you're probably looking for jae, the unsigned version.
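For illustration (with a hypothetical nbIter of 0x9000), here is why the signed jump exits on the first iteration while the unsigned one keeps looping:
; counter is 0 on the first iteration
cmp word [counter], 0x9000 ; 0x9000 as a signed 16-bit value is -28672
jge end ; signed: 0 >= -28672 is true -> exits immediately
jae end ; unsigned: 0 >= 0x9000 is false -> keeps looping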

lock cmpxchg fails to execute threads in core order

The following 64-bit NASM code uses lock cmpxchg to let each core take a lock in core order, execute some code, and then release it with xchg so the next core can run the same code. Each core's number is stored in rbx -- the four cores are numbered 0, 8, 16 and 24. The variable [spin_lock_core] starts at zero, and when each core finishes it advances the core number by 8 in the final line, xchg [spin_lock_core],rax.
Spin_lock:
xor rax,rax
lock cmpxchg [spin_lock_core],rbx
jnz Spin_lock
; Test
mov rbp,extra_test_array
mov [rbp+rbx],rbx
; Execute some code before looping out
mov rax,1234
mov rdx,23435
add rax,rbx
mov rcx,rax
;jmp label_899
mov rax,rbx
add rax,8
xchg [spin_lock_core],rax
But before the code reaches xchg [spin_lock_core],rax, the first core jumps out of the program (jmp label_899). That should cause the other threads to freeze, because they would be waiting for [spin_lock_core] to be updated, which never happens. Instead, all four cores write to the output array extra_test_array, which is displayed on the terminal when the program exits. In other words, the spinlock fails to stop the cores that run after core zero.
The full, minimal code is below (as minimal as NASM can be in this case). The code is written for a shared object, and it's reproducible if it gets an input array (as written it doesn't matter if the input array is int or float):
; Header Section
[BITS 64]
[default rel]
global Main_Entry_fn
extern pthread_create, pthread_join, pthread_exit, pthread_self, sched_getcpu
global FreeMem_fn
extern malloc, realloc, free
extern sprintf
section .data align=16
X_ctr: dq 0
data_master_ptr: dq 0
initial_dynamic_length: dq 0
XMM_Stack: dq 0, 0, 0, 0, 0, 0, 0
ThreadID: dq 0
X_ptr: dq 0
X_length: dq 0
X: dq 0
collect_ptr: dq 0
collect_length: dq 0
collect_ctr: dq 0
even_squares_list_ptrs: dq 0, 0, 0, 0
even_squares_list_ctr: dq 0
even_squares_list_length: dq 0
Number_Of_Cores: dq 32
pthread_attr_t: dq 0
pthread_arg: dq 0
Join_Ret_Val: dq 0
tcounter: dq 0
sched_getcpu_array: times 4 dq 0
ThreadIDLocked: dq 0
spin_lock_core: dq 0
extra_test_array: dq 0
; __________
section .text
Init_Cores_fn:
; _____
; Create Threads
label_0:
mov rdi,ThreadID ; ThreadCount
mov rsi,pthread_attr_t ; Thread Attributes
mov rdx,Test_fn ; Function Pointer
mov rcx,pthread_arg
call pthread_create wrt ..plt
mov rdi,[ThreadID] ; id to wait on
mov rsi,Join_Ret_Val ; return value
call pthread_join wrt ..plt
mov rax,[tcounter]
add rax,8
mov [tcounter],rax
mov rbx,[Number_Of_Cores]
cmp rax,rbx
jl label_0
; _____
jmp label_900 ; All threads return here, and exit
; ______________________________________
Test_fn:
; Get the core number
call sched_getcpu wrt ..plt
mov rbx,8 ; multiply by 8
mul rbx
push rax
pop rax
mov rbx,rax
push rax
Spin_lock:
lock cmpxchg [spin_lock_core],rbx
jnz Spin_lock
; Test
mov rbp,extra_test_array
mov [rbp+rbx],rbx
; Execute some code before looping out
mov rax,1234
mov rdx,23435
add rax,rbx
mov rcx,rax
jmp label_899
mov rax,rbx
add rax,8
xchg [spin_lock_core],rax
;__________
label_899:
pop rax
ret
; __________
label_900:
mov rdi,extra_test_array ;audit_array
mov rax,rdi
ret
;__________
;Free the memory
FreeMem_fn:
;The pointer is passed back in rcx (of course)
sub rsp,40
call free wrt ..plt
add rsp,40
ret
; __________
; Main Entry
Main_Entry_fn:
push rdi
push rbp
push rbx
push r15
xor r15,r15
push r14
xor r14,r14
push r13
xor r13,r13
push r12
xor r12,r12
push r11
xor r11,r11
push r10
xor r10,r10
push r9
xor r9,r9
push r8
xor r8,r8
movsd [XMM_Stack+0],xmm13
movsd [XMM_Stack+8],xmm12
movsd [XMM_Stack+16],xmm11
movsd [XMM_Stack+24],xmm15
movsd [XMM_Stack+32],xmm14
movsd [XMM_Stack+40],xmm10
mov [X_ptr],rdi
mov [data_master_ptr],rsi
; Now assign lengths
lea rdi,[data_master_ptr]
mov rbp,[rdi]
xor rcx,rcx
movsd xmm0,qword[rbp+rcx]
cvttsd2si rax,xmm0
mov [X_length],rax
add rcx,8
; __________
; Write variables to assigned registers
mov r15,0
lea rdi,[rel collect_ptr]
mov r14,qword[rdi]
mov r13,[collect_ctr]
mov r12,[collect_length]
lea rdi,[rel X_ptr]
mov r11,qword[rdi]
mov r10,[X_length]
; __________
call Init_Cores_fn
movsd xmm10,[XMM_Stack+0]
movsd xmm14,[XMM_Stack+8]
movsd xmm15,[XMM_Stack+16]
movsd xmm11,[XMM_Stack+24]
movsd xmm12,[XMM_Stack+32]
movsd xmm13,[XMM_Stack+40]
pop r8
pop r9
pop r10
pop r11
pop r12
pop r13
pop r14
pop r15
pop rbx
pop rbp
pop rdi
ret
The instruction "lock cmpxchg" should fail until the [spin_lock_core] variable is updated, but it doesn't do that.
Thanks for any help in understanding why lock cmpxchg doesn't prevent the cores after core zero from firing in this area of code.
UPDATE: other research shows that xor rax,rax is needed at the top of the Spin_lock: section. When I insert that line, it reads like this:
Spin_lock:
xor rax,rax
lock cmpxchg [spin_lock_core],rbx
jnz Spin_lock
With that change it freezes, as expected. But when I remove the line jmp label_899 it still freezes, but it shouldn't do that.
EDIT 122219:
Based on the comments on this question yesterday, I revised the spinlock code to (1) eliminate atomic operations in favor of faster mov and cmp instructions, (2) assign a unique memory location to each core, and (3) separate the memory locations by more than 256 bytes so that no two of them share a cache line.
Each core's memory location will be changed to 1 when the previous core is finished. When each core finishes, it sets its own memory location back to 0.
The code successfully executes core 0 IF I have all other cores loop out before the spinlock. When I let all four cores run through the spinlock, the program again hangs.
I've verified that each separate memory location is set to 1 when the previous core is finished.
Here's the updated spinlock section:
section .data
spin_lock_core: times 140 dq 0
spin_lock_core_offsets: dq 0,264,528,792
section .text
; Calculate the offset to spin_lock_core
mov rbp,spin_lock_core
mov rdi,spin_lock_core_offsets
mov rax,[rdi+rbx]
add rbp,rax
; ________
Spin_lock:
pause
cmp byte[rbp],1
jnz Spin_lock
xor rax,rax
mov [rbp],rax ; Set current memory location to zero
; Execute some code before looping out
mov rax,1234
mov rdx,23435
add rax,rdx
mov rcx,rax
; Loop out if this is the last core
mov rax,rbx
add rax,8
cmp rax,[Number_Of_Cores]
jge label_899
; Set next core to 1 by adding 264 to the base address
add rbp,264
mov rax,1
mov [rbp],rax
Why does this code still hang?
I don't think you should use cmpxchg for this at all. Try this:
Spin_lock:
pause
cmp [spin_lock_core],rbx
jnz Spin_lock
; Test
mov rbp,extra_test_array
mov [rbp+rbx],rbx
; Execute some code before looping out
mov rax,1234
mov rdx,23435
add rax,rbx
mov rcx,rax
;jmp label_899
lea rax,[rbx+8]
mov [spin_lock_core],rax
I solved this spinlock problem, but after Peter Cordes' comment below I see that it is not correct. I won't delete this answer because I hope it can lead to the solution.
I use lock cmpxchg [rbp+rbx],rbx, which assembles without error. I originally expected NASM to reject any source register other than rax with an "invalid combination of operands" error, but that was a misreading of the manual: the format is CMPXCHG r/m64,r64 (see https://www.felixcloutier.com/x86/cmpxchg), so the explicit source operand can be any 64-bit register. rax is not the source; it is an implicit operand that the instruction compares against the destination.
Without the "mov rax,rbx" line it works because on the first iteration the rax register is set to 0 which matches the memory location. On the second iteration it succeeds by default.
When I add "mov rax,rbx" -- which resets rax -- the program once again hangs. I would really appreciate any ideas on why this program should hang as written.
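For reference, this is the operation lock cmpxchg [mem],reg performs, with rax as the implicit comparand (the retry loop below is my generic sketch, not code from this thread):
; if (rax == [mem]) { ZF = 1; [mem] = reg; } ; the exchange happens
; else { ZF = 0; rax = [mem]; } ; rax is overwritten
; Because a failed attempt clobbers rax, a retry loop must reload the
; expected value into rax before every attempt:
Spin_retry:
mov rax,rbx ; expected value, reloaded on every iteration
lock cmpxchg [rbp+rbx],rbx ; succeeds only when the slot already holds rbx
jnz Spin_retry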
At the start of this block rbx is the core number:
section .data
spin_lock_core: times 4 dq 0
section .text
[ Code leading up to this spinlock section shown above ]
mov rbp,spin_lock_core
Spin_lock:
pause
mov rax,rbx
lock cmpxchg [rbp+rbx],rax
jnz Spin_lock
mov rax,rbx
add rax,8
cmp rax,[Number_Of_Cores]
jge spin_lock_out
xchg [rbp+rax],rax
spin_lock_out:
The differences from my original post are:
Each core spins on (and reads from) its own unique memory location.
I use the "pause" instruction on the spinlock.
Each unique memory location is updated in core order.
But it does not work when I include mov rax,rbx. Intuitively that should work, so I will really appreciate any ideas on why it doesn't in this case.

Itoa assembly implementation, div operation causes segfault? [duplicate]

This question already has answers here:
8086 assembly on DOSBox: Bug with idiv instruction?
(1 answer)
Why should EDX be 0 before using the DIV instruction?
(2 answers)
Closed 4 years ago.
So I'm trying to implement itoa, which converts an int into a string.
So far, the implementation works if I don't loop in the .loop section and stick to small numbers. As soon as it loops, my program segfaults.
Here is the code:
section .data
buffer times 11 db 0
section .text
global ft_itoa
extern ft_strrevd
extern malloc
ft_itoa:
mov rcx, 1 ;initialize our counter at 1 for the terminating null byte
mov rax, rdi ;move number in RAX for DIV instruction
push rbx ;save RBX
mov bl, 10
.check_negative:
and edi, 0xf0000000
mov rdi, buffer
jz .loop ;number is positive, proceed to main loop
not rax ;else
inc rax ;compute absolute value with binary complement
mov r9, 1 ;set neg flag
.loop:
cmp rax, 0
jz .check_neg_flag
div bl
add ah, 48 ;convert int to char
mov byte[rdi + rcx - 1], ah ;copy char in buffer
sub ah, 48
inc rcx
jmp .loop ;commenting this line prevents crash
.check_neg_flag:
cmp r9, 1
jne .dup
mov byte[rdi + rcx - 1], '-'
inc rcx
.dup:
mov byte[rdi + rcx - 1], 0
call ft_strrevd ;copy buffer string in memory and return pointer
.end:
pop rbx ;restore RBX
ret
It's most likely caused by the div, but I'm having trouble understanding how it works.
If anyone could point me towards a solution, it would be highly appreciated.
So, to fix this I had to use div ebx instead of div bl, and xor edx, edx before each div. div bl divides AX by BL, leaving the quotient in AL and the remainder in AH, so as soon as the quotient exceeds 255 the CPU raises a divide error (#DE); div ebx divides EDX:EAX by EBX, which is why EDX must be zeroed before each division.
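To make the overflow concrete (hypothetical values): div bl divides the 16-bit AX by BL, and the quotient must fit in AL:
mov ax, 2560 ; dividend
mov bl, 10
div bl ; quotient is 256, which does not fit in AL -> #DE (raised as SIGFPE on Linux)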
Here is a working version:
section .data
buffer times 11 db 0
nega db "neg",0
posi db "pos",0
section .text
global ft_itoa
extern ft_strdup
extern malloc
ft_itoa:
xor rcx, rcx ;initialize counter
xor r9, r9 ;set neg flag to 0
mov eax, edi ;move number in RAX for DIV instruction
push rbx ;save RBX
mov ebx, 10
.check_negative:
and edi, 0x80000000
mov rdi, buffer
jz .divide ;number is positive, proceed to main loop
not eax ;else
inc eax ;compute absolute value with binary complement
inc r9 ;set neg flag
.divide:
xor edx, edx
div ebx
add edx, 48 ;convert int to char
push rdx
inc rcx
cmp eax, 0
jnz .divide
.check_neg_flag:
cmp r9, 1
jne .buff_string
mov byte[rdi], '-'
.buff_string:
pop rdx
mov byte[rdi + r9], dl
dec rcx
inc r9
cmp rcx, 0
jnz .buff_string
.dup:
mov byte[rdi + r9], 0
call ft_strdup ;copy buffer string in memory and return pointer
pop rbx ;restore RBX
ret

Printing binary string in assembly

I'm writing a program to print the binary string of a hardcoded word. Here is how it currently looks:
main.asm
section .text
global _start
extern _print_binary_content
_start:
push word [word_to_print] ; pushing word. Can we push just one byte?
call _print_binary_content
mov rax, 60
mov rdi, 0
syscall
section .data
word_to_print: dw 0xAB0F
printer.asm
SYS_BRK_NUM equ 0x0C
BITS_IN_WORD equ 0x10
SYS_WRITE_NUM equ 0x01
STD_OUT_FD equ 0x01
FIRST_BIT_BIT_MASK equ 0x01
ASCII_NUMBER_OFFSET equ 0x30
section .text
global _print_binary_content
_print_binary_content:
pop rbp
xor ecx, ecx ;zeroing rcx
xor ebx, ebx ;zeroing rbx
pop bx ;the word to print the binary content of
;sys_brk for current location
mov rax, SYS_BRK_NUM
mov rdi, 0
syscall
;end sys_brk
mov r12, rax ;save the current brake location
;sys_brk for memory allocation 16 bytes
lea rdi, [rax + BITS_IN_WORD]
mov rax, SYS_BRK_NUM
syscall
;end sys_brk
xor ecx, ecx
mov cl, byte BITS_IN_WORD - 1; used as a counter in the loop below
loop:
mov dx, bx
and dx, FIRST_BIT_BIT_MASK
add dx, ASCII_NUMBER_OFFSET
mov [r12 + rcx], dl
shr bx, 0x01
dec cl
cmp cl, 0
jge loop
mov rsi, r12
mov rax, SYS_WRITE_NUM
mov rdi, STD_OUT_FD
mov rdx, BITS_IN_WORD
syscall
push rbp ; pushing return address back
ret
If I compile, link, and run this program, it works. But my question is about performance and about the conventions of writing assembly programs. In printer.asm I zero ecx twice, which looks suboptimal, and maybe some registers are not used for their conventional purpose (I used the Intel manual).
Can you please help me improve this very simple program?
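One possible cleanup (a sketch of my own, not from the original thread, assuming a Linux System V AMD64 target): pass the word in a register instead of on the stack, which removes the return-address juggling, and build the digits in a static buffer instead of allocating with brk. The names outbuf and print_binary_content are illustrative:
section .bss
outbuf: resb 16
section .text
print_binary_content: ; edi = 16-bit value to print
mov ecx, 15 ; fill the buffer from its last byte
next_bit:
mov eax, edi
and eax, 1 ; isolate the low bit
add eax, '0' ; convert 0/1 to ASCII
mov [outbuf+rcx], al
shr edi, 1
dec ecx
jns next_bit ; loop while the index is >= 0
mov eax, 1 ; sys_write
mov edi, 1 ; stdout
lea rsi, [outbuf]
mov edx, 16
syscall
ret
The caller would then do movzx edi, word [word_to_print] followed by call print_binary_content.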

Printing a number in x86-64 assembly

Okay, all I am trying to do is print a number (up to 18446744073709551615) in x86-64 assembly for Linux. Can anyone please tell me why this program will not work? All that happens is that it runs and exits. Thank you for all the help you can give!
GLOBAL _start
SECTION .text
;PRINTCHAR
; MOV [LETTER],RAX
;
; MOV RAX,1
; MOV RDI,1
; MOV RSI,LETTER
; MOV RDX,1
; SYSCALL
; RET
PRINTDEC:
MOV R9,18 ;SO IT CAN POINT TO THE END OF THE BUFFER
MOV R10,0
START:
MOV R8,NUMBER
MOV RDX,0 ;CLEAR OUT RDX TO AVOID ERRORS
MOV RBX,10 ;WHAT TO DIVIDE BY
DIV RBX ;DIVIDE OUR NUMBER BY TEN
CMP RAX,0 ;IF OUR QUOTIENT IS ZERO THEN WE ARE DONE, PRINT THE BUFFER
JE END
JMP ADDBUF
ADDBUF:
ADD R8,R9 ;MOV TO THE CURRENT LOCATION IN OUR BUFFER
ADD RDX,0x30
; ADD R8,R10
MOV [R8],RDX ;MOV THE LAST NUMBER IN OUR BUFFER TO RDX
DEC R9
INC R10
JMP START
END:
ADD R8,R9 ;add the very last digit
MOV [R8],RDX
INC R10
MOV RAX,1
MOV RDI,1
MOV RSI,R8
MOV RDX,R10
SYSCALL
RET
_start:
MOV RAX,55
CALL PRINTDEC
MOV RAX,60
MOV RDI,0
SYSCALL
SECTION .bss
LETTER: RESB 1
NUMBER: RESB 19
Here is a corrected version:
PRINTDEC:
LEA R9, [NUMBER + 18] ; last character of buffer
MOV R10, R9 ; copy the last character address
MOV RBX, 10 ; base10 divisor
DIV_BY_10:
XOR RDX, RDX ; zero rdx for div
DIV RBX ; rax = rdx:rax / rbx, remainder in rdx
ADD RDX, 0x30 ; convert binary digit to ascii
TEST RAX,RAX ; if rax == 0 exit DIV_BY_10
JZ LAST_REMAINDER
MOV byte [R9], DL ; save remainder
SUB R9, 1 ; decrement the buffer address
JMP DIV_BY_10
LAST_REMAINDER:
TEST DL, DL ; if DL (last remainder) != 0 add it to the buffer
JZ CHECK_BUFFER
MOV byte [R9], DL ; save remainder
SUB R9, 1 ; decrement the buffer address
CHECK_BUFFER:
CMP R9, R10 ; if the buffer has data print it
JNE PRINT_BUFFER
MOV byte [R9], '0' ; place the default zero into the empty buffer
SUB R9, 1
PRINT_BUFFER:
ADD R9, 1 ; address of last digit saved to buffer
SUB R10, R9 ; end address minus start address
ADD R10, 1 ; R10 = length of number
MOV RAX, 1 ; NR_write
MOV RDI, 1 ; stdout
MOV RSI, R9 ; number buffer address
MOV RDX, R10 ; string length
SYSCALL
RET
