lock cmpxchg fails to execute threads in core order

lock cmpxchg fails to execute threads in core order - multithreading

The following 64-bit NASM code uses lock cmpxchg to take each core in core order, execute some code, then reset the core number variable using xchg so the next core can execute the code. The core number for each core is stored in rbx -- the four cores are numbered 0, 8, 16 and 24. The variable [spin_lock_core] starts at zero and when each core is finished it updates the core number by 8 at the final line xchg [spin_lock_core],rax.
; Ordered hand-off between threads: a thread continues only when the compare
; inside cmpxchg succeeds; it then does its work and stores rbx+8.
; rbx = this thread's core number times 8 (0, 8, 16 or 24).
Spin_lock:
xor rax,rax ; cmpxchg compares [spin_lock_core] with rax, so rax is the expected value (0 here)
lock cmpxchg [spin_lock_core],rbx ; equal: store rbx, ZF=1; not equal: load the current value into rax, ZF=0
jnz Spin_lock ; ZF=0 means the compare failed, so retry
; NOTE(review): rax is reset to 0 on every pass while the release below stores
; rbx+8; once a non-zero value has been stored, a compare against 0 cannot
; succeed again -- confirm whether the expected value should be rbx.
; Test
mov rbp,extra_test_array
mov [rbp+rbx],rbx ; record this core's number in its own slot
; Execute some code before looping out
mov rax,1234
mov rdx,23435
add rax,rbx
mov rcx,rax
;jmp label_899
; Release: publish rbx+8 for the next thread
mov rax,rbx
add rax,8
xchg [spin_lock_core],rax ; xchg with a memory operand is always locked
But before the code reaches xchg [spin_lock_core],rax the first core loops out of the program (jmp label_899), which should cause the other threads to freeze because they would be waiting for the [spin_lock_core] var to be updated, which never happens. But instead all four cores are written to the output array extra_test_array, which is displayed on the terminal when the program exits. In other words, this fails to stop the cores until the core number is updated.
The full, minimal code is below (as minimal as NASM can be in this case). The code is written for a shared object, and it's reproducible if it gets an input array (as written it doesn't matter if the input array is int or float):
; Header Section
[BITS 64]
[default rel]
global Main_Entry_fn
extern pthread_create, pthread_join, pthread_exit, pthread_self, sched_getcpu
global FreeMem_fn
extern malloc, realloc, free
extern sprintf
; Static data for the thread test.
; Fix: extra_test_array now has four qword slots. Test_fn stores rbx at
; [extra_test_array + rbx] with rbx = 0, 8, 16 or 24, which overran the
; single qword that was reserved before.
section .data align=16
X_ctr: dq 0
data_master_ptr: dq 0                   ; second argument of Main_Entry_fn
initial_dynamic_length: dq 0
XMM_Stack: dq 0, 0, 0, 0, 0, 0, 0       ; save area for the low halves of xmm10-xmm15
ThreadID: dq 0                          ; written by pthread_create, read for pthread_join
X_ptr: dq 0                             ; first argument of Main_Entry_fn
X_length: dq 0
X: dq 0
collect_ptr: dq 0
collect_length: dq 0
collect_ctr: dq 0
even_squares_list_ptrs: dq 0, 0, 0, 0
even_squares_list_ctr: dq 0
even_squares_list_length: dq 0
Number_Of_Cores: dq 32                  ; loop limit for tcounter, which advances by 8 per thread
pthread_attr_t: dq 0
pthread_arg: dq 0
Join_Ret_Val: dq 0                      ; receives the joined thread's return value
tcounter: dq 0                          ; bytes of "core index" handed out so far
sched_getcpu_array: times 4 dq 0
ThreadIDLocked: dq 0
spin_lock_core: dq 0                    ; hand-off variable used by Spin_lock
extra_test_array: times 4 dq 0          ; one qword slot per core offset (0, 8, 16, 24)
; __________
section .text
; Init_Cores_fn: creates one thread per pass of the label_0 loop, joins it,
; and advances [tcounter] by 8 until it reaches [Number_Of_Cores]; then leaves
; through label_900, which returns to Init_Cores_fn's caller.
Init_Cores_fn:
; _____
; Create Threads
label_0:
mov rdi,ThreadID ; arg 1: where pthread_create stores the new thread's id
mov rsi,pthread_attr_t ; arg 2: thread attributes
; NOTE(review): pthread_attr_t is a single zeroed qword, not an initialised attribute object; NULL is the usual way to ask for defaults -- confirm
mov rdx,Test_fn ; arg 3: start routine
mov rcx,pthread_arg ; arg 4: argument handed to Test_fn
call pthread_create wrt ..plt
mov rdi,[ThreadID] ; id to wait on
mov rsi,Join_Ret_Val ; where the thread's return value is stored
call pthread_join wrt ..plt ; blocks until the thread just created has finished, so threads run one after another
mov rax,[tcounter]
add rax,8
mov [tcounter],rax
mov rbx,[Number_Of_Cores]
cmp rax,rbx
jl label_0 ; repeat while tcounter is below Number_Of_Cores (signed compare)
; _____
jmp label_900 ; creation loop finished: label_900 returns extra_test_array to the caller
; ______________________________________
; Test_fn: thread start routine. Computes rbx = 8 * sched_getcpu(), passes
; through Spin_lock, stores rbx into extra_test_array at offset rbx and
; returns through label_899.
; NOTE(review): rbx and rbp are overwritten without being saved -- confirm this is acceptable for a start routine.
Test_fn:
; Get the core number
call sched_getcpu wrt ..plt
mov rbx,8 ; multiply by 8
mul rbx ; rdx:rax = rax * 8 (rdx is overwritten as well)
push rax
pop rax ; this push/pop pair leaves rax and rsp unchanged
mov rbx,rax ; rbx = core number * 8
push rax ; balanced by the pop at label_899
Spin_lock:
; NOTE(review): rax is not reset before cmpxchg. After a failed compare cmpxchg
; has loaded [spin_lock_core] into rax, so the retry compares the variable with
; its own value and succeeds -- this would explain why no thread is held back.
lock cmpxchg [spin_lock_core],rbx
jnz Spin_lock
; Test
mov rbp,extra_test_array
mov [rbp+rbx],rbx ; record this core's number in its own slot
; Execute some code before looping out
mov rax,1234
mov rdx,23435
add rax,rbx
mov rcx,rax
jmp label_899 ; skips the release, so the next three instructions never execute
mov rax,rbx
add rax,8
xchg [spin_lock_core],rax
;__________
label_899:
pop rax ; rax = core number * 8, left in rax as the routine's return value
ret
; __________
; label_900: exit path of Init_Cores_fn; returns the address of extra_test_array in rax.
label_900:
mov rdi,extra_test_array ;audit_array
mov rax,rdi ; return value = address of extra_test_array
ret
;__________
;Free the memory
; FreeMem_fn: exported wrapper that forwards its first argument to free().
FreeMem_fn:
; NOTE(review): an earlier comment said the pointer arrives in rcx; no register is set up here, so free receives whatever the caller placed in the first-argument register (rdi in the rest of this listing) -- confirm
sub rsp,40 ; 40 plus the 8-byte return address keeps rsp 16-byte aligned at the call
call free wrt ..plt
add rsp,40
ret
; __________
; Main Entry
;-----------------------------------------------------------------------
; Main_Entry_fn -- exported entry point.
; In:   rdi = pointer stored in [X_ptr]
;       rsi = pointer stored in [data_master_ptr]; its first qword is a
;             double that is truncated to an integer and kept in [X_length]
; Out:  rax = value returned by Init_Cores_fn
; Fix:  each XMM register is now reloaded from the XMM_Stack slot it was
;       saved to; previously the six restores read slots that belonged to
;       other registers, so the caller's values were shuffled.
; Note: movsd saves and restores only the low 64 bits of each register.
;-----------------------------------------------------------------------
Main_Entry_fn:
    ; Save general-purpose registers (11 pushes leave rsp 16-byte aligned
    ; for the call below) and clear the ones used as working registers.
    push rdi
    push rbp
    push rbx
    push r15
    xor r15,r15
    push r14
    xor r14,r14
    push r13
    xor r13,r13
    push r12
    xor r12,r12
    push r11
    xor r11,r11
    push r10
    xor r10,r10
    push r9
    xor r9,r9
    push r8
    xor r8,r8
    ; Save the low halves of the XMM registers: slot n holds exactly one register.
    movsd [XMM_Stack+0],xmm13
    movsd [XMM_Stack+8],xmm12
    movsd [XMM_Stack+16],xmm11
    movsd [XMM_Stack+24],xmm15
    movsd [XMM_Stack+32],xmm14
    movsd [XMM_Stack+40],xmm10
    mov [X_ptr],rdi
    mov [data_master_ptr],rsi
    ; Now assign lengths
    lea rdi,[data_master_ptr]
    mov rbp,[rdi]
    xor rcx,rcx
    movsd xmm0,qword[rbp+rcx]           ; first entry of the master array, as a double
    cvttsd2si rax,xmm0                  ; truncate to integer
    mov [X_length],rax
    add rcx,8
    ; __________
    ; Write variables to assigned registers
    mov r15,0
    lea rdi,[rel collect_ptr]
    mov r14,qword[rdi]
    mov r13,[collect_ctr]
    mov r12,[collect_length]
    lea rdi,[rel X_ptr]
    mov r11,qword[rdi]
    mov r10,[X_length]
    ; __________
    call Init_Cores_fn                  ; result stays in rax for our caller
    ; Restore every XMM register from the slot it was saved to.
    movsd xmm13,[XMM_Stack+0]
    movsd xmm12,[XMM_Stack+8]
    movsd xmm11,[XMM_Stack+16]
    movsd xmm15,[XMM_Stack+24]
    movsd xmm14,[XMM_Stack+32]
    movsd xmm10,[XMM_Stack+40]
    ; Pop in the reverse order of the pushes above.
    pop r8
    pop r9
    pop r10
    pop r11
    pop r12
    pop r13
    pop r14
    pop r15
    pop rbx
    pop rbp
    pop rdi
    ret
The instruction "lock cmpxchg" should fail until the [spin_lock_core] variable is updated, but it doesn't do that.
Thanks for any help in understanding why lock cmpxchg doesn't prevent the cores after core zero from firing in this area of code.
UPDATE: other research shows that xor rax,rax is needed at the top of the Spin_lock: section. When I insert that line, it reads like this:
Spin_lock:
xor rax,rax ; expected value for cmpxchg: the compare succeeds only while [spin_lock_core] is 0
lock cmpxchg [spin_lock_core],rbx ; on success stores rbx and sets ZF; on failure loads the current value into rax
jnz Spin_lock ; compare failed: retry
With that change it freezes, as expected. But when I remove the line jmp label_899 it still freezes, but it shouldn't do that.
EDIT 122219:
Based on the comments on this question yesterday, I revised the spinlock code to (1) eliminate atomic operations in favor of faster mov and cmp instructions, (2) assign a unique memory location to each core, and (3) separate the memory locations by > 256 bytes to avoid memory on the same cache line.
Each core's memory location will be changed to 1 when the previous core is finished. When each core finishes, it sets its own memory location back to 0.
The code successfully executes core 0 IF I have all other cores loop out before the spinlock. When I let all four cores run through the spinlock, the program again hangs.
I've verified that each separate memory location is set to 1 when the previous core is finished.
Here's the updated spinlock section:
; Revised spinlock: each core polls its own flag byte and, when done, sets the
; flag located 264 bytes further on. rbx = core number * 8.
section .data
spin_lock_core: times 140 dq 0 ; 1120 zeroed bytes holding the per-core flags
spin_lock_core_offsets: dq 0,264,528,792 ; byte offset of each core's flag, indexed by rbx
section .text
; Calculate the offset to spin_lock_core
mov rbp,spin_lock_core
mov rdi,spin_lock_core_offsets
mov rax,[rdi+rbx] ; rax = this core's flag offset
add rbp,rax ; rbp = address of this core's flag
; ________
Spin_lock:
pause
cmp byte[rbp],1 ; wait until this core's flag reads 1
jnz Spin_lock
; NOTE(review): every flag starts at 0 and nothing in this fragment sets the
; first core's flag to 1, so the first core would wait here unless code that is
; not shown sets it -- confirm.
xor rax,rax
mov [rbp],rax ; Set current memory location to zero
; Execute some code before looping out
mov rax,1234
mov rdx,23435
add rax,rdx
mov rcx,rax
; Loop out if this is the last core
mov rax,rbx
add rax,8
cmp rax,[Number_Of_Cores]
jge label_899
; Set next core to 1 by adding 264 to the base address
add rbp,264 ; same spacing as the entries of spin_lock_core_offsets
mov rax,1
mov [rbp],rax
Why does this code still hang?

I don't think you should use cmpxchg for this at all. Try this:
; Suggested replacement: poll with a plain compare until [spin_lock_core]
; equals this core's number (rbx), then hand over by storing rbx+8.
Spin_lock:
pause
cmp [spin_lock_core],rbx ; is it this core's turn?
jnz Spin_lock
; Test
mov rbp,extra_test_array
mov [rbp+rbx],rbx ; record this core's number in its own slot
; Execute some code before looping out
mov rax,1234
mov rdx,23435
add rax,rbx
mov rcx,rax
;jmp label_899
lea rax,[rbx+8] ; next core's number
mov [spin_lock_core],rax ; plain store releases the next core

I solved this spinlock problem, but after Peter Cordes' comment below I see that it is not correct. I won't delete this answer because I hope it can lead to the solution.
I use lock cmpxchg [rbp+rbx],rbx, which assembles without error, but the NASM assembler should return a "invalid combination of operands" error because the source operand can only be rax, so it shouldn't assemble with any other register. I also note that the online resources (for example, https://www.felixcloutier.com/x86/cmpxchg) show the format as CMPXCHG r/m64,r64, but the source operand can't be any r64 -- it must be rax, as that entry goes on to say.
Without the "mov rax,rbx" line it works because on the first iteration the rax register is set to 0 which matches the memory location. On the second iteration it succeeds by default.
When I add "mov rax,rbx" -- which resets rax -- the program once again hangs. I would really appreciate any ideas on why this program should hang as written.
At the start of this block rbx is the core number:
; Variant with one qword slot per core: the core with number rbx waits until
; its slot [spin_lock_core + rbx] holds rbx, then writes rbx+8 into the slot
; at offset rbx+8.
section .data
spin_lock_core: times 4 dq 0
section .text
[ Code leading up to this spinlock section shown above ]
mov rbp,spin_lock_core
Spin_lock:
pause
mov rax,rbx ; expected value = this core's number
lock cmpxchg [rbp+rbx],rax ; succeeds only when the slot already equals rbx (it then stores the same value back)
jnz Spin_lock
; NOTE(review): the slots start at 0, so only rbx = 0 can pass before another
; thread has written to this core's slot; the waiting threads must therefore
; run concurrently with the releasing one -- confirm against how the threads
; are created and joined.
mov rax,rbx
add rax,8
cmp rax,[Number_Of_Cores]
jge spin_lock_out ; last core: nothing to release
xchg [rbp+rax],rax ; store rbx+8 into the next core's slot
spin_lock_out:
The differences from my original post are:
Each core spins on (and reads from) its own unique memory location.
I use the "pause" instruction on the spinlock.
Each unique memory location is updated in core order.
But it does not work when I include mov rax,rbx. Intuitively that should work, so I will really appreciate any ideas on why it doesn't in this case.

Related

gdb: set scheduler_locking on always causes deadlock

I am using gdb to debug a shared object loaded by a C program using dlopen() and dlsym(). The shared object is written in NASM, and it's compiled with DWARF debug symbols. I'm running on Ubuntu 18.04.
I want each thread to stop completely after it has been created so I can have all four threads created before I continue. So I start the debug session with "set scheduler-locking on" or "set scheduler-locking step."
As I understand the gdb command "set scheduler-locking on," it should allow me to create a thread, then switch back to thread 1 (the main thread) and create another thread, and so on until all threads have been created.
In the NASM code, I set a breakpoint at the label Test_fn: (see the code below). When a thread reaches that breakpoint, it stops (which is the first breakpoint after thread creation). Then I switch back to thread 1 (the main thread) and "continue" to instantiate the next thread (the main thread is still in the label_0 section of Init_Cores). Thread 1 will execute code (if I single-step through it), but at some point before the next thread is created it deadlocks, and it always deadlocks.
I have also tried single-stepping through thread creation (avoiding "continue"), but it still deadlocks.
According to the answer from Employed Russian at How to continue one thread at a time when debugging a multithreaded program in GDB?, that can happen but he doesn't say that it will always happen. Whether I use "set scheduler-locking on" or "set scheduler-locking step" I get a deadlock.
Maybe this is because it's a shared object, but everything else works with the shared object, so I don't think that's the problem.
Here is the NASM code. The same question would apply to C or C++ code, specifically why gdb deadlocks when I reach a breakpoint on a new thread, then switch back to thread 1 to continue creating threads. Thread 2 (the first created thread) should stop when scheduler-locking is on.
; Header Section
[BITS 64]
[default rel]
global Main_Entry_fn
extern pthread_create, pthread_join, pthread_exit, pthread_self, sched_getcpu
global FreeMem_fn
extern malloc, realloc, free
extern sprintf
; Static data for the gdb scheduler-locking test program.
section .data align=16
X_ctr: dq 0
data_master_ptr: dq 0 ; second argument of Main_Entry_fn
initial_dynamic_length: dq 0
XMM_Stack: dq 0, 0, 0, 0, 0, 0, 0 ; save area used by Main_Entry_fn for xmm10-xmm15
ThreadID: dq 0 ; written by pthread_create, read back for pthread_join
X_ptr: dq 0 ; first argument of Main_Entry_fn
X_length: dq 0 ; integer length taken from the first entry of the master array
X: dq 0
collect_ptr: dq 0
collect_length: dq 0
collect_ctr: dq 0
even_squares_list_ptrs: dq 0, 0, 0, 0
even_squares_list_ctr: dq 0
even_squares_list_length: dq 0
Number_Of_Cores: dq 4 ; multiplied by 8 in place at the start of Init_Cores_fn
pthread_attr_t: dq 0 ; address passed as the attribute argument of pthread_create
pthread_arg: dq 0 ; address passed as the thread argument
Join_Ret_Val: dq 0 ; receives the joined thread's return value
tcounter: dq 0 ; advances by 8 per created thread
sched_getcpu_array: times 4 dq 0
ThreadIDLocked: dq 0
spin_lock_core: times 4 dq 0
extra_test_array: times 4 dq 0 ; Test_fn stores rbx at byte offset rbx
spin_lock_iter: times 4 dq 0
spin_lock_base_addr: dq 0
; __________
section .text
; Init_Cores_fn: converts [Number_Of_Cores] into a byte limit (count * 8), then
; creates and joins one thread per pass of the label_0 loop and leaves through
; label_900.
Init_Cores_fn:
%include "/opt/P01_SH/_Include_Utilities/Buffer_Pointer_Arrays.asm"
mov rax,[Number_Of_Cores]
mov rbx,8
mul rbx ; rdx:rax = Number_Of_Cores * 8
mov [Number_Of_Cores],rax
; NOTE(review): the scaled value is written back to the variable, so a second call would scale it again -- confirm this function runs only once per load
; _____
; Create Threads
label_0:
; THREAD 1 WORKS IN THIS SECTION TO CREATE THREADS
mov rdi,ThreadID ; arg 1: where pthread_create stores the new thread's id
mov rsi,pthread_attr_t ; arg 2: thread attributes
mov rdx,Test_fn ; arg 3: start routine
mov rcx,pthread_arg ; arg 4: argument handed to Test_fn
call pthread_create wrt ..plt
mov rdi,[ThreadID] ; id to wait on
mov rsi,Join_Ret_Val ; where the thread's return value is stored
call pthread_join wrt ..plt ; blocks until the thread just created has finished
mov rax,[tcounter]
add rax,8
mov [tcounter],rax
mov rbx,[Number_Of_Cores]
cmp rax,rbx
jl label_0 ; repeat while tcounter is below the byte limit (signed compare)
; _____
jmp label_900 ; creation loop finished: label_900 returns extra_test_array to the caller
; ______________________________________
; Test_fn: thread start routine. Stores rbx = 8 * sched_getcpu() into
; extra_test_array at byte offset rbx and returns that value in rax.
Test_fn:
; Get the core number
call sched_getcpu wrt ..plt
mov rbx,8 ; multiply by 8
mul rbx ; rdx:rax = rax * 8 (rdx is overwritten as well)
push rax
pop rax ; this push/pop pair leaves rax and rsp unchanged
mov rbx,rax ; rbx = core number * 8
push rax ; balanced by the pop at label_899
Next_Stop: ; THIS IS WHERE EACH THREAD STOPS
mov rdi,extra_test_array
mov [rdi+rbx],rbx ; record this core's number in its own slot
jmp label_899 ; jumps to the label that follows immediately
;__________
label_899:
pop rax ; rax = core number * 8, left in rax as the routine's return value
ret
; __________
; label_900: exit path of Init_Cores_fn; returns the address of extra_test_array in rax.
label_900:
%include "/opt/P01_SH/_Include_Utilities/Sys_Close_Include.asm"
mov rdi,extra_test_array
mov rax,rdi ; return value = address of extra_test_array
ret
;__________
;Free the memory
; FreeMem_fn: exported wrapper that forwards its first argument to free().
FreeMem_fn:
; NOTE(review): an earlier comment said the pointer arrives in rcx; no register is set up here, so free receives whatever the caller placed in the first-argument register (rdi in the rest of this listing) -- confirm
sub rsp,40 ; 40 plus the 8-byte return address keeps rsp 16-byte aligned at the call
call free wrt ..plt
add rsp,40
ret
; __________
; Main Entry
;-----------------------------------------------------------------------
; Main_Entry_fn -- exported entry point.
; In:   rdi = pointer stored in [X_ptr]
;       rsi = pointer stored in [data_master_ptr]; its first qword is a
;             double that is truncated to an integer and kept in [X_length]
; Out:  rax = value returned by Init_Cores_fn
; Fix:  each XMM register is now reloaded from the XMM_Stack slot it was
;       saved to; previously the six restores read slots that belonged to
;       other registers, so the caller's values were shuffled.
; Note: movsd saves and restores only the low 64 bits of each register.
;-----------------------------------------------------------------------
Main_Entry_fn:
    ; Save general-purpose registers (11 pushes leave rsp 16-byte aligned
    ; for the call below) and clear the ones used as working registers.
    push rdi
    push rbp
    push rbx
    push r15
    xor r15,r15
    push r14
    xor r14,r14
    push r13
    xor r13,r13
    push r12
    xor r12,r12
    push r11
    xor r11,r11
    push r10
    xor r10,r10
    push r9
    xor r9,r9
    push r8
    xor r8,r8
    ; Save the low halves of the XMM registers: slot n holds exactly one register.
    movsd [XMM_Stack+0],xmm13
    movsd [XMM_Stack+8],xmm12
    movsd [XMM_Stack+16],xmm11
    movsd [XMM_Stack+24],xmm15
    movsd [XMM_Stack+32],xmm14
    movsd [XMM_Stack+40],xmm10
    mov [X_ptr],rdi
    mov [data_master_ptr],rsi
    ; Now assign lengths
    lea rdi,[data_master_ptr]
    mov rbp,[rdi]
    xor rcx,rcx
    movsd xmm0,qword[rbp+rcx]           ; first entry of the master array, as a double
    cvttsd2si rax,xmm0                  ; truncate to integer
    mov [X_length],rax
    add rcx,8
    ; __________
    ; Write variables to assigned registers
    mov r15,0
    lea rdi,[rel collect_ptr]
    mov r14,qword[rdi]
    mov r13,[collect_ctr]
    mov r12,[collect_length]
    lea rdi,[rel X_ptr]
    mov r11,qword[rdi]
    mov r10,[X_length]
    ; __________
    call Init_Cores_fn                  ; result stays in rax for our caller
    ; Restore every XMM register from the slot it was saved to.
    movsd xmm13,[XMM_Stack+0]
    movsd xmm12,[XMM_Stack+8]
    movsd xmm11,[XMM_Stack+16]
    movsd xmm15,[XMM_Stack+24]
    movsd xmm14,[XMM_Stack+32]
    movsd xmm10,[XMM_Stack+40]
    ; Pop in the reverse order of the pushes above.
    pop r8
    pop r9
    pop r10
    pop r11
    pop r12
    pop r13
    pop r14
    pop r15
    pop rbx
    pop rbp
    pop rdi
    ret
;__________
The NASM code is a long listing, but the focus is on label_0 (where threads are created) and Test_fn (the first breakpoint that a new thread reaches).
I would really appreciate any input to this problem with gdb scheduler locking. Thanks.

Your assembly program appears to do this:
// Start one thread running Test_fn ...
pthread_create(&ThreadID, ..., Test_fn, ...);
// ... and immediately block until that same thread has finished.
pthread_join(ThreadID, ...);
First, this doesn't actually give you any parallelism, as you create and immediately wait for the thread you just created. You could just as easily call Test_fn() directly, with less overhead.
Second, if you stop the newly-created thread (as you do with scheduler-locking), then your main thread will block waiting to join that newly-created thread, and it will block forever. So of course your program will deadlock.
Update:
I thought I need to call pthread_join right after pthread_create so all threads will finish before the main thread exits.
That is correct. But the way to do that generally looks like this:
// Sketch: create every thread first, then join them in a second loop, so the
// threads can run at the same time.
const int NThreads = ...;
pthread_t tids[NThreads]; // one id per thread, kept for the join loop
for (int j = 0; j < NThreads; j++)
pthread_create(&tids[j], ...);
// All threads have started, and are now running in parallel with the main thread.
// Wait for them to finish.
for (int j = 0; j < NThreads; j++)
pthread_join(tids[j], ...);
// All done.
return 0;

Windows multicore program worked before but suddenly threads execute randomly

Below is a multithreaded (4-core) NASM 64 program. It's not minimal, but it's complete; I posted the complete code because a minimal example may not reveal the problem in this code.
This is my first multithreaded multicore program in NASM, and I was happy to see yesterday afternoon that it worked correctly (all four cores) and returned the values I was expecting. I ran it successfully several times yesterday and again several times this morning.
Suddenly, about 10 minutes after my last run, without making any changes to the code, without re-assembling the dll, I ran it again and this time the threads execute randomly -- not all threads execute, and the configuration of threads that execute varies from run to run -- sometimes thread 1 only; sometimes threads 2-4 but not thread 1, sometimes only thread 2 and 4, etc.
My code did not have ExitThread or CloseHandle calls, so I added them in (see label_899:), rebooted and ran it again. The first run after a fresh reboot still shows random execution of threads.
I thought the problem may be that CreateThread was failing, so I also returned the thread handles. Every time, all four thread handles were created even though all did not execute, so that's not it.
I have done a lot of research but I haven't seen this issue discussed at all. Even though this is NASM, the same thing could also be applicable to C or C++ multithreading. I use WaitForMultipleObjects to wait on all threads to complete.
This is a dll, and the entry point is Main_Entry_fn. The dll is called from Python using ctypes, but I doubt that makes any difference. I use ctypes frequently and have never had a problem with it.
Thanks for any ideas.
P.S. ignore the realloc code because the buffers are large enough that realloc is never called.
UPDATE: Here is how I constructed the program flow: the entry point is Main_Entry_fn toward the bottom of the program listing (it's an export because it has to be exported for a dll call). Main_Entry_fn takes the input data from the calling function, preserves registers (because the whole program is register optimized) and allocates output buffers. Main_Entry_fn then calls Init_Cores_fn (top of program listing), where we create four arrays with data for the registers on entry to each core (remember all vars are stored in registers -- see the call to DupThreadInfo), and creates four threads in a loop. The threads all call PrimeNumber_fn, and the threads are set to start immediately upon creation.
Register assignments:
r15 list_of_results_ptr (pointer to list_of_results)
r14 list_of_results_ctr (counter for list_of_results)
r13 list_of_results_length
r12 numbers_ptr (pointer to "numbers" input array)
r11 numbers_ctr
r10 numbers_length
r9 num
r8 i
xmm15 num_float
xmm14: result
; Header Section
[BITS 64]
[default rel]
extern malloc, calloc, realloc, free
global Main_Entry_fn
export Main_Entry_fn
global FreeMem_fn
export FreeMem_fn
extern CreateThread, CloseHandle, ExitThread
extern WaitForMultipleObjects, GetCurrentThread
; Static data for the four-thread Windows DLL.
section .data align=16
Return_Pointer_Array: dq 0, 0, 0
Input_Length_Array: dq 0, 0,
list_of_results_ptr: dq 0
list_of_results_ctr: dq 0
list_of_results_length: dq 0 ; set to initial_dynamic_length after each malloc
data_master_ptr: dq 0 ; second argument of Main_Entry_fn
initial_dynamic_length: dq 0 ; byte size requested for each output buffer
internal_dynamic_length: dq 5000
XMM_Stack: dq 0, 0, 0 ; save area used by Main_Entry_fn for xmm13-xmm15
numbers_ptr: dq 0 ; first argument of Main_Entry_fn
numbers_length: dq 0 ; integer taken from the first entry of the master array
numbers: dq 0
list_of_results: dq 0
num_float: dq 0.0
loop_counter_401: dq 0
numbers_ctr: dq 0
num: dq 0
result: dq 0
const_1: dq 1 ; integer 1 (not the double 1.0)
i: dq 0
range_loop_start_i: dq 0
range_loop_end_i: dq 0
const_0: dq 0
; New vars for threads:
ThreadCount: times 4 dq 0 ; address passed to CreateThread as lpThreadId
; Per-thread parameter blocks, 10 qwords each. Offsets written or read in this
; listing: +8 start offset, +16 stride, +24 output buffer, +32 counter,
; +40 r13, +48 r12, +56 end offset, +64, +72 thread number.
ThreadInfo: times 10 dq 0
ThreadInfo2: times 10 dq 0
ThreadInfo3: times 10 dq 0
ThreadInfo4: times 10 dq 0
TestInfo: times 4 dq 0
ThreadHandles: times 4 dq 0 ; handles returned by CreateThread
Return_Data_Array: times 8 dq 0 ; ptr,ptr,ptr,ptr,length,length,length,length
StartByte: dq 0 ; 0, 8, 16, 24: selects the ThreadInfo block and the ThreadHandles slot
stride: dq 8
output_buffer_pointers: times 4 dq 0 ; for malloc
Division_Size: dq 0 ; r10 / 4, computed in Init_Cores_fn
Division_Start: dq 0 ; running start offset of the next range
section .text
; ______________________________________
;-----------------------------------------------------------------------
; Init_Cores_fn -- split the input into four ranges, start one thread per
; range, wait for the threads, then leave through label_900.
; ABI:   Microsoft x64 for the CreateThread / WaitForMultipleObjects calls
; In:    r10 = input length; r12, r13, r14 = values copied into ThreadInfo
; Out:   rax = Return_Data_Array (set at label_900)
; Clobb: rax, rbx, rcx, rdx, rdi, rbp, r8, r9, plus whatever the API calls
;        and DupThreadInfo clobber
; Fixes: - bWaitAll is now TRUE. With FALSE, WaitForMultipleObjects returns
;          as soon as ANY one thread has finished, so the caller could read
;          results while the other threads were still running.
;        - StartByte is reset before the creation loop, so a second call
;          does not index past ThreadHandles.
; NOTE(review): the wait still gives up after 5000 ms and its return value
; is not checked; the thread handles are never closed.
;-----------------------------------------------------------------------
Init_Cores_fn:
    ; Calculate the data divisions
    mov rax,r10
    mov rbx,4 ;cores
    xor rdx,rdx
    div rbx
    mov [Division_Size],rax
    mov [Division_Start],rax            ; range 2 starts where range 1 ends
    ; Populate the ThreadInfo array with vars to pass
    ; ThreadInfo: length, startbyte, stride, vars into registers on entry to each core
    mov rdi,ThreadInfo
    mov rax,0 ;ThreadInfoLength
    mov [rdi],rax ; length (number of vars into registers plus 3 elements)
    mov rax,[stride]
    mov [rdi+16],rax
    ; Vars (these registers are populated on main entry)
    mov rbp,output_buffer_pointers
    mov rax,[rbp]
    mov [rdi+24],rax ;for r15
    mov [rdi+32],r14
    mov [rdi+40],r13
    mov [rdi+48],r12
    mov rax,[Division_Start]
    mov [rdi+56],rax                    ; end offset of range 1
    mov rax,0
    mov [rdi+64],rax
    call DupThreadInfo                  ; fills ThreadInfo2..ThreadInfo4
    mov rbp,rsp ; remember rsp so it can be restored after the API calls
    sub rsp,56 ; 32 bytes shadow space + 2 stack arguments, keeps rsp 16-byte aligned
    mov qword [StartByte],0             ; always start with thread slot 0
    ; _____
label_0:
    ; Select the parameter block that belongs to StartByte (0, 8, 16, 24).
    mov rax,[StartByte]
    cmp rax,0
    jne sb2
    mov rdi,ThreadInfo
    jmp sb5
sb2:
    cmp rax,8
    jne sb3
    mov rdi,ThreadInfo2
    jmp sb5
sb3:
    cmp rax,16
    jne sb4
    mov rdi,ThreadInfo3
    jmp sb5
sb4:
    cmp rax,24
    jne sb5
    mov rdi,ThreadInfo4
sb5:
    ; _____
    ; Create Threads
    mov rcx,0 ; lpThreadAttributes (Security Attributes)
    mov rdx,0 ; dwStackSize
    mov r8,PrimeNumber_fn ; lpStartAddress (function pointer)
    mov r9,rdi ; lpParameter (array of data passed to each core)
    mov rax,0
    mov [rsp+32],rax ; use default creation flags
    mov rdi,ThreadCount
    mov [rsp+40],rdi ; lpThreadId
    call CreateThread
    ; Move the handle into ThreadHandles array (returned in rax)
    mov rdi,ThreadHandles
    mov rcx,[StartByte]
    mov [rdi+rcx],rax
    mov rax,[StartByte]
    add rax,8
    mov [StartByte],rax
    mov rbx,32 ; Four cores * 8 bytes
    cmp rax,rbx
    jl label_0
    ; _____
    ; Wait
    mov rcx,4 ; nCount: number of handles
    mov rdx,ThreadHandles ; lpHandles: pointer to handles array
    mov r8,1 ; bWaitAll = TRUE: return only when every thread has finished
    mov r9,5000 ; dwMilliseconds: give up after 5 seconds
    call WaitForMultipleObjects
    ; _____
    mov rsp,rbp
    jmp label_900
; ______________________________________
; PrimeNumber_fn: thread start routine passed to CreateThread.
; In: rcx = pointer to this thread's ThreadInfo block.
; Register use in this routine:
;   r12 = input pointer, r8 = current input byte offset, r10 = end offset,
;   r9 = current number, xmm13 = trial divisor, r15 = output buffer,
;   r14 = output byte offset, r13 = limit compared with r14 after each store.
PrimeNumber_fn:
; Populate registers
; rcx holds the lpParameter value that was given to CreateThread
mov rdi,rcx
mov rax,[rdi]
mov r15,[rdi+24]
mov r13,[rdi+40]
mov r12,[rdi+48]
mov r10,[rdi+56]
; NOTE(review): r14 is used below as the output offset but is not loaded here
; (the block's slot at +32 is never read) -- confirm what it holds at thread entry
xor r11,r11
xor r9,r9
mov r8,[rdi+8] ; start byte
pxor xmm15,xmm15
pxor xmm15,xmm14
pxor xmm15,xmm13
; NOTE(review): after these three instructions xmm15 = xmm14 XOR xmm13, and
; xmm14/xmm13 are unchanged -- confirm whether zeroing all three was intended
; Get the ThreadID based on startbyte
mov rax,[rdi+72] ; 0, 8, 16, 24 for 4-core
push rax ; popped again at label_899
;______
; Outer loop: fetch the next input number, or finish when r8 reaches r10.
label_401:
mov rdi,r12 ; Pointer
cmp r8,r10
jge label_899 ; end offset reached (the end is exclusive)
movsd xmm0,qword[rdi+r8]
movsd xmm15,xmm0
add r8,8
;______
cvttsd2si rax,xmm15 ; truncate the double to an integer
mov r9,rax
;______
label_8010:
label_801:
cmp r9,[const_1]
jle label_401 ; numbers not above 1 are skipped without writing a result
;______
label_12010:
mov rax,2
sub rax,1
movq xmm13,rax ; xmm13 receives the integer bit pattern 1
; Inner loop: advance the trial divisor and test it against r9.
label_1201:
movq rcx,xmm13
movq xmm1,[const_1]
addsd xmm13,xmm1
; NOTE(review): addsd is a floating-point add applied to integer bit patterns
; here -- confirm this is intended
movq rcx,xmm13
mov rdx,r9
cmp rcx,rdx
jge label_12020 ; divisor has reached the number
;______
label_16010:
label_1601:
mov rax,r9
movq rbx,xmm13
xor rdx,rdx
div rbx ; rdx = r9 mod divisor
mov rax,rdx
mov rdx,0
cmp rax,rdx
jne label_16020 ; remainder is not zero
;______
; Remainder is zero: store const_0 and move on to the next input number.
movq xmm14,[const_0]
;______
movq rax,xmm14
cvtsi2sd xmm0,rax
movsd [r15+r14],xmm14
add r14,8
mov rax,r13
cmp r14,rax
jl next_17
;[Irrelevant code omitted]
next_17:
;______
jmp label_401
;______
; Remainder is not zero: store r9 as a double, then try the next divisor.
label_1602:
label_16020:
;______
mov rax,r9
cvtsi2sd xmm14,rax
;______
movq rax,xmm14
cvtsi2sd xmm0,rax
movsd [r15+r14],xmm14
add r14,8
mov rax,r13
cmp r14,rax
jl next_21
;[Irrelevant code omitted]
next_21:
jmp label_1201
;______
; Divisor reached the number: store const_0 and move on to the next input number.
label_1202:
label_12020:
;______
movq xmm14,[const_0]
;______
movq rax,xmm14
cvtsi2sd xmm0,rax
movsd [r15+r14],xmm14
add r14,8
mov rax,r13
cmp r14,rax
jl next_25
;[Irrelevant code omitted]
next_25:
jmp label_401
;______
; Thread exit: publish this thread's buffer pointer and output offset.
label_899:
pop rax ; thread number offset pushed at entry
mov rdi,Return_Data_Array
mov [rdi+rax],r15
mov [rdi+rax+32],r14 ; 32 = four cores
; NOTE(review): rcx is not set before this call, and no shadow space is reserved -- confirm
call ExitThread
; NOTE(review): ExitThread is documented as not returning, so the instructions below would never run -- confirm
call GetCurrentThread
mov rcx,rax
call CloseHandle
ret
; __________
; label_900: exit path of Init_Cores_fn; returns the address of Return_Data_Array in rax.
label_900:
exit_label_for_PrimeNumber_fn:
mov rdi,Return_Data_Array ; Final return to the calling process
mov rax,rdi ; return value = address of Return_Data_Array
ret
;__________
;Free the memory
; FreeMem_fn: exported wrapper that forwards its first argument to free().
; No register is set up here, so free receives the caller's first argument unchanged.
FreeMem_fn:
sub rsp,40 ; 40 plus the 8-byte return address keeps rsp 16-byte aligned at the call
call free
add rsp,40
ret
; __________
; Main Entry
;-----------------------------------------------------------------------
; Main_Entry_fn -- exported DLL entry point.
; ABI:   Microsoft x64
; In:    rcx = pointer stored in [numbers_ptr]
;        rdx = pointer stored in [data_master_ptr]; its first qword is a
;              double that is truncated to an integer and kept in
;              [numbers_length]
; Out:   rax = value returned by Init_Cores_fn
; Fixes: - xmm13 and xmm15 are restored from the slots they were saved to
;          (they were swapped before; both are callee-saved on Win64).
;        - the stack adjustment around each malloc call is 32 bytes, not
;          40: after the 11 pushes rsp is 16-byte aligned, and 40 left it
;          misaligned at the call.
; NOTE(review): movsd preserves only the low 64 bits of xmm13-xmm15, and the
; malloc results are not checked for NULL.
;-----------------------------------------------------------------------
Main_Entry_fn:
    ; Save registers (11 pushes in total leave rsp 16-byte aligned).
    push rdi
    push rbp
    push rbx
    push r15
    xor r15,r15
    push r14
    xor r14,r14
    push r13
    xor r13,r13
    push r12
    xor r12,r12
    push r11
    xor r11,r11
    push r10
    xor r10,r10
    push r9
    xor r9,r9
    movsd [XMM_Stack+0],xmm15
    movsd [XMM_Stack+8],xmm14
    movsd [XMM_Stack+16],xmm13
    push r8
    xor r8,r8
    mov [numbers_ptr],rcx
    mov [data_master_ptr],rdx
    ; Now assign lengths
    lea rdi,[data_master_ptr]
    mov rbp,[rdi]
    xor rcx,rcx
    movsd xmm0,qword[rbp+rcx]           ; first entry of the master array, as a double
    cvttsd2si rax,xmm0                  ; truncate to integer
    mov [numbers_length],rax
    add rcx,8
    ; __________
    ; malloc for dynamic arrays
    ; The size computed here is overridden by the constant at malloc_next.
    lea rdi,[data_master_ptr]
    mov rbp,[rdi]
    movsd xmm0,qword[rbp]
    cvttsd2si rax,xmm0
    mov r8,rax
    mov rdx,10
    mul rdx
    mov rdx,10000000
    cmp rax,rdx
    jl malloc_next
    mov rax,r8
malloc_next:
    mov rax,50000000
    mov [initial_dynamic_length],rax
    ;__________
    ; Output buffer #1
    mov rcx,qword[initial_dynamic_length] ; Initial size
    xor rax,rax
    sub rsp,32                          ; shadow space; rsp stays 16-byte aligned
    call malloc
    mov rdi,output_buffer_pointers
    mov [rdi],rax
    add rsp,32
    mov rax,qword[initial_dynamic_length]
    mov [list_of_results_length],rax
    ;__________
    ; Output buffer #2
    mov rcx,qword[initial_dynamic_length] ; Initial size
    xor rax,rax
    sub rsp,32
    call malloc
    mov rdi,output_buffer_pointers
    mov [rdi+8],rax
    add rsp,32
    mov rax,qword[initial_dynamic_length]
    mov [list_of_results_length],rax
    ;__________
    ; Output buffer #3
    mov rcx,qword[initial_dynamic_length] ; Initial size
    xor rax,rax
    sub rsp,32
    call malloc
    mov rdi,output_buffer_pointers
    mov [rdi+16],rax
    add rsp,32
    mov rax,qword[initial_dynamic_length]
    mov [list_of_results_length],rax
    ;__________
    ; Output buffer #4
    mov rcx,qword[initial_dynamic_length] ; Initial size
    xor rax,rax
    sub rsp,32
    call malloc
    mov rdi,output_buffer_pointers
    mov [rdi+24],rax
    add rsp,32
    mov rax,qword[initial_dynamic_length]
    mov [list_of_results_length],rax
    ; __________
    ; Write variables to assigned registers
    lea rdi,[rel list_of_results_ptr]
    mov r15,qword[rdi]
    mov r14,[list_of_results_ctr]
    mov r13,[list_of_results_length]
    lea rdi,[rel numbers_ptr]
    mov r12,qword[rdi]
    mov r11,[numbers_ctr]
    mov r10,[numbers_length]
    mov r9,[num]
    movsd xmm15,[num_float]
    movsd xmm14,[result]
    movsd xmm13,[i]
    mov r8,[loop_counter_401]
    ; __________
    call Init_Cores_fn                  ; result stays in rax for our caller
exit_label_for_Main_Entry_fn:
    ; Restore in the reverse order of the saves above.
    pop r8
    movsd xmm13,[XMM_Stack+16]
    movsd xmm14,[XMM_Stack+8]
    movsd xmm15,[XMM_Stack+0]
    pop r9
    pop r10
    pop r11
    pop r12
    pop r13
    pop r14
    pop r15
    pop rbx
    pop rbp
    pop rdi
    ret
;_____________
;-----------------------------------------------------------------------
; DupThreadInfo -- fill the parameter blocks ThreadInfo2..ThreadInfo4.
; In:    [Division_Start] = start offset of range 2, [Division_Size] = size
;        of one range, r12 and r13 = values copied into every block
; Out:   [Division_Start] advanced to the start of range 4
; Clobb: rax, rdi, rbp
; Block layout (byte offsets): +8 start, +16 stride, +24 output buffer,
;        +32 counter, +40 r13, +48 r12, +56 end, +64, +72 thread number.
; Fix:   the end offset of ranges 2 and 3 is now start + size. The worker
;        loop stops when its offset is greater than or equal to the end
;        (exclusive end, as range 1 already uses), so the former
;        "start + size - 8" left the last element of each of those ranges
;        unprocessed.
;-----------------------------------------------------------------------
DupThreadInfo:
    ; ---- ThreadInfo2 ----
    mov rdi,ThreadInfo2
    mov rax,[Division_Start]
    mov [rdi+8],rax                     ; start offset
    add rax,[Division_Size]
    mov [Division_Start],rax            ; next range starts where this one ends
    mov [rdi+56],rax                    ; end offset (exclusive)
    mov rax,[stride]
    mov [rdi+16],rax
    mov rbp,output_buffer_pointers
    mov rax,[rbp+8]
    mov [rdi+24],rax                    ; output buffer #2
    mov [rdi+40],r13
    mov [rdi+48],r12
    mov rax,8
    mov [rdi+32],rax
    mov [rdi+64],rax
    mov [rdi+72],rax                    ; thread number, indexes Return_Data_Array
    ; ---- ThreadInfo3 ----
    mov rdi,ThreadInfo3
    mov rax,[Division_Start]
    mov [rdi+8],rax                     ; start offset
    add rax,[Division_Size]
    mov [Division_Start],rax
    mov [rdi+56],rax                    ; end offset (exclusive)
    mov rax,[stride]
    mov [rdi+16],rax
    mov rbp,output_buffer_pointers
    mov rax,[rbp+16]
    mov [rdi+24],rax                    ; output buffer #3
    mov [rdi+40],r13
    mov [rdi+48],r12
    mov rax,16
    mov [rdi+32],rax
    mov [rdi+64],rax
    mov [rdi+72],rax                    ; thread number
    ; ---- ThreadInfo4 ----
    mov rdi,ThreadInfo4
    mov rax,[Division_Start]
    mov [rdi+8],rax                     ; start offset
    mov rax,[numbers_length] ; final segment goes to the end
    mov [rdi+56],rax
    mov rax,[stride]
    mov [rdi+16],rax
    mov rbp,output_buffer_pointers
    mov rax,[rbp+24]
    mov [rdi+24],rax                    ; output buffer #4
    mov [rdi+40],r13
    mov [rdi+48],r12
    mov rax,24
    mov [rdi+32],rax
    mov [rdi+64],rax
    mov [rdi+72],rax                    ; thread number
    ret

printing numbers in nasm

I have written assembly code to print the numbers from 1 to 9, but it only prints 1: nothing other than 1 is printed, and only one line of output appears. That suggests the loop is not being run either. I can't figure out what is wrong with my code.
; Attempt to print the digits 1..9, one write system call per pass of the loop.
section .bss
lena equ 1024
outbuff resb lena
section .data
section .text
global _start
_start:
nop
mov cx,0
incre:
inc cx
add cx,30h ; adds 30h on every pass, not just once
mov [outbuff],cx ; 16-bit store into the buffer
cmp cx,39h
jg done
cmp cx,39h
jl print ; falls through to print when equal, too
print:
mov rax,1 ;sys_write
mov rdi,1
mov rsi,outbuff
mov rdx,lena ; asks write for all 1024 bytes of the buffer
syscall ; the syscall instruction overwrites rcx (and r11), so cx does not survive it
jmp incre
done:
mov rax,60 ;sys_exit
mov rdi,0
syscall
My OS is 64 bit linux. this code is built using nasm with the following commands : nasm -f elf64 -g -o num.o num.asm and ld -o num num.asm

Answer rewritten after some experimentation.
There are two errors in your code, and a few inefficiencies.
First, you add 0x30 to the number (to turn it from the number 1 to the ASCII 1). However, you do that addition inside the loop. As a result, on your first iteration cx is 0x31, on the second 0x62 ("b"), on the third 0x93 (an invalid UTF-8 sequence), etc.
Just initialize cx to 0x30 and remove the add from inside the loop.
But there's another problem. RCX is clobbered during system calls. Replacing cx with r12 causes the program to work.
In addition to that, you pass the buffer's length to write, but it only has one character. The program so far:
; Intermediate version: the counter lives in r12, which the syscall instruction
; does not overwrite, and starts at the character code just below '1'.
section .bss
lena equ 1024
outbuff resb lena
section .data
section .text
global _start
_start:
nop
mov r12,30h ; '0'
incre:
inc r12 ; next digit character
mov [outbuff],r12 ; 8-byte store; only the first byte is written out below
cmp r12,39h
jg done ; past '9': stop
cmp r12,39h
jl print ; falls through to print when equal, too
print:
mov rax,1 ;sys_write
mov rdi,1
mov rsi,outbuff
mov rdx,1 ; write exactly one byte
syscall
jmp incre
done:
mov rax,60 ;sys_exit
mov rdi,0
syscall
Except even now, the code is extremely inefficient. You have two compares on the same condition, one of them branches to the very next instruction.
Also, your code would be much faster and smaller if you moved the breaking condition to the end of the loop. Also, cx is a 16-bit register and r12 is a 64-bit register, but we actually only need 8 bits. Using larger registers than needed means all of our immediates waste space in memory and in the cache. We therefore switch to the 8-bit variant of r12. After these changes, we get:
; Writes the characters '1' through '9' to standard output, one write system
; call per character, then exits with status 0 (Linux x86-64).
section .bss
lena    equ     1024
outbuff resb    lena

section .data

section .text
global _start
_start:
        nop
        mov     r12b, '0'               ; character just below the first digit
.next_digit:
        inc     r12b                    ; advance to the next digit character
        mov     [outbuff], r12b
        mov     rsi, outbuff            ; buf
        mov     rdx, 1                  ; count: one byte
        mov     rdi, 1                  ; fd 1
        mov     rax, 1                  ; sys_write
        syscall
        cmp     r12b, '9'
        jl      .next_digit             ; loop until '9' has been written
        mov     rdi, 0                  ; exit status
        mov     rax, 60                 ; sys_exit
        syscall
There's still lots more you can do. For example, you call the write system call 9 times, instead of filling the buffer and then calling it once (despite the fact that you've allocated a 1024 bytes buffer). It will probably be faster to initialize r12 with zero (xor r12, r12) and then add 0x30. (not relevant for the 8 bit version of the register).

Insertion sort not working, 32bit assembly

I'm trying to implement insertion sort in 32bit assembly in linux using NASM and I get a segmentation fault mid-run (not to mention that for some reason 'printf' prints random garbage values, I'm not totally sure why), Here is the
code:
; Constant strings and the array that main prints, sorts and prints again.
section .rodata
MSG: DB "welcome to sortMe, please sort me",10,0
S1: DB "%d",10,0 ; printf format: one decimal integer followed by a newline
section .data
array DD 5,1,7,3,4,9,12,8,10,2,6,11 ; unsorted array of 12 dwords
len DB 12 ; element count, stored as a single byte
section .text
align 16
global main
extern printf
;-----------------------------------------------------------------------
; main -- print the welcome message, print the array, sort it in place and
; print it again, then exit.
; ABI:   i386 cdecl for the printf / myInsertionSort calls
; Fixes: - the two arguments of myInsertionSort are removed from the stack
;          after the call (the caller cleans up under cdecl)
;        - the exit status is 0; ebx previously still held the array
;          address when the exit system call was made
; NOTE(review): sys_exit bypasses libc's exit-time flush of stdout, so
; output may be lost when stdout is not a terminal -- consider returning
; from main instead.
;-----------------------------------------------------------------------
main:
        push    MSG                     ; print welcome message
        call    printf
        add     esp, 4                  ; clean the stack
        call    printArray              ; print the unsorted array
        push    len                     ; arg 2: address of the element-count byte
        push    array                   ; arg 1: address of the first element
        call    myInsertionSort
        add     esp, 8                  ; remove both arguments
        call    printArray              ; print the sorted one
        mov     eax, 1                  ; exit system call
        xor     ebx, ebx                ; exit status 0
        int     0x80
; printArray: print every element of `array` with printf and the S1 format,
; one value per line. Takes no arguments; reads the globals `array` and `len`.
; All general-purpose registers are saved on entry and restored before return.
printArray:
push ebp ;save old frame pointer
mov ebp,esp ;create new frame on stack
pushad ;save registers
mov eax,0
mov ebx,0
mov edi,0
mov esi,0 ;array index
; ebx was zeroed above, so loading the byte into bl leaves ebx = len.
mov bl, byte [len]
add edi,ebx ; edi = array size
print_loop:
; Here esi = index of the next element to print, edi = element count.
cmp esi,edi
je print_end
; Elements are dwords, so the index is scaled by 4 to form the byte offset.
; The value is pushed first and the format second, so the format string ends
; up on top of the stack as printf's first argument.
push dword [array+esi*4]
push S1
call printf
add esp, 8 ;clean the stack
inc esi
jmp print_loop
print_end:
; NOTE(review): paired with `pushad` above; confirm `popa` assembles to the
; 32-bit form in this mode.
popa ;restore registers
mov esp,ebp ;clean the stack frame
pop ebp ;return to old stack frame
ret
; myInsertionSort: intended to insertion-sort an array of dwords in place.
; Stack arguments (as pushed by main):
;   [ebp+8]  = address of the array
;   [ebp+12] = address of the byte-sized element count
; ebx, esi, edi and ebp are saved and restored; eax, ecx and edx are modified.
; NOTE(review): the notes below describe what the instructions do as written;
; several of these effects look unintended.
myInsertionSort:
push ebp
mov ebp, esp
push ebx
push esi
push edi
; ecx first receives the ADDRESS of the count, then the count byte itself.
mov ecx, [ebp+12]
movzx ecx, byte [ecx] ;put len in ecx, our loop variable
mov eax, 0
mov ebx, 0
mov esi, [ebp+8] ; the array
; NOTE(review): `loop` decrements ecx and jumps to loop_1 when the result is
; non-zero; loop_1 is also the fall-through target, so the only net effect
; here is ecx = len - 1.
loop loop_1
loop_1:
cmp ecx, 0 ; if we're done
je done_1 ; then done with loop
mov edx, ecx
push ecx ; we save len, because loop command decrements ecx
; NOTE(review): edx was just copied from ecx, so this subtraction always
; leaves edx = 0 and the load below always reads the first element, [esi].
sub edx, ecx
mov ecx, [esi+4*edx] ;;;;;; ecx now array[i] ? how do I access array[i] in a similar manner?
mov ebx, eax
shr ebx, 2 ; number of times for inner loop
loop_2:
cmp ebx, 0 ; we don't use loop to not affect ecx so we use ebx and compare it manually with 0
jl done_2
; NOTE(review): ebx is used as an unscaled BYTE offset here and below, while
; the elements are dwords.
cmp [esi+ebx], ecx ;we see if array[ebx] os ecx so we can exit the loop
jle done_2
; Copy the dword at [esi+ebx] to [esi+ebx+4], using the stack as a temporary.
lea edx, [esi+ebx]
push dword [edx] ; pushing our array[ebx]
add edx, 4
pop dword [edx] ; popping the last one
dec ebx ; decrementing the loop iterator
jmp loop_2 ; looping again
done_2:
; NOTE(review): this stores the dword at byte offset ebx+1, which is not a
; dword-aligned element slot.
mov [esi+ebx+1], ecx
inc eax ; incrementing iterator
; NOTE(review): this restores exactly the value pushed at the top of loop_1;
; nothing decrements it or compares it with eax, so the `cmp ecx, 0` exit
; test sees the same value on every pass.
pop ecx ; len of array to compare now to eax and see if we're done
jmp loop_1
done_1:
pop edi
pop esi
pop ebx
pop ebp ; we pop them in opposite to how we pushed
ret
About the printf thing, I'm positive that I should push the parameters the opposite way (first S1 and then the integer so it'd be from left to right as we'd call it in C), and if I do switch them, nothing is printed at all while I'm getting a segmentation fault. I don't know what to do, it prints these as output:
welcome to sortMe, please sort me
5
16777216
65536
256
1
117440512
458752
1792
7
50331648
196608
768

mov ecx, [ebp+12] ;put len in ecx, our loop variable
This only moves the address of LEN into ECX not its value! You need to add movzx ecx, byte [ecx]
You also need to define LEN=48
loop loop_1
What's this bizarre use of LOOP doing here?
You are mixing bytes and dwords on multiple occasions. You need to rework the code, e.g.
dec ebx ; ebx is now number of times we should go through inner loop
should become
shr ebx,2
This is not correct because you need the address and not the value. Change MOV into LEA.
jle done_2
mov edx, [esi+ebx]
Perhaps you can post your reworked code as an EDIT within your Original question.

Your edited code does not address ALL the problems signaled by user3144770!
The parameters to printf are correct but here are some additional problems with your printArray routine.
Since ESI is an index in an array of dwords you need to scale it up!
push dword [array+esi*4]
Are you sure pusha will save 32 bits ? Perhaps you'd better use pushad
P.S. Should you decide to rework your code and post the edit, then please add the reworked code after the last line of the existing post. This way the original question will continue to make sense to people viewing it for the first time!

strlen in NASM Linux

Excuse me again. I am trying to learn assembly language, but I am running into many problems. I am trying to work with strings in NASM: I want to copy a string constant into a string variable. The maximum size is 50, so I want to enforce this bound. However, this program throws a segmentation fault. I based it on a MASM example, so perhaps I have made an error translating it to NASM syntax.
My program is the following:
section .data
; Upper bound on the number of characters strcpy copies (assemble-time constant).
MAXTEXTSIZE equ 50
; String constants, each terminated by a single 0 byte.
_cte_hola db "Hola", 0
_cte_mundo db "Mundo", 0
section .bss
; Uninitialised buffer: MAXTEXTSIZE characters plus one byte for the terminator.
MAIN_d resb MAXTEXTSIZE+1
section .text
global _start
; strlen: advance BX from 0 until the item at [SI+BX] compares equal to 0.
; In:  SI = start of the string.
; Out: BX = number of items skipped before the terminator.
; NOTE(review): the address is formed from the 16-bit registers SI and BX;
; a 32-bit Linux program needs full 32-bit pointers (esi/ebx).
strlen:
mov bx, 0
strl01:
; NOTE(review): this compares a WORD (two bytes) against 0, although the
; strings are terminated by a single 0 byte. The trailing `t` on this line
; also looks like a stray character - confirm.
cmp WORD [SI+BX],0 t
je strend
inc bx
jmp strl01
strend:
ret
; strcpy: copy the string at SI to DI, limited to MAXTEXTSIZE characters,
; then store a terminating 0 byte after the copied data.
; Calls strlen, which leaves the source length in BX.
strcpy:
call strlen
cmp bx, MAXTEXTSIZE
jle copiarsizeok
; The length exceeds the limit: clamp it to MAXTEXTSIZE.
mov bx, MAXTEXTSIZE
copiarsizeok:mov cx, bx
; cld selects forward (ascending-address) copying for the string instruction.
cld
; NOTE(review): only the 16-bit cx/si/di are initialised; presumably in 32-bit
; code `rep movsb` counts in ecx and addresses through esi/edi - confirm.
rep movsb
mov al,0
; The destination pointer now sits just past the last copied byte; terminate there.
mov BYTE [DI], al
ret
; Program entry: calls strcpy on MAIN_d and _cte_hola, then exits with
; status 0 through int 80h.
_start:
; NOTE(review): ax has not been assigned a value at this point, and on Linux
; the segment registers are set up by the OS - these two loads look unneeded.
mov ds, ax
mov es, ax
; NOTE(review): the brackets load the 16-bit CONTENTS stored at each label,
; not the label's address. Also, strcpy copies from SI to DI, so as written
; MAIN_d is the source and _cte_hola the destination - presumably the reverse
; was intended.
mov si, [MAIN_d]
mov di, [_cte_hola]
call strcpy
; exit system call: eax = 1, ebx = 0 (status).
mov eax, 1
mov ebx, 0
int 80h
Thanks in advance, and excuse me: my questions probably seem stupid to an assembly programmer.

I believe you are trying to make 32bit program in Linux, but your examples are 16bit.
In Linux, all pointers are 32bit. So, use extended registers: esi, edi, ebx etc. You still can use 8 and 16bit registers for arithmetics and data processing but not as memory pointers.
In strlen you have to compare byte [esi+ebx], 0 not word.
Don't set the segment registers in Linux. They will be set by the OS and you can't touch them. In Linux all memory is one flat area and you don't have to use segment registers anymore.

Here's a more concrete example of how you could write your strlen function (which is the first of your problems)
section .data
; Assemble-time constant; used below only to size MAIN_d.
MAXTEXTSIZE equ 50
; "Hola" followed by a newline (0xa) and the terminating 0 byte.
_cte_hola db "Hola", 0xa, 0
; Declared but not referenced by the code in this example.
_cte_mundo db "Mundo", 0
section .bss
; Uninitialised buffer of MAXTEXTSIZE+1 bytes; not referenced by the code in this example.
MAIN_d resb MAXTEXTSIZE+1
section .text
global _start
;-----------------------------------------------------------------------
; strlen - length of a 0-terminated byte string (32-bit code)
; In:    esi = address of the string
; Out:   eax = number of bytes before the terminating 0
; Clobb: ebx (also holds the length on return), flags
;-----------------------------------------------------------------------
strlen:
        xor     ebx, ebx                ; ebx = offset of the byte under test
        jmp     .test_byte
.advance:
        inc     ebx
.test_byte:
        cmp     byte [esi + ebx], 0     ; reached the terminator?
        jne     .advance
        mov     eax, ebx                ; hand the count back in eax
        ret
; Program entry: measure _cte_hola with strlen, write it to stdout, then
; exit with status 0.
; Uses the 32-bit Linux int 0x80 interface: eax = system call number,
; ebx / ecx / edx = first / second / third argument.
_start:
        mov     esi, _cte_hola
        call    strlen                  ; eax = length of _cte_hola

        mov     edx, eax                ; count  = length returned by strlen
        mov     ecx, _cte_hola          ; buffer = the string
        mov     ebx, 1                  ; fd     = stdout
        mov     eax, 4                  ; write
        int     0x80

        xor     ebx, ebx                ; exit status 0; ebx would otherwise still hold the fd (1)
        mov     eax, 1                  ; exit
        int     0x80
There are definitely better ways of implementing this (I'd use repne scasb to implement strlen, for example), but I wanted to keep it close to your implementation.
Hope this helps!

Develop Reference

node.js excel linux python-3.x azure haskell apache-spark rust .htaccess string