gdb: set scheduler_locking on always causes deadlock - linux

I am using gdb to debug a shared object loaded by a C program using dlopen() and dlsym(). The shared object is written in NASM, and it's compiled with DWARF debug symbols. I'm running on Ubuntu 18.04.
I want each thread to stop completely after it has been created so I can have all four threads created before I continue. So I start the debug session with "set scheduler-locking on" or "set scheduler-locking step."
As I understand the gdb command "set scheduler-locking on," it should allow me to create a thread, then switch back to thread 1 (the main thread) and create another thread, and so on until all threads have been created.
In the NASM code, I set a breakpoint at the label Test_fn: (see the code below). When a thread reaches that breakpoint, it stops (which is the first breakpoint after thread creation). Then I switch back to thread 1 (the main thread) and "continue" to instantiate the next thread (the main thread is still in the label_0 section of Init_Cores). Thread 1 will execute code (if I single-step through it), but at some point before the next thread is created it deadlocks, and it always deadlocks.
I have also tried single-stepping through thread creation (avoiding "continue"), but it still deadlocks.
According to the answer from Employed Russian at How to continue one thread at a time when debugging a multithreaded program in GDB?, that can happen but he doesn't say that it will always happen. Whether I use "set scheduler-locking on" or "set scheduler-locking step" I get a deadlock.
Maybe this is because it's a shared object, but everything else works with the shared object, so I don't think that's the problem.
Here is the NASM code. The same question would apply to C or C++ code, specifically why gdb deadlocks when I reach a breakpoint on a new thread, then switch back to thread 1 to continue creating threads. Thread 2 (the first created thread) should stop when scheduler-locking is on.
; Header Section
; NASM x86-64, built as a Linux shared object (System V AMD64 ABI).
; Exports Main_Entry_fn and FreeMem_fn; threading via pthreads.
[BITS 64]
[default rel]
global Main_Entry_fn
extern pthread_create, pthread_join, pthread_exit, pthread_self, sched_getcpu
global FreeMem_fn
extern malloc, realloc, free
extern sprintf
section .data align=16
X_ctr: dq 0
data_master_ptr: dq 0 ; pointer to caller-supplied metadata array
initial_dynamic_length: dq 0
XMM_Stack: dq 0, 0, 0, 0, 0, 0, 0 ; 7-qword spill area for saving xmm registers
ThreadID: dq 0 ; pthread_t written by pthread_create
X_ptr: dq 0 ; pointer to caller-supplied input array
X_length: dq 0
X: dq 0
collect_ptr: dq 0
collect_length: dq 0
collect_ctr: dq 0
even_squares_list_ptrs: dq 0, 0, 0, 0
even_squares_list_ctr: dq 0
even_squares_list_length: dq 0
Number_Of_Cores: dq 4 ; scaled to cores*8 at Init_Cores_fn entry
pthread_attr_t: dq 0 ; NOTE(review): passed as the attr* to pthread_create; a zeroed qword is not an initialized pthread_attr_t -- NULL is the usual choice
pthread_arg: dq 0
Join_Ret_Val: dq 0
tcounter: dq 0 ; thread-creation loop counter (advances by 8 per thread)
sched_getcpu_array: times 4 dq 0
ThreadIDLocked: dq 0
spin_lock_core: times 4 dq 0
extra_test_array: times 4 dq 0 ; one qword per core, indexed by core*8
spin_lock_iter: times 4 dq 0
spin_lock_base_addr: dq 0
; __________
section .text
; ----------------------------------------------------------------------
; Init_Cores_fn: creates one thread per core, joining each thread before
; the next is created.
; NOTE(review): because pthread_join directly follows pthread_create,
; threads never run in parallel; under gdb scheduler-locking the stopped
; new thread never exits, so the main thread blocks forever inside
; pthread_join -- this is the deadlock described in the question.
; ----------------------------------------------------------------------
Init_Cores_fn:
%include "/opt/P01_SH/_Include_Utilities/Buffer_Pointer_Arrays.asm"
mov rax,[Number_Of_Cores]
mov rbx,8
mul rbx ; scale core count to a byte stride (cores * 8)
mov [Number_Of_Cores],rax
; _____
; Create Threads
label_0:
; THREAD 1 WORKS IN THIS SECTION TO CREATE THREADS
mov rdi,ThreadID ; ThreadCount
mov rsi,pthread_attr_t ; Thread Attributes
mov rdx,Test_fn ; Function Pointer
mov rcx,pthread_arg
call pthread_create wrt ..plt
mov rdi,[ThreadID] ; id to wait on
mov rsi,Join_Ret_Val ; return value
call pthread_join wrt ..plt ; blocks until the just-created thread exits
mov rax,[tcounter]
add rax,8
mov [tcounter],rax ; advance loop counter by one core (8 bytes)
mov rbx,[Number_Of_Cores]
cmp rax,rbx
jl label_0 ; repeat until tcounter == cores*8
; _____
jmp label_900 ; All threads return here, and exit
; ______________________________________
; ----------------------------------------------------------------------
; Test_fn: pthread start routine. Stores this thread's cpu number (*8)
; into extra_test_array, then returns.
; NOTE(review): 'mul rbx' also clobbers rdx; the push/pop pair below it
; is a no-op; rbx is callee-saved in the SysV ABI and is overwritten
; here without being preserved.
; ----------------------------------------------------------------------
Test_fn:
; Get the core number
call sched_getcpu wrt ..plt ; rax = cpu this thread is running on
mov rbx,8 ; multiply by 8
mul rbx
push rax
pop rax ; no-op: immediately undoes the push above
mov rbx,rax ; rbx = cpu*8, byte offset into per-core arrays
push rax ; balanced by 'pop rax' at label_899
Next_Stop: ; THIS IS WHERE EACH THREAD STOPS
mov rdi,extra_test_array
mov [rdi+rbx],rbx ; extra_test_array[cpu] = cpu*8
jmp label_899 ; ******************
;__________
label_899:
pop rax
ret ; returning from the start routine ends the thread
; __________
; label_900: epilogue reached by the main thread after the create loop;
; returns a pointer to the results array.
label_900:
%include "/opt/P01_SH/_Include_Utilities/Sys_Close_Include.asm"
mov rdi,extra_test_array
mov rax,rdi ; return value: address of extra_test_array
ret
;__________
;Free the memory
; FreeMem_fn: thin wrapper around free(3).
; NOTE(review): the comment below mentions rcx, but on Linux/SysV the
; first argument arrives in rdi; rcx is the Win64 convention.
FreeMem_fn:
;The pointer is passed back in rcx (of course)
sub rsp,40 ; keeps 16-byte stack alignment at the call (8 on entry + 40)
call free wrt ..plt
add rsp,40
ret
; __________
; Main Entry
; ----------------------------------------------------------------------
; Main_Entry_fn(X=rdi, data_master=rsi): exported entry point (SysV).
; Saves registers, loads working registers, runs Init_Cores_fn, then
; restores registers and returns (rax = result pointer from label_900).
; NOTE(review): the xmm save/restore offsets do not line up -- xmm13 is
; saved at [XMM_Stack+0] but that slot is restored into xmm10 (and
; similarly for the others), so the xmm registers come back swapped.
; ----------------------------------------------------------------------
Main_Entry_fn:
push rdi
push rbp
push rbx
push r15
xor r15,r15
push r14
xor r14,r14
push r13
xor r13,r13
push r12
xor r12,r12
push r11
xor r11,r11
push r10
xor r10,r10
push r9
xor r9,r9
push r8
xor r8,r8
movsd [XMM_Stack+0],xmm13
movsd [XMM_Stack+8],xmm12
movsd [XMM_Stack+16],xmm11
movsd [XMM_Stack+24],xmm15
movsd [XMM_Stack+32],xmm14
movsd [XMM_Stack+40],xmm10
mov [X_ptr],rdi ; arg 1: input array pointer
mov [data_master_ptr],rsi ; arg 2: metadata array pointer
; Now assign lengths
lea rdi,[data_master_ptr]
mov rbp,[rdi]
xor rcx,rcx
movsd xmm0,qword[rbp+rcx] ; first metadata slot holds the length as a double
cvttsd2si rax,xmm0
mov [X_length],rax
add rcx,8
; __________
; Write variables to assigned registers
mov r15,0
lea rdi,[rel collect_ptr]
mov r14,qword[rdi]
mov r13,[collect_ctr]
mov r12,[collect_length]
lea rdi,[rel X_ptr]
mov r11,qword[rdi]
mov r10,[X_length]
; __________
call Init_Cores_fn
movsd xmm10,[XMM_Stack+0]
movsd xmm14,[XMM_Stack+8]
movsd xmm15,[XMM_Stack+16]
movsd xmm11,[XMM_Stack+24]
movsd xmm12,[XMM_Stack+32]
movsd xmm13,[XMM_Stack+40]
pop r8
pop r9
pop r10
pop r11
pop r12
pop r13
pop r14
pop r15
pop rbx
pop rbp
pop rdi
ret
;__________
The NASM code is a long listing, but the focus is on label_0 (where threads are created) and Test_fn (the first breakpoint that a new thread reaches).
I would really appreciate any input to this problem with gdb scheduler locking. Thanks.

Your assembly program appears to do this:
pthread_create(&ThreadID, ..., Test_fn, ...);
pthread_join(ThreadID, ...);
First, this doesn't actually give you any parallelism, as you create and immediately wait for the thread you just created. You could just as easily call Test_fn() directly, with less overhead.
Second, if you stop the newly-created thread (as you do with scheduler-locking), then your main thread will block waiting to join that newly-created thread, and it will block forever. So of course your program will deadlock.
Update:
I thought I need to call pthread_join right after pthread_create so all threads will finish before the main thread exits.
That is correct. But the way to do that generally looks like this:
const int NThreads = ...;
pthread_t tids[NThreads];
/* Phase 1: start every thread before joining any, so they overlap. */
for (int j = 0; j < NThreads; j++)
pthread_create(&tids[j], ...);
// All threads have started, and are now running in parallel with the main thread.
// Wait for them to finish.
/* Phase 2: join each thread in turn. */
for (int j = 0; j < NThreads; j++)
pthread_join(tids[j], ...);
// All done.
return 0;

Related

lock cmpxchg fails to execute threads in core order

The following 64-bit NASM code uses lock cmpxchg to take each core in core order, execute some code, then reset the core number variable using xchg so the next core can execute the code. The core number for each core is stored in rbx -- the four cores are numbered 0, 8, 16 and 24. The variable [spin_lock_core] starts at zero and when each core is finished it updates the core number by 8 at the final line xchg [spin_lock_core],rax.
; Take the lock: cmpxchg compares rax with [spin_lock_core]; if equal it
; stores rbx (ZF=1), otherwise it loads rax from memory (ZF=0). With rax
; zeroed on every retry, the store happens only while the lock word is 0.
Spin_lock:
xor rax,rax ; expected value = 0 on every retry
lock cmpxchg [spin_lock_core],rbx
jnz Spin_lock ; retry until the exchange succeeded
; Test
mov rbp,extra_test_array
mov [rbp+rbx],rbx ; record that this core ran (rbx = core*8)
; Execute some code before looping out
mov rax,1234
mov rdx,23435
add rax,rbx
mov rcx,rax
;jmp label_899
mov rax,rbx
add rax,8
xchg [spin_lock_core],rax ; release: publish next core's id (core*8 + 8)
But before the code reaches xchg [spin_lock_core],rax the first core loops out of the program (jmp label_899), which should cause the other threads to freeze because they would be waiting for the [spin_lock_core] var to be updated, which never happens. But instead all four cores are written to the output array extra_test_array, which is displayed on the terminal when the program exits. In other words, this fails to stop the cores until the core number is updated.
The full, minimal code is below (as minimal as NASM can be in this case). The code is written for a shared object, and it's reproducible if it gets an input array (as written it doesn't matter if the input array is int or float):
; Header Section
; NASM x86-64 Linux shared object (SysV ABI); pthread-based test harness.
[BITS 64]
[default rel]
global Main_Entry_fn
extern pthread_create, pthread_join, pthread_exit, pthread_self, sched_getcpu
global FreeMem_fn
extern malloc, realloc, free
extern sprintf
section .data align=16
X_ctr: dq 0
data_master_ptr: dq 0 ; pointer to caller-supplied metadata array
initial_dynamic_length: dq 0
XMM_Stack: dq 0, 0, 0, 0, 0, 0, 0 ; spill area for xmm save/restore
ThreadID: dq 0
X_ptr: dq 0
X_length: dq 0
X: dq 0
collect_ptr: dq 0
collect_length: dq 0
collect_ctr: dq 0
even_squares_list_ptrs: dq 0, 0, 0, 0
even_squares_list_ctr: dq 0
even_squares_list_length: dq 0
Number_Of_Cores: dq 32 ; already scaled: 4 cores * 8 bytes
pthread_attr_t: dq 0
pthread_arg: dq 0
Join_Ret_Val: dq 0
tcounter: dq 0 ; thread-creation loop counter
sched_getcpu_array: times 4 dq 0
ThreadIDLocked: dq 0
spin_lock_core: dq 0 ; lock word polled by the spinlock in Test_fn
extra_test_array: dq 0 ; NOTE(review): only ONE qword, but Test_fn stores at offsets 0..24 -- cores 1-3 write past this variable
; __________
section .text
; Init_Cores_fn: creates and immediately joins one thread per core, so
; thread execution is fully serialized (see the first listing above).
Init_Cores_fn:
; _____
; Create Threads
label_0:
mov rdi,ThreadID ; ThreadCount
mov rsi,pthread_attr_t ; Thread Attributes
mov rdx,Test_fn ; Function Pointer
mov rcx,pthread_arg
call pthread_create wrt ..plt
mov rdi,[ThreadID] ; id to wait on
mov rsi,Join_Ret_Val ; return value
call pthread_join wrt ..plt ; each thread is joined before the next starts
mov rax,[tcounter]
add rax,8
mov [tcounter],rax
mov rbx,[Number_Of_Cores]
cmp rax,rbx
jl label_0 ; loop until tcounter reaches Number_Of_Cores (32)
; _____
jmp label_900 ; All threads return here, and exit
; ______________________________________
; ______________________________________
Test_fn:
; Get the core number
call sched_getcpu wrt ..plt
mov rbx,8 ; multiply by 8
mul rbx
push rax
pop rax
mov rbx,rax
push rax
Spin_lock:
lock cmpxchg [spin_lock_core],rbx
jnz Spin_lock
; Test
mov rbp,extra_test_array
mov [rbp+rbx],rbx
; Execute some code before looping out
mov rax,1234
mov rdx,23435
add rax,rbx
mov rcx,rax
jmp label_899
mov rax,rbx
add rax,8
xchg [spin_lock_core],rax
;__________
label_899:
pop rax
ret
; __________
label_900:
mov rdi,extra_test_array ;audit_array
mov rax,rdi
ret
;__________
;Free the memory
; FreeMem_fn: wrapper around free(3).
; NOTE(review): on Linux/SysV the pointer argument is rdi, not rcx as the
; comment below suggests.
FreeMem_fn:
;The pointer is passed back in rcx (of course)
sub rsp,40 ; keeps 16-byte alignment at the call
call free wrt ..plt
add rsp,40
ret
; __________
; Main Entry
; ----------------------------------------------------------------------
; Main_Entry_fn(X=rdi, data_master=rsi): exported entry point (SysV).
; Identical in structure to the first listing: save registers, load
; working registers, run Init_Cores_fn, restore, return.
; NOTE(review): as in the first listing, the xmm restore offsets do not
; match the save offsets, so the xmm registers come back swapped.
; ----------------------------------------------------------------------
Main_Entry_fn:
push rdi
push rbp
push rbx
push r15
xor r15,r15
push r14
xor r14,r14
push r13
xor r13,r13
push r12
xor r12,r12
push r11
xor r11,r11
push r10
xor r10,r10
push r9
xor r9,r9
push r8
xor r8,r8
movsd [XMM_Stack+0],xmm13
movsd [XMM_Stack+8],xmm12
movsd [XMM_Stack+16],xmm11
movsd [XMM_Stack+24],xmm15
movsd [XMM_Stack+32],xmm14
movsd [XMM_Stack+40],xmm10
mov [X_ptr],rdi ; arg 1: input array pointer
mov [data_master_ptr],rsi ; arg 2: metadata array pointer
; Now assign lengths
lea rdi,[data_master_ptr]
mov rbp,[rdi]
xor rcx,rcx
movsd xmm0,qword[rbp+rcx] ; metadata slot 0 = length as a double
cvttsd2si rax,xmm0
mov [X_length],rax
add rcx,8
; __________
; Write variables to assigned registers
mov r15,0
lea rdi,[rel collect_ptr]
mov r14,qword[rdi]
mov r13,[collect_ctr]
mov r12,[collect_length]
lea rdi,[rel X_ptr]
mov r11,qword[rdi]
mov r10,[X_length]
; __________
call Init_Cores_fn
movsd xmm10,[XMM_Stack+0]
movsd xmm14,[XMM_Stack+8]
movsd xmm15,[XMM_Stack+16]
movsd xmm11,[XMM_Stack+24]
movsd xmm12,[XMM_Stack+32]
movsd xmm13,[XMM_Stack+40]
pop r8
pop r9
pop r10
pop r11
pop r12
pop r13
pop r14
pop r15
pop rbx
pop rbp
pop rdi
ret
The instruction "lock cmpxchg" should fail until the [spin_lock_core] variable is updated, but it doesn't do that.
Thanks for any help in understanding why lock cmpxchg doesn't prevent the cores after core zero from firing in this area of code.
UPDATE: other research shows that xor rax,rax is needed at the top of the Spin_lock: section. When I insert that line, it reads like this:
Spin_lock:
xor rax,rax
lock cmpxchg [spin_lock_core],rbx
jnz Spin_lock
With that change it freezes, as expected. But when I remove the line jmp label_899 it still freezes, but it shouldn't do that.
EDIT 122219:
Based on the comments on this question yesterday, I revised the spinlock code to (1) eliminate atomic operations in favor of faster mov and cmp instructions, (2) assign a unique memory location to each core, and (3) separate the memory locations by > 256 bytes to avoid memory on the same cache line.
Each core's memory location will be changed to 1 when the previous core is finished. When each core finishes, it sets its own memory location back to 0.
The code successfully executes core 0 IF I have all other cores loop out before the spinlock. When I let all four cores run through the spinlock, the program again hangs.
I've verified that each separate memory location is set to 1 when the previous core is finished.
Here's the updated spinlock section:
section .data
spin_lock_core: times 140 dq 0 ; per-core flag words, padded far apart
spin_lock_core_offsets: dq 0,264,528,792 ; byte offsets, >256 apart to keep flags on separate cache lines
section .text
; Calculate the offset to spin_lock_core
mov rbp,spin_lock_core
mov rdi,spin_lock_core_offsets
mov rax,[rdi+rbx] ; rbx = core*8 selects this core's offset
add rbp,rax ; rbp = address of this core's flag
; ________
; NOTE(review): every core waits for its OWN flag to become 1, but all
; flags start at 0 and only a finishing core sets the NEXT core's flag.
; Nothing ever sets core 0's flag, so all cores spin forever -- this is
; why the code hangs.
Spin_lock:
pause
cmp byte[rbp],1 ; wait until the previous core sets our flag
jnz Spin_lock
xor rax,rax
mov [rbp],rax ; Set current memory location to zero
; Execute some code before looping out
mov rax,1234
mov rdx,23435
add rax,rdx
mov rcx,rax
; Loop out if this is the last core
mov rax,rbx
add rax,8
cmp rax,[Number_Of_Cores]
jge label_899
; Set next core to 1 by adding 264 to the base address
add rbp,264
mov rax,1
mov [rbp],rax ; release: wake the next core
Why does this code still hang?
I don't think you should use cmpxchg for this at all. Try this:
; Suggested lock without cmpxchg: each core waits until the lock word
; equals its own id (rbx = core*8), then hands off id+8 when done.
Spin_lock:
pause
cmp [spin_lock_core],rbx ; spin until it is this core's turn
jnz Spin_lock
; Test
mov rbp,extra_test_array
mov [rbp+rbx],rbx ; record this core as having run
; Execute some code before looping out
mov rax,1234
mov rdx,23435
add rax,rbx
mov rcx,rax
;jmp label_899
lea rax,[rbx+8]
mov [spin_lock_core],rax ; hand the lock word to the next core
I solved this spinlock problem, but after Peter Cordes' comment below I see that it is not correct. I won't delete this answer because I hope it can lead to the solution.
I use lock cmpxchg [rbp+rbx],rbx, which assembles without error, but the NASM assembler should return an "invalid combination of operands" error because the source operand can only be rax, so it shouldn't assemble with any other register. I also note that the online resources (for example, https://www.felixcloutier.com/x86/cmpxchg) show the format as CMPXCHG r/m64,r64, but the source operand can't be any r64 -- it must be rax, as that entry goes on to say.
Without the "mov rax,rbx" line it works because on the first iteration the rax register is set to 0 which matches the memory location. On the second iteration it succeeds by default.
When I add "mov rax,rbx" -- which resets rax -- the program once again hangs. I would really appreciate any ideas on why this program should hang as written.
At the start of this block rbx is the core number:
section .data
spin_lock_core: times 4 dq 0 ; one qword flag per core
section .text
[ Code leading up to this spinlock section shown above ]
mov rbp,spin_lock_core
Spin_lock:
pause
mov rax,rbx ; expected value = this core's id (core*8)
lock cmpxchg [rbp+rbx],rax ; succeeds when this core's slot equals its id
jnz Spin_lock
mov rax,rbx
add rax,8
cmp rax,[Number_Of_Cores]
jge spin_lock_out ; last core: nothing to hand off
xchg [rbp+rax],rax ; store the next core's id (rbx+8) into its slot
spin_lock_out:
The differences from my original post are:
Each core spins on (and reads from) its own unique memory location.
I use the "pause" instruction on the spinlock.
Each unique memory location is updated in core order.
But it does not work when I include mov rax,rbx. Intuitively that should work, so I will really appreciate any ideas on why it doesn't in this case.

Windows multicore program worked before but suddenly threads execute randomly

Below is a multithreaded (4-core) NASM 64 program. It's not minimal, but it's complete; I posted the complete code because a minimal example may not reveal the problem in this code.
This is my first multithreaded multicore program in NASM, and I was happy to see yesterday afternoon that it worked correctly (all four cores) and returned the values I was expecting. I ran it successfully several times yesterday and again several times this morning.
Suddenly, about 10 minutes after my last run, without making any changes to the code, without re-assembling the dll, I ran it again and this time the threads execute randomly -- not all threads execute, and the configuration of threads that execute varies from run to run -- sometimes thread 1 only; sometimes threads 2-4 but not thread 1, sometimes only thread 2 and 4, etc.
My code did not have ExitThread or CloseHandle calls, so I added them in (see label_899:), rebooted and ran it again. The first run after a fresh reboot still shows random execution of threads.
I thought the problem may be that CreateThread was failing, so I also returned the thread handles. Every time, all four thread handles were created even though all did not execute, so that's not it.
I have done a lot of research but I haven't seen this issue discussed at all. Even though this is NASM, the same thing could also be applicable to C or C++ multithreading. I use WaitForMultipleObjects to wait on all threads to complete.
This is a dll, and the entry point is Main_Entry_fn. The dll is called from Python using ctypes, but I doubt that makes any difference. I use ctypes frequently and have never had a problem with it.
Thanks for any ideas.
P.S. ignore the realloc code because the buffers are large enough that realloc is never called.
UPDATE: Here is how I constructed the program flow: the entry point is Main_Entry_fn toward the bottom of the program listing (it's an export because it has to be exported for a dll call). Main_Entry_fn takes the input data from the calling function, preserves registers (because the whole program is register optimized) and allocates output buffers. Main_Entry_fn then calls Init_Cores_fn (top of program listing), where we create four arrays with data for the registers on entry to each core (remember all vars are stored in registers -- see the call to DupThreadInfo), and creates four threads in a loop. The threads all call Prime_Number_fn, and the threads are set to start immediately upon creation.
Register assignments:
r15 list_of_results_ptr (pointer to list_of_results)
r14 list_of_results_ctr (counter for list_of_results)
r13 list_of_results_length
r12 numbers_ptr (pointer to "numbers" input array)
r11 numbers_ctr
r10 numbers_length
r9 num
r8 i
xmm15 num_float
xmm14: result
; Header Section
; NASM x86-64 Windows DLL (Microsoft x64 calling convention).
; Exports Main_Entry_fn / FreeMem_fn; threading via CreateThread.
[BITS 64]
[default rel]
extern malloc, calloc, realloc, free
global Main_Entry_fn
export Main_Entry_fn
global FreeMem_fn
export FreeMem_fn
extern CreateThread, CloseHandle, ExitThread
extern WaitForMultipleObjects, GetCurrentThread
section .data align=16
Return_Pointer_Array: dq 0, 0, 0
Input_Length_Array: dq 0, 0,
list_of_results_ptr: dq 0
list_of_results_ctr: dq 0
list_of_results_length: dq 0
data_master_ptr: dq 0
initial_dynamic_length: dq 0
internal_dynamic_length: dq 5000
XMM_Stack: dq 0, 0, 0 ; spill area for xmm13-xmm15
numbers_ptr: dq 0 ; caller-supplied input array
numbers_length: dq 0
numbers: dq 0
list_of_results: dq 0
num_float: dq 0.0
loop_counter_401: dq 0
numbers_ctr: dq 0
num: dq 0
result: dq 0
const_1: dq 1
i: dq 0
range_loop_start_i: dq 0
range_loop_end_i: dq 0
const_0: dq 0
; New vars for threads:
ThreadCount: times 4 dq 0
ThreadInfo: times 10 dq 0 ; per-thread parameter block (layout filled in Init_Cores_fn)
ThreadInfo2: times 10 dq 0
ThreadInfo3: times 10 dq 0
ThreadInfo4: times 10 dq 0
TestInfo: times 4 dq 0
ThreadHandles: times 4 dq 0 ; handles returned by CreateThread
Return_Data_Array: times 8 dq 0 ; ptr,ptr,ptr,ptr,length,length,length,length
StartByte: dq 0 ; 0/8/16/24 -- selects the thread slot being created
stride: dq 8
output_buffer_pointers: times 4 dq 0 ; for malloc
Division_Size: dq 0
Division_Start: dq 0
section .text
; ______________________________________
; Init_Cores_fn: splits the input into 4 divisions, fills the per-thread
; ThreadInfo blocks, starts 4 threads with CreateThread, then waits.
Init_Cores_fn:
; Calculate the data divisions
mov rax,r10 ; r10 = numbers_length (set by Main_Entry_fn)
mov rbx,4 ;cores
xor rdx,rdx ; clear rdx before unsigned div
div rbx ; rax = length / 4
mov [Division_Size],rax
mov [Division_Start],rax
; Populate the ThreadInfo array with vars to pass
; ThreadInfo: length, startbyte, stride, vars into registers on entry to each core
mov rdi,ThreadInfo
mov rax,0 ;ThreadInfoLength
mov [rdi],rax ; length (number of vars into registers plus 3 elements)
mov rax,[stride]
mov [rdi+16],rax ; 8 x number of cores (32 in this example)
; Vars (these registers are populated on main entry)
mov rbp,output_buffer_pointers
mov rax,[rbp]
mov [rdi+24],rax ;for r15
mov [rdi+32],r14
mov [rdi+40],r13
mov [rdi+48],r12
mov rax,[Division_Start]
mov [rdi+56],rax
mov rax,0
mov [rdi+64],rax
call DupThreadInfo ; fill ThreadInfo2..4 the same way
mov rbp,rsp ; preserve caller's stack frame
sub rsp,56 ; Shadow space (was 32)
; _____
; Select the ThreadInfo block matching the current StartByte (0/8/16/24).
label_0:
mov rax,[StartByte]
cmp rax,0
jne sb2
mov rdi,ThreadInfo
jmp sb5
sb2:cmp rax,8
jne sb3
mov rdi,ThreadInfo2
jmp sb5
sb3:cmp rax,16
jne sb4
mov rdi,ThreadInfo3
jmp sb5
sb4:cmp rax,24
jne sb5
mov rdi,ThreadInfo4
sb5:
; _____
; Create Threads
; CreateThread(lpThreadAttributes=rcx, dwStackSize=rdx, lpStartAddress=r8,
;              lpParameter=r9, dwCreationFlags=[rsp+32], lpThreadId=[rsp+40])
mov rcx,0 ; lpThreadAttributes (Security Attributes)
mov rdx,0 ; dwStackSize
mov r8,PrimeNumber_fn ; lpStartAddress (function pointer)
mov r9,rdi ; lpParameter (array of data passed to each core)
mov rax,0
mov [rsp+32],rax ; use default creation flags
mov rdi,ThreadCount
mov [rsp+40],rdi ; ThreadID
call CreateThread
; Move the handle into ThreadHandles array (returned in rax)
mov rdi,ThreadHandles
mov rcx,[StartByte]
mov [rdi+rcx],rax
mov rax,[StartByte]
add rax,8
mov [StartByte],rax
mov rbx,32 ; Four cores
cmp rax,rbx
jl label_0
; _____
; Wait
; NOTE(review): r8 here is bWaitAll; 0 means wait for ANY one handle to
; signal, not all of them -- pass 1 (TRUE) to wait for every thread.
; Returning before all threads finish could explain the "random threads"
; behavior described above.
mov rcx,4 ;rax ; number of handles
mov rdx,ThreadHandles ; pointer to handles array
mov r8,0 ; wait for all threads to complete
mov r9,5000 ; milliseconds to wait
call WaitForMultipleObjects
; _____
mov rsp,rbp ; discard shadow space
jmp label_900
; ______________________________________
; PrimeNumber_fn: Win64 ThreadProc (parameter block pointer in rcx).
; Walks this thread's division of the input, trial-dividing each value
; and appending results to the thread's output buffer; on exit it stores
; the buffer pointer and counter into Return_Data_Array.
PrimeNumber_fn:
; Populate registers
;(note: rcx is the return value for ThreadProc)
mov rdi,rcx ; rdi = this thread's ThreadInfo block
mov rax,[rdi]
mov r15,[rdi+24] ; r15 = output buffer pointer
mov r13,[rdi+40] ; r13 = output buffer length
mov r12,[rdi+48] ; r12 = input array pointer
mov r10,[rdi+56] ; r10 = end byte of this division
xor r11,r11
xor r9,r9
mov r8,[rdi+8] ; start byte
pxor xmm15,xmm15
pxor xmm15,xmm14 ; NOTE(review): xors the incoming (unspecified) xmm14/xmm13 into xmm15 -- presumably each was meant to be zeroed individually; verify
pxor xmm15,xmm13
; Get the ThreadID based on startbyte
mov rax,[rdi+72] ; 0, 8, 16, 24 for 4-core
push rax ; saved for indexing Return_Data_Array at label_899
;______
; Outer loop: fetch the next input number into xmm15 / r9.
label_401:
mov rdi,r12 ; Pointer
cmp r8,r10
jge label_899 ; past the end of this division -> finish
movsd xmm0,qword[rdi+r8]
movsd xmm15,xmm0
add r8,8
;______
cvttsd2si rax,xmm15
mov r9,rax ; r9 = candidate value as an integer
;______
label_8010:
label_801:
cmp r9,[const_1]
jle label_401 ; values <= 1 are skipped
;______
; Divisor loop: xmm13 counts divisors upward, compared against r9.
label_12010:
mov rax,2
sub rax,1
movq xmm13,rax ; divisor counter starts at 1 (incremented before use)
label_1201:
movq rcx,xmm13
movq xmm1,[const_1]
addsd xmm13,xmm1
movq rcx,xmm13
mov rdx,r9
cmp rcx,rdx
jge label_12020 ; divisor reached the candidate -> exit divisor loop
;______
label_16010:
label_1601:
mov rax,r9
movq rbx,xmm13
xor rdx,rdx ; clear rdx before unsigned div
div rbx ; rdx = r9 mod divisor
mov rax,rdx
mov rdx,0
cmp rax,rdx
jne label_16020 ; nonzero remainder -> record candidate, continue loop
;______
movq xmm14,[const_0] ; divisible -> result 0
;______
movq rax,xmm14
cvtsi2sd xmm0,rax
movsd [r15+r14],xmm14 ; append result to output buffer
add r14,8
mov rax,r13
cmp r14,rax
jl next_17 ; buffer not yet full
;[Irrelevant code omitted]
next_17:
;______
jmp label_401 ; next input value
;______
label_1602:
label_16020:
;______
mov rax,r9
cvtsi2sd xmm14,rax ; result = candidate value
;______
movq rax,xmm14
cvtsi2sd xmm0,rax
movsd [r15+r14],xmm14 ; append result
add r14,8
mov rax,r13
cmp r14,rax
jl next_21
;[Irrelevant code omitted]
next_21:
jmp label_1201 ; continue the divisor loop
;______
label_1202:
label_12020:
;______
movq xmm14,[const_0] ; divisor loop exhausted -> result 0
;______
movq rax,xmm14
cvtsi2sd xmm0,rax
movsd [r15+r14],xmm14
add r14,8
mov rax,r13
cmp r14,rax
jl next_25
;[Irrelevant code omitted]
next_25:
jmp label_401
;______
; Thread epilogue: publish this thread's results, then terminate.
label_899:
pop rax ; rax = thread slot (0/8/16/24)
mov rdi,Return_Data_Array
mov [rdi+rax],r15
mov [rdi+rax+32],r14 ; 32 = four cores
call ExitThread ; NOTE(review): ExitThread does not return (and rcx/exit code is not set), so the CloseHandle sequence and ret below are unreachable
call GetCurrentThread
mov rcx,rax
call CloseHandle
ret
; __________
; label_900: reached by the main thread after WaitForMultipleObjects;
; returns the Return_Data_Array pointer to the caller.
label_900:
exit_label_for_PrimeNumber_fn:
mov rdi,Return_Data_Array ; Final return to the calling process
mov rax,rdi
ret
;__________
;Free the memory
; FreeMem_fn: Win64 wrapper around free(); pointer argument arrives in rcx.
FreeMem_fn:
sub rsp,40 ; shadow space for the call
call free
add rsp,40
ret
; __________
; Main Entry
; ----------------------------------------------------------------------
; Main_Entry_fn(numbers=rcx, data_master=rdx): exported DLL entry point
; (Microsoft x64 convention). Saves registers, allocates four output
; buffers, loads working registers, and runs Init_Cores_fn.
; Returns rax = Return_Data_Array (set at label_900).
; ----------------------------------------------------------------------
Main_Entry_fn:
push rdi
push rbp
push rbx
push r15
xor r15,r15
push r14
xor r14,r14
push r13
xor r13,r13
push r12
xor r12,r12
push r11
xor r11,r11
push r10
xor r10,r10
push r9
xor r9,r9
movsd [XMM_Stack+0],xmm15
movsd [XMM_Stack+8],xmm14
movsd [XMM_Stack+16],xmm13
push r8
xor r8,r8
mov [numbers_ptr],rcx ; arg 1: input array
mov [data_master_ptr],rdx ; arg 2: metadata array
; Now assign lengths
lea rdi,[data_master_ptr]
mov rbp,[rdi]
xor rcx,rcx
movsd xmm0,qword[rbp+rcx] ; metadata slot 0 = length as a double
cvttsd2si rax,xmm0
mov [numbers_length],rax
add rcx,8
; __________
; malloc for dynamic arrays
; NOTE(review): the size computed below is discarded -- malloc_next
; unconditionally overwrites rax with 50000000, so every buffer is a
; fixed 50 MB regardless of input size.
lea rdi,[data_master_ptr]
mov rbp,[rdi]
movsd xmm0,qword[rbp]
cvttsd2si rax,xmm0
mov r8,rax
mov rdx,10
mul rdx
mov rdx,10000000
cmp rax,rdx
jl malloc_next
mov rax,r8
malloc_next:
mov rax,50000000
mov [initial_dynamic_length],rax
;__________
; Output buffer #1
mov rcx,qword[initial_dynamic_length] ; Initial size
xor rax,rax
sub rsp,40 ; shadow space; NOTE(review): with the 11 pushes above this appears to leave rsp at 8 mod 16 at the call -- verify Win64 16-byte alignment
call malloc
mov rdi,output_buffer_pointers
mov [rdi],rax
add rsp,40
mov rax,qword[initial_dynamic_length]
mov [list_of_results_length],rax
;__________
; Output buffer #2
mov rcx,qword[initial_dynamic_length] ; Initial size
xor rax,rax
sub rsp,40
call malloc
mov rdi,output_buffer_pointers
mov [rdi+8],rax
add rsp,40
mov rax,qword[initial_dynamic_length]
mov [list_of_results_length],rax
;__________
; Output buffer #3
mov rcx,qword[initial_dynamic_length] ; Initial size
xor rax,rax
sub rsp,40
call malloc
mov rdi,output_buffer_pointers
mov [rdi+16],rax
add rsp,40
mov rax,qword[initial_dynamic_length]
mov [list_of_results_length],rax
;__________
; Output buffer #4
mov rcx,qword[initial_dynamic_length] ; Initial size
xor rax,rax
sub rsp,40
call malloc
mov rdi,output_buffer_pointers
mov [rdi+24],rax
add rsp,40
mov rax,qword[initial_dynamic_length]
mov [list_of_results_length],rax
; __________
; Write variables to assigned registers
lea rdi,[rel list_of_results_ptr]
mov r15,qword[rdi] ; NOTE(review): list_of_results_ptr is never written, so r15 loads 0 here; the threads receive output_buffer_pointers values via ThreadInfo instead
mov r14,[list_of_results_ctr]
mov r13,[list_of_results_length]
lea rdi,[rel numbers_ptr]
mov r12,qword[rdi]
mov r11,[numbers_ctr]
mov r10,[numbers_length]
mov r9,[num]
movsd xmm15,[num_float]
movsd xmm14,[result]
movsd xmm13,[i]
mov r8,[loop_counter_401]
; __________
call Init_Cores_fn
exit_label_for_Main_Entry_fn:
pop r8
movsd xmm13,[XMM_Stack+0] ; NOTE(review): restore offsets are crossed -- xmm15 was saved at +0 but xmm13 is restored from it (and vice versa at +16)
movsd xmm14,[XMM_Stack+8]
movsd xmm15,[XMM_Stack+16]
pop r9
pop r10
pop r11
pop r12
pop r13
pop r14
pop r15
pop rbx
pop rbp
pop rdi
ret
;_____________
; DupThreadInfo: clones the ThreadInfo block into ThreadInfo2/3/4,
; advancing the division start/end bytes and the per-thread constants
; (8/16/24) used for buffer selection, start byte, and thread-slot id.
DupThreadInfo:
mov rdi,ThreadInfo2
; StartByte and EndByte
mov rax,[Division_Start]
mov [rdi+8],rax ; start byte of division 2
add rax,[Division_Size]
mov [Division_Start],rax ; advance for the next division
sub rax,8
mov [rdi+56],rax ; end byte of division 2
mov rax,[stride]
mov [rdi+16],rax ; 8 x number of cores (32 in this example)
; Vars (these registers are populated on main entry)
mov rbp,output_buffer_pointers
mov rax,[rbp+8]
mov [rdi+24],rax ; thread 2's output buffer
mov rax,8
mov [rdi+32],rax
mov [rdi+40],r13
mov [rdi+48],r12
; 56 is above
mov rax,8
mov [rdi+64],rax
mov rax,8 ; Thread number based on startbyte, for Return_Data_Array
mov [rdi+72],rax
; _____
mov rdi,ThreadInfo3
; StartByte and EndByte
mov rax,[Division_Start]
mov [rdi+8],rax ; start byte of division 3
mov rbx,[Division_Size]
add rax,rbx
mov [Division_Start],rax
sub rax,8
mov [rdi+56],rax ; end byte of division 3
mov rax,[stride]
mov [rdi+16],rax ; 8 x number of cores (32 in this example)
; Vars (these registers are populated on main entry)
mov rbp,output_buffer_pointers
mov rax,[rbp+16]
mov [rdi+24],rax ; thread 3's output buffer
mov rax,16
mov [rdi+32],rax
mov [rdi+40],r13
mov [rdi+48],r12
; [rdi+56] is above
mov rax,16
mov [rdi+64],rax
mov rax,16 ; Thread number based on startbyte, for Return_Data_Array
mov [rdi+72],rax
mov rdi,ThreadInfo4
; StartByte and EndByte
mov rax,[Division_Start]
mov [rdi+8],rax ; start byte of division 4
mov rax,[numbers_length] ; final segment goes to the end
mov [rdi+56],rax
mov rax,[stride]
mov [rdi+16],rax ; 8 x number of cores (32 in this example)
; Vars (these registers are populated on main entry)
mov rbp,output_buffer_pointers
mov rax,[rbp+24]
mov [rdi+24],rax ; thread 4's output buffer
mov rax,24
mov [rdi+32],rax
mov [rdi+40],r13
mov [rdi+48],r12
; [rdi+56] is above
mov rax,24
mov [rdi+64],rax
mov rax,24 ; Thread number based on startbyte, for Return_Data_Array
mov [rdi+72],rax
ret

linux debugger detection in multi-thread application using ptrace

I have to implement debugger detection technique under linux. So the main idea is, my piece of code creates second thread via syscall clone. After that, created thread is supposed to check if debugger is present in while loop, sleeping for a few seconds. My question is how to implement debugger detection via ptrace in multi-thread environment inside infinite loop. My problem is that after calling ptrace(PTRACE_TRACEME, 0, 1, 0) for a second time debugger is detected (that's reasonable and correct of course). So do I have to detach tracer somehow at the end of loop or use ptrace in another way? Here is a piece of code:
; Debugger-detection loop: each pass calls ptrace(PTRACE_TRACEME), prints
; the verdict, sleeps ~5 s, and repeats.
new_thread:
; PTRACE
; ptrace syscall args: rdi=request (0 = PTRACE_TRACEME), rsi=pid,
; rdx=addr, r10=data.
xor rdi, rdi
xor rsi, rsi
xor rdx, rdx
inc rdx ; rdx (addr) = 1
xor r10, r10
mov rax, 101 ; ptrace syscall
syscall
cmp rax, 0
jge __nondbg ; non-negative return: TRACEME succeeded, no tracer seen
; Inline-string idiom: 'call' pushes the address of the db string below
; as the return address, which _dbg pops into rsi as the write() buffer.
call _dbg
db 'debugged!', 0xa, 0
_dbg:
mov rdi, 1 ; fd = stdout
pop rsi ; rsi = address of 'debugged!'
mov rdx, 10 ; byte count
mov rax, 1 ; syscall write
syscall
; exit_group call
mov rdi, 127 ; exit status
mov rax, 231 ; exit_group syscall
syscall
__nondbg:
call _nondbg
db 'non-debugged!', 0xa, 0
_nondbg:
mov rdi, 1 ; fd = stdout
pop rsi ; rsi = address of 'non-debugged!'
mov rdx, 14 ; byte count
mov rax, 1 ; syscall write
syscall
; ==========
; SLEEP.....
; ==========
; Build struct timespec {tv_sec=5, tv_nsec=0} on the stack (tv_sec ends
; up at the lower address, where rsp points after the second push).
push 0 ; value should be a parameter
push 5 ; value should be a parameter
mov rdi, rsp ; rdi -> timespec
xor rsi, rsi ; no 'remaining time' out-parameter
mov rax, 35 ; syscall nanosleep
syscall ; syscall
pop rax ; discard tv_sec
pop rax ; discard tv_nsec
jmp new_thread ; NOTE(review): the second PTRACE_TRACEME fails because the task is already traced -- the cause of the repeated 'debugged!' described above
I don't know if your design forces you to try a loop detection. PTRACE_TRACEME is used by a tracee process to be traced by its parent (after fork). I admit I don't know for sure how this would work when the tracer is another thread in the same process, but I think it wouldn't work very well, as the mechanism of ptrace is based on signals.
If you want to be sure that your (child) process is being attached to by the tracer, the common approach is to raise a stop signal to allow the tracer to attach. When the execution is resumed, you know the tracer is there.
raise(SIGSTOP);

nasm assembly linux timer or sleep

I'm trying to find a way to make my code wait for two seconds before proceeding. I'm using nasm for Linux in protected mode, so I can only use int 80h. I found a syscall called "alarm" (27) and another called "pause" (29). However, when I try to use those, the program waits and finishes instead of continuing execution. I've also found another syscall, sigaction, which changes the behavior of a signal (so I think it can be used to make the program ignore the signal generated by alarm instead of exiting) but I didn't quite understand how sigaction works. Thanks for any help.
Useful links:http://man7.org/linux/man-pages/man2/alarm.2.html
http://man7.org/linux/man-pages/man2/sigaction.2.html
There is a system call for sleeping the program, sys_nanosleep:
sys_nanosleep : eax = 162, ebx = struct timespec *, ecx = struct timespec *
this struct timespec structure has two members:
;; This is for 32-bit. Note that x86-64 uses 2x 64-bit members
tv_sec ; 32 bit seconds
tv_nsec ; 32 bit nanoseconds
this structure can be declared in nasm as:
section .data
timeval:
tv_sec dd 0
tv_usec dd 0
and then you set the values and call it as:
mov dword [tv_sec], 5
mov dword [tv_usec], 0
mov eax, 162
mov ebx, timeval
mov ecx, 0
int 0x80
the program then will sleep for 5 seconds. A complete example:
; 32-bit Linux example: print "Sleep", sleep 5 s via sys_nanosleep
; (eax=162), print "Continue", then exit.
global _start
section .text
_start:
; print "Sleep"
mov eax, 4 ; sys_write
mov ebx, 1 ; fd = stdout
mov ecx, bmessage
mov edx, bmessagel
int 0x80
; Sleep for 5 seconds and 0 nanoseconds
mov dword [tv_sec], 5
mov dword [tv_usec], 0
mov eax, 162 ; sys_nanosleep
mov ebx, timeval ; ebx -> struct timespec
mov ecx, 0 ; NULL: ignore remaining time
int 0x80
; print "Continue"
mov eax, 4
mov ebx, 1
mov ecx, emessage
mov edx, emessagel
int 0x80
; exit
mov eax, 1 ; sys_exit
mov ebx, 0
int 0x80
section .data
timeval:
tv_sec dd 0 ; seconds field
tv_usec dd 0 ; nanoseconds field (name kept from the text above)
bmessage db "Sleep", 10, 0
bmessagel equ $ - bmessage
emessage db "Continue", 10, 0
emessagel equ $ - emessage
With NASM, if you are targeting Linux x86-64, you can simply do something similar to the following:
global _start
section .data
; struct timespec for 64-bit sys_nanosleep: two 64-bit fields.
timespec:
tv_sec dq 1 ; seconds
tv_nsec dq 200000000 ; nanoseconds (0.2 s)
section .text
_start:
mov rax, 35 ; sys_nanosleep (x86-64 syscall number)
mov rdi, timespec ; rdi -> requested sleep duration
xor rsi, rsi ; rsi = NULL: discard remaining time if interrupted
syscall
...
35 corresponds to the 64-bit system call number for sys_nanosleep (as listed here). If the call is interrupted, the remaining sleep time is written to the memory location pointed by register rsi; in this example rsi is set to 0 to ignore the value if it happens. This call will sleep for tv_sec seconds + tv_nsec nanoseconds, 1.2 seconds in the above code snippet.
More information about this system call can be found in the nanosleep man page.

Does int 0x80 overwrite register values? [duplicate]

This question already has an answer here:
What happens if you use the 32-bit int 0x80 Linux ABI in 64-bit code?
(1 answer)
Closed 4 years ago.
I wrote a program which is supposed to behave like a for/while loop, printing a string of text a certain number of times.
Here is the code:
global _start
section .data
msg db "Hello World!",10 ; define the message
msgl equ $ - msg ; define message length
; use minimal size of storage space
imax dd 0x00001000 ; defines imax to be big!
section .text
_start:
mov r8, 0x10 ; <s> put imax in r8d, this will be our 'i' </s>
; just attempt 10 iterations
_loop_entry: ; loop entry point
mov eax, 4 ; setup the message to print
mov ebx, 1 ; write, stdout, message, length
mov ecx, msg
mov edx, msgl
int 0x80 ; print message
; this is valid because registers do not change
; NOTE(review): that assumption is wrong -- as established in the answer
; below, r8-r11 are not preserved across int 0x80 in 64-bit code, which
; is why this loop never terminates.
dec r8 ; decrease i and jump on not zero
cmp r8,1 ; compare values to jump
jnz _loop_entry
mov rax, 1 ; exit with zero
mov rbx, 0
int 0x80
The problem I have is the program runs into an infinite loop. I ran it inside gdb and the cause is:
int 0x80 is called to print the message, and this works correctly, however after the interrupt finishes, the contents of r8 is set to zero, rather than the value it should be. r8 is where the counter sits, counting (down) the number of times the string is printed.
Does int 0x80 modify register values? I noticed that rax, rbx, rcx, rdx were not affected in the same way.
Test Results
Answer: YES! It does modify r8.
I have changed two things in my program. Firstly I now cmp r8, 0, to get Hello World! the correct number of times, and
I have added
mov [i], r8 ; put away i
After _loop_entry:
and also I have added
mov r8, [i] ; get i back
after the first int 0x80.
Here is my now working program. More info to come on performance against C++.
;
; main.asm
;
;
; To be used with main.asm, as a test to see if optimized c++
; code can be beaten by me, writing a for / while loop myself.
;
;
; Absolute minimum code to be competative with asm.
global _start
section .data
msg db "Hello World!",10 ; define the message
msgl equ $ - msg ; define message length
; use minimal size of storage space
imax dd 0x00001000 ; defines imax to be big!
i dd 0x0 ; defines i
section .text
_start:
mov r8, 0x10 ; put imax in r8d, this will be our 'i'
_loop_entry: ; loop entry point
mov [i], r8 ; put away i (r8 is not preserved across int 0x80)
mov eax, 4 ; setup the message to print
mov ebx, 1 ; write, stdout, message, length
mov ecx, msg
mov edx, msgl
int 0x80 ; print message
; this is valid because registers do not change
mov r8, [i] ; get i back
dec r8 ; decrease i and jump on not zero
cmp r8,0 ; compare values to jump
jnz _loop_entry
mov rax, 1 ; exit with zero
mov rbx, 0
int 0x80
int 0x80 just causes a software interrupt. In your case it's being used to make a system call. Whether or not any registers are affected will depend on the particular system call you're invoking and the system call calling convention of your platform. Read your documentation for the details.
Specifically, from the System V Application Binary Interface x86-64™ Architecture Processor Supplement [PDF link], Appendix A, x86-64 Linux Kernel Conventions:
The interface between the C library and the Linux kernel is the same as for the user-level applications...
For user-level applications, r8 is a scratch register, which means it's caller-saved. If you want it to be preserved over the system call, you'll need to do it yourself.

Resources