posix_memalign fails on Broadwell but not on Skylake (Linux)

The following code uses posix_memalign to allocate four buffers per core, of 128 bytes each. It succeeds on a Skylake but fails on a Broadwell (an earlier generation) with the following message:
posix memalign malloc.c:2401: sysmalloc: Assertion `(old_top == initial_top (av) && old_size == 0) || ((unsigned long) (old_size) >= MINSIZE && prev_inuse (old_top) && ((unsigned long) old_end & (pagesize - 1)) == 0)' failed
According to the Linux man pages, posix_memalign fails in the following cases: (1) the alignment argument was not a power of two or was not a multiple of sizeof(void *), or (2) there was insufficient memory to fulfill the allocation request. Because it dies with a segmentation fault, I can't get an error number from rax. The Broadwell machine has 8GB of memory, so insufficient memory is not the problem. The alignment is 64, so that's not the problem either, and in any case the same code succeeds on my Skylake, so the call is written correctly.
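For reference, here is a minimal C sketch of the equivalent call (not the project's code). Note that posix_memalign reports failure through its return value rather than through errno: it returns 0 on success, or the error number itself on failure.
#define _POSIX_C_SOURCE 200112L
#include <stdlib.h>
#include <stdio.h>

int main(void)
{
    void *buf = NULL;
    /* same parameters as below: 64-byte alignment, 128-byte buffer */
    int err = posix_memalign(&buf, 64, 128);
    if (err != 0) {              /* the error code is the return value, not errno */
        fprintf(stderr, "posix_memalign failed: %d\n", err);
        return 1;
    }
    /* ... use buf ... */
    free(buf);
    return 0;
}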
Here's the relevant code block:
mov rax,2 ; number of cores
mov rbx,4 ; number of buffers to create
mul rbx
mov r14,rax
xor r13,r13
Memalign_Create_Loop:
mov rax,r15 ; number of cores
mov rbx,r12 ; number of buffers needed
; N ptrs per core x 8 bytes per pointer
mul rbx ; times the number of cores
mov r12,rax ; number of buffers needed x number of cores
lea rdi,[memalign_pointer]
mov rsi,64 ; alignment
mov rdx,r12
shl rdx,3
mov rdx,128 ; buffer size
;xor rax,rax
sub rsp,40
call posix_memalign wrt ..plt
add rsp,40
lea r8,[internal_buffer_pointers]
lea rdi,[memalign_pointer]
mov rax,[rdi]
mov rbx,r13
shl rbx,3
mov [r8+rbx],rax
add r13,1
cmp r13,r14
jl Memalign_Create_Loop
It fails at "call posix_memalign wrt ..plt" and displays the error message shown above, along with a segmentation fault message.
It's a puzzle to me because it succeeds on the Skylake, and posix_memalign predates Broadwell.
I assemble and link with:
sudo nasm -f elf64 -g -F dwarf NWL.asm
sudo ld -shared NWL.o /opt/P01_SH/_Library/Create_Threads_in_C-NWL.o /opt/P01_SH/_Library/Timer_for_NASM.o /opt/P01_SH/_Library/POSIX_Shared_Memory.o /opt/P01_SH/_Library/PThread_Mutex.o /opt/P01_SH/_Library/Create_Multi_Files.o -ldl -lrt -lpthread -lm -o NWL.so
Thanks for any ideas on this.

I solved this problem working from the comment above by kisch ("The assertion message seems to indicate a case of heap corruption. Then the actual bug would be in earlier code, the corruption is only detected once your function tries another allocation").
My NASM code includes two files with memory allocation code. Before the fix it was:
; Create buffers to hold POSIX shared memory:
mov rbx,2 ; 2 output buffers per core
%include "/opt/P01_SH/_Include_Utilities/POSIX_Shared_Memory.asm"
mov r15,1 ;[Number_Of_Cores_Seq]
mov r12,4 ; number of internal_buffers needed
%include "/opt/P01_SH/_Include_Utilities/Posix_Memalign_Internal.asm"
To fix this, I reversed the order of the calls so that Posix_Memalign_Internal.asm is included before Posix_Shared_Memory.asm.
This is the code from Posix_Memalign_Internal.asm:
mov rax,r15
mov rbx,r12 ; passed in from calling function
mul rbx
mov r14,rax
xor r13,r13
Memalign_Create_Loop:
; Allocate the buffer for the memory pointers
; # of cores times N memory pointers needed (passed in from caller)
; int posix_memalign(void **memptr, size_t alignment, size_t size);
mov rax,r15 ; number of cores
mov rbx,r12 ; number of buffers needed
; N ptrs per core x 8 bytes per pointer
mul rbx ; times the number of cores
mov r12,rax ; number of buffers needed x number of cores
lea rdi,[memalign_pointer]
mov rsi,64 ; alignment
mov rdx,r12
shl rdx,3
mov rdx,128 ; buffer size
sub rsp,40
call posix_memalign wrt ..plt
add rsp,40
lea r8,[internal_buffer_pointers]
lea rdi,[memalign_pointer]
mov rax,[rdi]
mov rbx,r13
shl rbx,3
mov [r8+rbx],rax
add r13,1
cmp r13,r14
jl Memalign_Create_Loop
This is the code from Posix_Shared_Memory.asm:
mov rax,[Number_Of_Cores_Seq]
mul rbx
mov [shm_buffers_count],rax
shl rax,3
mov [shm_buffers_bytes],rax
mov r12,rax ; length of pointer buffer
; Buffer for shm pointers
mov rdi,r12
sub rsp,40
call [rel malloc wrt ..got]
mov qword [shm_buffers_ptr],rax
add rsp,40
; Buffer for shm fds
mov rdi,r12
sub rsp,40
call [rel malloc wrt ..got]
mov qword [shm_fds_ptr],rax
add rsp,40
; _________
; Allocate shared memory
xor r12,r12 ; buffer # (0, 8, 16, 24)
xor r13,r13 ; core# (0, 1, 2, 3, ...)
xor r14,r14 ; counter for shm_buffer
Get_ShmName:
lea rdi,[shm_base_name]
mov rsi,r12
mov rdx,r13
call [rel get_shm_name wrt ..got]
mov rdi,rax
push rdi ; shm_name ptr
call [rel unlink_shm wrt ..got]
pop rdi
; Create the shm buffer
mov rsi,[shm_size]
call [rel open_shm wrt ..got]
get_buffer:
mov rdi,rax
mov rax,[rdi]
get_buffer_data:
mov rbp,[shm_fds_ptr]
mov [rbp+r14],rax ; shm_fds
mov rbp,[shm_buffers_ptr]
mov rax,[rdi+8]
mov [rbp+r14],rax ; shm_ptrs
add r12,8
add r14,8
cmp r12,[shm_buffers_bytes]
jl Get_ShmName
next_core:
xor r12,r12
add r13,1
cmp r13,[Number_Of_Cores_Seq]
jl Get_ShmName
Posix_Memalign_Internal.asm uses posix_memalign to create four small buffers of 128 bytes each. Posix_Shared_Memory.asm creates two 4MB shared memory buffers.
My conclusion from this is that when posix_memalign and POSIX shared memory are used in the same program, posix_memalign should be called before the POSIX shared memory buffers are created. In the final executable they appear one after the other with three instructions in between.
I don't see anything in the code blocks that explains why this is so, but it fixed this problem here.
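To illustrate kisch's point in isolation (an editorial sketch in C, not the code above): glibc only notices damaged heap metadata when a later allocation happens to inspect it, so the call that aborts with an assertion like the one in the question is usually not the call that caused the corruption.
#include <stdlib.h>
#include <string.h>

int main(void)
{
    char *p = malloc(32);
    if (!p)
        return 1;
    /* Bug: 64 bytes written into a 32-byte block, trampling the heap
       metadata that follows it. Nothing fails at this point. */
    memset(p, 'A', 64);

    /* The damage surfaces only when a later, perfectly valid allocation makes
       the allocator look at the corrupted metadata; depending on the glibc
       version this aborts with a malloc.c assertion or a "corrupted ..."
       error rather than returning an error code. */
    char *q = malloc(32);
    (void)q;
    return 0;
}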
As a final note, it failed today on a Skylake, so the CPU family is not relevant, as Nate Eldredge said in his comment above.
Thanks to all who responded.

Related

Compact shellcode to print a 0-terminated string pointed-to by a register, given puts or printf at known absolute addresses?

Background: I am a beginner trying to understand how to golf assembly, in particular to solve an online challenge.
EDIT: clarification: I want to print the value at the memory address of RDX. So “SUPER SECRET!”
Create some shellcode that can output the value of register RDX in <= 11 bytes. Null bytes are not allowed.
The program is compiled with the C standard library, so I have access to puts / printf. It's running on x86-64 (AMD64).
$rax : 0x0000000000010000 → 0x0000000ac343db31
$rdx : 0x0000555555559480 → "SUPER SECRET!"
gef➤ info address puts
Symbol "puts" is at 0x7ffff7e3c5a0 in a file compiled without debugging.
gef➤ info address printf
Symbol "printf" is at 0x7ffff7e19e10 in a file compiled without debugging.
Here is my attempt (intel syntax)
xor ebx, ebx ; zero the ebx register
inc ebx ; set the ebx register to 1 (STDOUT)
xchg ecx, edx ; set the ECX register to RDX
mov edx, 0xff ; set the length to 255
mov eax, 0x4 ; set the syscall to print
int 0x80 ; interrupt
[hexdump of my code omitted]
My attempt is 17 bytes and includes null bytes, which aren't allowed. What other ways can I lower the byte count? Is there a way to call puts / printf while still saving bytes?
FULL DETAILS:
I am not quite sure what is useful information and what isn't.
File details:
ELF 64-bit LSB shared object, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, for GNU/Linux 3.2.0, BuildID[sha1]=5810a6deb6546900ba259a5fef69e1415501b0e6, not stripped
Source code:
void main() {
char* flag = get_flag(); // I don't get access to the function details
char* shellcode = (char*) mmap((void*) 0x1337,12, 0, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
mprotect(shellcode, 12, PROT_READ | PROT_WRITE | PROT_EXEC);
fgets(shellcode, 12, stdin);
((void (*)(char*))shellcode)(flag);
}
Disassembly of main:
gef➤ disass main
Dump of assembler code for function main:
0x00005555555551de <+0>: push rbp
0x00005555555551df <+1>: mov rbp,rsp
=> 0x00005555555551e2 <+4>: sub rsp,0x10
0x00005555555551e6 <+8>: mov eax,0x0
0x00005555555551eb <+13>: call 0x555555555185 <get_flag>
0x00005555555551f0 <+18>: mov QWORD PTR [rbp-0x8],rax
0x00005555555551f4 <+22>: mov r9d,0x0
0x00005555555551fa <+28>: mov r8d,0xffffffff
0x0000555555555200 <+34>: mov ecx,0x22
0x0000555555555205 <+39>: mov edx,0x0
0x000055555555520a <+44>: mov esi,0xc
0x000055555555520f <+49>: mov edi,0x1337
0x0000555555555214 <+54>: call 0x555555555030 <mmap@plt>
0x0000555555555219 <+59>: mov QWORD PTR [rbp-0x10],rax
0x000055555555521d <+63>: mov rax,QWORD PTR [rbp-0x10]
0x0000555555555221 <+67>: mov edx,0x7
0x0000555555555226 <+72>: mov esi,0xc
0x000055555555522b <+77>: mov rdi,rax
0x000055555555522e <+80>: call 0x555555555060 <mprotect@plt>
0x0000555555555233 <+85>: mov rdx,QWORD PTR [rip+0x2e26] # 0x555555558060 <stdin@@GLIBC_2.2.5>
0x000055555555523a <+92>: mov rax,QWORD PTR [rbp-0x10]
0x000055555555523e <+96>: mov esi,0xc
0x0000555555555243 <+101>: mov rdi,rax
0x0000555555555246 <+104>: call 0x555555555040 <fgets@plt>
0x000055555555524b <+109>: mov rax,QWORD PTR [rbp-0x10]
0x000055555555524f <+113>: mov rdx,QWORD PTR [rbp-0x8]
0x0000555555555253 <+117>: mov rdi,rdx
0x0000555555555256 <+120>: call rax
0x0000555555555258 <+122>: nop
0x0000555555555259 <+123>: leave
0x000055555555525a <+124>: ret
Register state right before shellcode is executed:
$rax : 0x0000000000010000 → "EXPLOIT\n"
$rbx : 0x0000555555555260 → <__libc_csu_init+0> push r15
$rcx : 0x000055555555a4e8 → 0x0000000000000000
$rdx : 0x0000555555559480 → "SUPER SECRET!"
$rsp : 0x00007fffffffd940 → 0x0000000000010000 → "EXPLOIT\n"
$rbp : 0x00007fffffffd950 → 0x0000000000000000
$rsi : 0x4f4c5058
$rdi : 0x00007ffff7fa34d0 → 0x0000000000000000
$rip : 0x0000555555555253 → <main+117> mov rdi, rdx
$r8 : 0x0000000000010000 → "EXPLOIT\n"
$r9 : 0x7c
$r10 : 0x000055555555448f → "mprotect"
$r11 : 0x246
$r12 : 0x00005555555550a0 → <_start+0> xor ebp, ebp
$r13 : 0x00007fffffffda40 → 0x0000000000000001
$r14 : 0x0
$r15 : 0x0
(This register state is a snapshot at the assembly line below)
●→ 0x555555555253 <main+117> mov rdi, rdx
0x555555555256 <main+120> call rax
Since I already spilled the beans and "spoiled" the answer to the online challenge in comments, I might as well write it up. 2 key tricks:
Create 0x7ffff7e3c5a0 (&puts) in a register with lea reg, [reg + disp32], using the known value of RDI which is within the +-2^31 range of a disp32. (Or use RBP as a starting point, but not RSP: that would need a SIB byte in the addressing mode).
This is a generalization of the code-golf trick of using lea edi, [rax+1] to create small constants from other small constants (especially 0) in 3 bytes, with code that runs less slowly than push imm8 / pop reg.
The disp32 is large enough to not have any zero bytes; you have a couple registers to choose from in case one had been too close.
Copy a 64-bit register in 2 bytes with push reg / pop reg, instead of 3-byte mov rdi, rdx (REX + opcode + modrm). No savings if either push needs a REX prefix (for R8..R15), and actually costs bytes if both are "non-legacy" registers.
See other answers on Tips for golfing in x86/x64 machine code on codegolf.SE for more.
bits 64
lea rsi, [rdi - 0x166f30]
;; add rbp, imm32 ; alternative, but that would mess up a call-preserved register so we might crash on return.
push rdx
pop rdi ; copy RDX to first arg, x86-64 SysV calling convention
jmp rsi ; tailcall puts
This is exactly 11 bytes, and I don't see a way for it to be smaller. add r64, imm32 is also 7 bytes, same as LEA. (Or 6 bytes if the register is RAX, but even the xchg rax, rdi short form would cost 2 bytes to get it there, and the RAX value is still the fgets return value, which is the small mmap buffer address.)
The puts function pointer doesn't fit in 32 bits, so we need a REX prefix on any instruction that puts it into a register. Otherwise we could just mov reg, imm32 (5 bytes) with the absolute address, not deriving it from another register.
$ nasm -fbin -o exploit.bin -l /dev/stdout exploit.asm
1 bits 64
2 00000000 488DB7D090E9FF lea rsi, [rdi - 0x166f30]
3 ;; add rbp, imm32 ; we can avoid messing up any call-preserved registers
4 00000007 52 push rdx
5 00000008 5F pop rdi ; copy to first arg
6 00000009 FFE6 jmp rsi ; tailcall
$ ll exploit.bin
-rw-r--r-- 1 peter peter 11 Apr 24 04:09 exploit.bin
$ ./a.out < exploit.bin # would work if the addresses in my build matched yours
My build of your incomplete .c uses different addresses on my machine, but it does reach this code (at address 0x10000, the mmap_min_addr which mmap picks after the amusing choice of 0x1337 as a hint address, which isn't even page aligned but doesn't result in EINVAL on current Linux).
Since we only tailcall puts with correct stack alignment and don't modify any call-preserved registers, this should successfully return to main.
Note that 0 bytes (ASCII NUL, not NULL) would actually work in shellcode for this test program, if not for the requirement that forbids it.
The input is read using fgets (apparently to simulate a gets() overflow).
fgets actually can read a 0 aka '\0'; the only critical character is 0xa aka '\n' newline. See Is it possible to read null characters correctly using fgets or gets_s?
Often buffer overflows exploit a strcpy or something else that stops on a 0 byte, but fgets only stops on EOF or newline. (Or the buffer size, a feature gets is missing, hence its deprecation and removal from even the ISO C standard library! It's literally impossible to use safely unless you control the input data). So yes, it's totally normal to forbid zero bytes.
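A small standalone C demonstration of that fgets behavior (my example, not part of the exploit): feed it input containing an embedded zero byte, e.g. printf 'AB\0CD\n' | ./a.out, and all six bytes land in the buffer.
#include <stdio.h>

int main(void)
{
    char buf[16] = {0};
    if (fgets(buf, sizeof buf, stdin)) {
        /* With the input above, fgets stored 'A','B','\0','C','D','\n' and
           only the newline stopped it; strlen(buf) would report just 2. */
        for (int i = 0; i < 6; i++)
            printf("%02x ", (unsigned char)buf[i]);
        putchar('\n');
    }
    return 0;
}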
BTW, your int 0x80 attempt is not viable: What happens if you use the 32-bit int 0x80 Linux ABI in 64-bit code? - you can't use the 32-bit ABI to pass 64-bit pointers to write, and the string you want to output is not in the low 32 bits of virtual address space.
Of course, with the 64-bit syscall ABI, you're fine if you can hardcode the length.
push rdx
pop rsi
shr eax, 16 ; fun 3-byte way to turn 0x10000 into 1 (__NR_write, 64-bit), instead of just push 1 / pop
mov edi, eax ; STDOUT_FD = __NR_write
lea edx, [rax + 13 - 1] ; 3 bytes. RDX = 13 = string length
; or mov dl, 0xff ; 2 bytes leaving garbage in rest of RDX
syscall
But this is 12 bytes, as well as hard-coding the length of the string (which was supposed to be part of the secret?).
mov dl, 0xff could make sure the length was at least 255, and actually much more in this case, if you don't mind getting reams of garbage after the string you want, until write hits an unmapped page and returns early. That would save a byte, making this 11.
(Fun fact, Linux write does not return an error when it's successfully written some bytes; instead it returns how many it did write. If you try again with buf + write_len, you would get a -EFAULT return value for passing a bad pointer to write.)
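A hedged C sketch of that behavior (my illustration, not from the original answer): a retry loop sees the short count first, and only the follow-up call that starts at the faulting address reports the error.
#include <unistd.h>
#include <errno.h>

/* Write len bytes, retrying after short writes. If the buffer runs into an
   unmapped page, the first write() returns how many bytes it managed to copy
   out; only the retry that starts at the bad address fails with EFAULT. */
static ssize_t write_all(int fd, const char *buf, size_t len)
{
    size_t done = 0;
    while (done < len) {
        ssize_t n = write(fd, buf + done, len - done);
        if (n < 0) {
            if (errno == EINTR)
                continue;       /* interrupted: just retry */
            return -1;          /* e.g. EFAULT once the bad page is reached */
        }
        done += (size_t)n;
    }
    return (ssize_t)done;
}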

lock cmpxchg fails to execute threads in core order

The following 64-bit NASM code uses lock cmpxchg to let each core take its turn in core order, execute some code, then advance the core-number variable with xchg so the next core can execute the code. Each core's number is stored in rbx -- the four cores are numbered 0, 8, 16 and 24. The variable [spin_lock_core] starts at zero, and when a core finishes it advances the core number by 8 at the final line, xchg [spin_lock_core],rax.
Spin_lock:
xor rax,rax
lock cmpxchg [spin_lock_core],rbx
jnz Spin_lock
; Test
mov rbp,extra_test_array
mov [rbp+rbx],rbx
; Execute some code before looping out
mov rax,1234
mov rdx,23435
add rax,rbx
mov rcx,rax
;jmp label_899
mov rax,rbx
add rax,8
xchg [spin_lock_core],rax
But before the code reaches xchg [spin_lock_core],rax, the first core jumps out of the routine (jmp label_899), which should leave the other threads spinning forever because the [spin_lock_core] variable they are waiting on is never updated. Instead, all four cores write to the output array extra_test_array, which is displayed on the terminal when the program exits. In other words, the spinlock fails to hold the cores back until the core number is updated.
The full, minimal code is below (as minimal as NASM can be in this case). The code is written for a shared object, and it's reproducible if it gets an input array (as written it doesn't matter if the input array is int or float):
; Header Section
[BITS 64]
[default rel]
global Main_Entry_fn
extern pthread_create, pthread_join, pthread_exit, pthread_self, sched_getcpu
global FreeMem_fn
extern malloc, realloc, free
extern sprintf
section .data align=16
X_ctr: dq 0
data_master_ptr: dq 0
initial_dynamic_length: dq 0
XMM_Stack: dq 0, 0, 0, 0, 0, 0, 0
ThreadID: dq 0
X_ptr: dq 0
X_length: dq 0
X: dq 0
collect_ptr: dq 0
collect_length: dq 0
collect_ctr: dq 0
even_squares_list_ptrs: dq 0, 0, 0, 0
even_squares_list_ctr: dq 0
even_squares_list_length: dq 0
Number_Of_Cores: dq 32
pthread_attr_t: dq 0
pthread_arg: dq 0
Join_Ret_Val: dq 0
tcounter: dq 0
sched_getcpu_array: times 4 dq 0
ThreadIDLocked: dq 0
spin_lock_core: dq 0
extra_test_array: dq 0
; __________
section .text
Init_Cores_fn:
; _____
; Create Threads
label_0:
mov rdi,ThreadID ; ThreadCount
mov rsi,pthread_attr_t ; Thread Attributes
mov rdx,Test_fn ; Function Pointer
mov rcx,pthread_arg
call pthread_create wrt ..plt
mov rdi,[ThreadID] ; id to wait on
mov rsi,Join_Ret_Val ; return value
call pthread_join wrt ..plt
mov rax,[tcounter]
add rax,8
mov [tcounter],rax
mov rbx,[Number_Of_Cores]
cmp rax,rbx
jl label_0
; _____
jmp label_900 ; All threads return here, and exit
; ______________________________________
Test_fn:
; Get the core number
call sched_getcpu wrt ..plt
mov rbx,8 ; multiply by 8
mul rbx
push rax
pop rax
mov rbx,rax
push rax
Spin_lock:
lock cmpxchg [spin_lock_core],rbx
jnz Spin_lock
; Test
mov rbp,extra_test_array
mov [rbp+rbx],rbx
; Execute some code before looping out
mov rax,1234
mov rdx,23435
add rax,rbx
mov rcx,rax
jmp label_899
mov rax,rbx
add rax,8
xchg [spin_lock_core],rax
;__________
label_899:
pop rax
ret
; __________
label_900:
mov rdi,extra_test_array ;audit_array
mov rax,rdi
ret
;__________
;Free the memory
FreeMem_fn:
;The pointer is passed back in rcx (of course)
sub rsp,40
call free wrt ..plt
add rsp,40
ret
; __________
; Main Entry
Main_Entry_fn:
push rdi
push rbp
push rbx
push r15
xor r15,r15
push r14
xor r14,r14
push r13
xor r13,r13
push r12
xor r12,r12
push r11
xor r11,r11
push r10
xor r10,r10
push r9
xor r9,r9
push r8
xor r8,r8
movsd [XMM_Stack+0],xmm13
movsd [XMM_Stack+8],xmm12
movsd [XMM_Stack+16],xmm11
movsd [XMM_Stack+24],xmm15
movsd [XMM_Stack+32],xmm14
movsd [XMM_Stack+40],xmm10
mov [X_ptr],rdi
mov [data_master_ptr],rsi
; Now assign lengths
lea rdi,[data_master_ptr]
mov rbp,[rdi]
xor rcx,rcx
movsd xmm0,qword[rbp+rcx]
cvttsd2si rax,xmm0
mov [X_length],rax
add rcx,8
; __________
; Write variables to assigned registers
mov r15,0
lea rdi,[rel collect_ptr]
mov r14,qword[rdi]
mov r13,[collect_ctr]
mov r12,[collect_length]
lea rdi,[rel X_ptr]
mov r11,qword[rdi]
mov r10,[X_length]
; __________
call Init_Cores_fn
movsd xmm10,[XMM_Stack+0]
movsd xmm14,[XMM_Stack+8]
movsd xmm15,[XMM_Stack+16]
movsd xmm11,[XMM_Stack+24]
movsd xmm12,[XMM_Stack+32]
movsd xmm13,[XMM_Stack+40]
pop r8
pop r9
pop r10
pop r11
pop r12
pop r13
pop r14
pop r15
pop rbx
pop rbp
pop rdi
ret
The instruction "lock cmpxchg" should fail until the [spin_lock_core] variable is updated, but it doesn't do that.
Thanks for any help in understanding why lock cmpxchg doesn't prevent the cores after core zero from firing in this area of code.
UPDATE: other research shows that xor rax,rax is needed at the top of the Spin_lock: section. When I insert that line, it reads like this:
Spin_lock:
xor rax,rax
lock cmpxchg [spin_lock_core],rbx
jnz Spin_lock
With that change it freezes, as expected. But when I remove the line jmp label_899 it still freezes, but it shouldn't do that.
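The reason xor rax,rax matters is that cmpxchg always compares RAX with the memory operand. A rough C sketch of what each iteration attempts, using the GCC/Clang builtin (my illustration, not the OP's code):
#include <stdint.h>

/* Roughly what xor rax,rax + lock cmpxchg [spin_lock_core],rbx does each try:
   succeed (ZF=1) only if the lock word still equals the expected value held in
   RAX, and in that case store this core's value into it. On failure the
   current lock word is loaded back into RAX. */
static int try_take_lock(volatile int64_t *spin_lock_core, int64_t my_value)
{
    int64_t expected = 0;   /* what xor rax,rax puts in RAX */
    return __atomic_compare_exchange_n(spin_lock_core, &expected, my_value,
                                       0, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
}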
EDIT 122219:
Based on the comments on this question yesterday, I revised the spinlock code to (1) eliminate atomic operations in favor of faster mov and cmp instructions, (2) assign a unique memory location to each core, and (3) separate the memory locations by > 256 bytes to avoid memory on the same cache line.
Each core's memory location will be changed to 1 when the previous core is finished. When each core finishes, it sets its own memory location back to 0.
The code successfully executes core 0 IF I have all other cores loop out before the spinlock. When I let all four cores run through the spinlock, the program again hangs.
I've verified that each separate memory location is set to 1 when the previous core is finished.
Here's the updated spinlock section:
section .data
spin_lock_core: times 140 dq 0
spin_lock_core_offsets: dq 0,264,528,792
section .text
; Calculate the offset to spin_lock_core
mov rbp,spin_lock_core
mov rdi,spin_lock_core_offsets
mov rax,[rdi+rbx]
add rbp,rax
; ________
Spin_lock:
pause
cmp byte[rbp],1
jnz Spin_lock
xor rax,rax
mov [rbp],rax ; Set current memory location to zero
; Execute some code before looping out
mov rax,1234
mov rdx,23435
add rax,rdx
mov rcx,rax
; Loop out if this is the last core
mov rax,rbx
add rax,8
cmp rax,[Number_Of_Cores]
jge label_899
; Set next core to 1 by adding 264 to the base address
add rbp,264
mov rax,1
mov [rbp],rax
Why does this code still hang?
I don't think you should use cmpxchg for this at all. Try this:
Spin_lock:
pause
cmp [spin_lock_core],rbx
jnz Spin_lock
; Test
mov rbp,extra_test_array
mov [rbp+rbx],rbx
; Execute some code before looping out
mov rax,1234
mov rdx,23435
add rax,rbx
mov rcx,rax
;jmp label_899
lea rax,[rbx+8]
mov [spin_lock_core],rax
I solved this spinlock problem, but after Peter Cordes' comment below I see that it is not correct. I won't delete this answer because I hope it can lead to the solution.
I use lock cmpxchg [rbp+rbx],rbx, which assembles without error. I had expected the NASM assembler to reject it with an "invalid combination of operands" error, because I read the documentation (for example, https://www.felixcloutier.com/x86/cmpxchg) as saying the comparison must involve rax. The format is CMPXCHG r/m64,r64: the explicit source operand can be any r64, and it is the implicit accumulator rax that gets compared with the destination operand, which is why NASM accepts it.
Without the "mov rax,rbx" line it works because on the first iteration the rax register is set to 0 which matches the memory location. On the second iteration it succeeds by default.
When I add "mov rax,rbx" -- which resets rax -- the program once again hangs. I would really appreciate any ideas on why this program should hang as written.
At the start of this block rbx is the core number:
section .data
spin_lock_core: times 4 dq 0
section .text
[ Code leading up to this spinlock section shown above ]
mov rbp,spin_lock_core
Spin_lock:
pause
mov rax,rbx
lock cmpxchg [rbp+rbx],rax
jnz Spin_lock
mov rax,rbx
add rax,8
cmp rax,[Number_Of_Cores]
jge spin_lock_out
xchg [rbp+rax],rax
spin_lock_out:
The differences from my original post are:
Each core spins on (and reads from) its own unique memory location.
I use the "pause" instruction on the spinlock.
Each unique memory location is updated in core order.
But it does not work when I include mov rax,rbx. Intuitively that should work, so I would really appreciate any ideas on why it doesn't in this case.
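For comparison, here is a minimal C11 sketch of the turn-passing pattern this spinlock is aiming for (an editorial illustration, not a translation of the NASM above): each thread spins until a shared counter equals its own id, runs its section, then publishes the next id.
#include <stdatomic.h>

static _Atomic long turn = 0;   /* id of the thread allowed to run next */

/* Called by each thread with its own id in 0..num_threads-1. */
static void run_in_turn(long my_id, long num_threads)
{
    while (atomic_load_explicit(&turn, memory_order_acquire) != my_id)
        ;                       /* spin; a pause hint would go here */

    /* ... code that must execute in thread order ... */

    atomic_store_explicit(&turn, (my_id + 1) % num_threads,
                          memory_order_release);   /* hand the turn onward */
}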

Writing to allocated memory with sbrk results in segfault [duplicate]

I am trying to dynamically allocate memory into the heap and then assign values in those memory addresses. I understand how to allocate the memory but how would I assign for example the value in a register to that first dynamic memory address?
This is what I have so far:
push rbp
mov rbp, rsp ;initialize an empty stack to create activation records for the rest of the subroutines
mov rax, 0x2d ;linux system call for brk()
mov rbx, 0x0 ;to get the address of the first address we are allocating we must have 0 in rbx
int 0x80 ;calls the linux operating system kernel for assistance
mov [brk_firstLocation], rax ;the first position in the heap will be returned in rax, so I save the first location in a variable called brk_firstLocation
mov rbx, rax ;the memory address of the start of the heap is moved into rbx
add rbx, 0x14 ;we want 5 bytes worth of data allocated in the heap, so the start address plus 20 bits
mov rax, 0x2d ;linux system call for brk()
int 0x80 ;calls the linux operating system kernel for assistance
What would I do, for example, to move the value in rax into brk_firstLocation?
Others have pointed out a few things that are wrong with your code. I would like to add that you would not add 20 bits to the current breakpoint (or 20 bytes, which is what add rbx, 20 actually does); you would simply add 5 bytes.
Also, your first syscall argument will not be in rbx, it will be in rdi. The 64-bit syscall ABI uses different system call numbers, different registers, and a different instruction (syscall instead of int 0x80) than the 32-bit ABI (which is still available in 64-bit processes). See also the x86 tag wiki for more ABI links.
Here's how your code should look:
push rbp
mov rbp, rsp
;; sys_brk(0)
mov rax, 12 ; 12 is SYS_brk (/usr/include/asm/unistd_64.h)
mov rdi, 0 ; rdi for first syscall arg in the 64-bit ABI, not rbx
syscall ; syscall, not int 0x80, for the 64-bit ABI
mov qword [brk_firstLocation], rax
;; sys_brk(old_break + 5)
lea rdi, [rax + 5] ; add 5 bytes to the break point
mov rax, 12
syscall ; set the new breakpoint
At this point you can use brk_firstLocation as a pointer to whatever 5 byte struct you want to store on the heap. Here's how you would put values in that memory space:
mov rdi, [brk_firstLocation] ; load the pointer from memory, if you didn't already have it in a register
mov byte [rdi], 'A' ; a char in the first byte
mov [rdi+1], ecx ; a 32-bit value in the last 4 bytes.
int 80h is only for 32 bit system calls. Use syscall for 64 bit instead.
Calling sys_brk twice is redundant - in assembly you always know where your program data ends. Simply put a label there and you will have the address.
Allocating this way memory less than one page is pointless, it will be allocated in blocks of 4KB anyway.
It is important to understand that sys_brk is not a heap-management function. It is low-level memory management.
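For comparison, here are the same two steps using the C library's sbrk() wrapper (a sketch assuming the libc wrapper is acceptable; it drives the same brk mechanism as the raw system call):
#define _DEFAULT_SOURCE
#include <unistd.h>
#include <stdio.h>

int main(void)
{
    void *start = sbrk(0);            /* query the current program break */
    if (sbrk(5) == (void *)-1) {      /* move the break up by 5 bytes */
        perror("sbrk");
        return 1;
    }
    /* start now points at 5 usable bytes; as noted above, the kernel maps
       whole pages, so in practice far more than 5 bytes become accessible. */
    ((char *)start)[0] = 'A';
    printf("allocated at %p\n", start);
    return 0;
}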
There are a number of problems I see:
0x2d is the brk system call on x86 (32 bit); on x86_64 it's 0xc
brk sets the end of the data segment; the C library wrapper returns 0 on success and -1 on failure (the raw system call returns the new program break). It does not return "the first position in the heap". That comes from the symbol _end, which the linker sets to the end of the uninitialized preallocated data.
So you want something like:
lea rdi, [_end] ; _end = end of the program's static data
mov [brk_firstLocation], rdi
add rdi, 0x14 ; space for 5 dwords (20 bytes)
mov rax, 12 ; SYS_brk, 64-bit ABI
syscall
As you have done, call once to retrieve the current bottom of the heap, then move the top of the heap (which is the brk value). However, your code incorrectly mixes in the 64-bit register set (rax, rbx, ...). If your Linux is 32-bit (as implied by the use of 45 for the syscall number), then you want the 32-bit registers:
mov eax, 45 ; brk
xor ebx, ebx ; arg 1: 0 = query; returns the current break in eax
int 0x80 ; syscall
mov [brk_firstLocation], eax ; save the current break (start of our block)
lea ebx, [eax+4] ; arg 1: new break = old break + 4 bytes, room for a 32-bit int
mov eax, 45 ; brk
int 0x80 ; syscall
mov eax, [brk_firstLocation] ; reload eax with the allocated memory address
mov dword [eax], 42 ; use result: store 42 in newly allocated storage
Of course you can re-load the saved value for re-use as many times as needed.

Segmentation fault ASM on linux at printf

The following is a program from a book (Introduction to 64 Bit Intel Assembly Language Programming for Linux, by Seyfarth, 2012), chapter 9. The fault (in gdb) is:
Program received signal SIGSEGV, Segmentation fault.
0x00007ffff7aa10a5 in __printf_size (fp=0x400400, info=0x0,
args=) at printf_size.c:199
199 printf_size.c: No such file or directory.
Until this chapter, I successfully used the following to "produce an object file", as recommended,
yasm -f elf64 -g dwarf2 -l exit.lst exit.asm
and then,
ld -o prgm prgm.o
This is the program as copied from the book (line 10 reads push rbp; with a trailing semicolon; I first removed that stray ; but got the same result):
segment .text
global main
extern printf
; void print_max ( long a, long b )
; {
a equ 0
b equ 8
print_max:
push rbp; ;normal stack frame
mov rbp, rsp
; leave space for a, b and max
sub rsp, 32
; int max;
max equ 16
mov [rsp+a], rdi ; save a
mov [rsp+b], rsi ; save b
; max = a;
mov [rsp+max], rdi
; if ( b > max ) max = b;
cmp rsi, rdi
jng skip
mov [rsp+max], rsi
skip:
; printf ( "max(%1d,%1d ) = %1d\n",
; a, b, max );
segment .data
fmt db 'max(%1d,%1d) = %1d',0xa,0
segment .text
lea rdi, [fmt]
mov rsi, [rsp+a]
mov rdx, [rsp+b]
mov rcx, [rsp+max]
call printf
; }
leave
ret
main:
push rbp
mov rbp, rsp
; print_max ( 100, 200 );
mov rdi, 100 ;first parameter
mov rsi, 200 ;second parameter
call print_max
xor eax, eax ;to return 0
leave
ret
After a similar segmentation fault with a previous program in this chapter (the "Hello World" example), I used
gcc -o prgm prgm.o
which had worked until this program.
Using gcc to link is the easiest way to go if you are going to use functions from the C library, since gcc takes care of a few things for you "behind the scenes".
To use just ld, you need to link against ld-linux-x86-64.so.2 and pass it -lc to link to the C Library.
Next, you are using printf wrong. If you are not using floating point registers (which you are not) you need to "zero out" rax.
Also, since you are linking against the C library, you cannot just ret from main; you need to call exit.
lea rdi, [fmt]
mov rsi, [rsp+a]
mov rdx, [rsp+b]
mov rcx, [rsp+max]
xor rax, rax ; # of floating point registers used.
call printf
and:
; print_max ( 100, 200 );
mov rdi, 100 ;first parameter
mov rsi, 200 ;second parameter
call print_max
xor eax, eax ;to return 0
leave
xor rdi, rdi
call exit
ld -o $(APP) $(APP).o -lc -I/lib64/ld-linux-x86-64.so.2
and the output:
max(100,200) = 200
Gunner gave an excellent summary. The program should have placed a 0 in rax. This can be done with xor eax, eax, which is the normal way to zero a register in x86-64: writing a 32-bit register zeroes the upper half of the corresponding 64-bit register, and xor eax, eax sets the lower half to 0.
