Rustc/LLVM generates faulty code for aarch64 with opt-level=0 - rust

I have two files which are assembled/compiled/linked into a minimalistic kernel.
start.s:
/*
 * start.s - minimal AArch64 boot stub.
 * Writes CPACR_EL1 so FP/SIMD instructions do not trap, points sp at the top
 * of a statically reserved boot stack, calls __boot, and parks the CPU in a
 * wfi loop if __boot returns.
 */
/* value written to CPACR_EL1: both bits of the two-bit field at [21:20] set */
.set CPACR_EL1_FPEN, 0b11 << 20
/* size of the boot stack in bytes (8 KiB) */
.set BOOT_STACK_SIZE, 8 * 1024
.global __boot_stack
.global __start
.global __halt
.bss
/*
 * NOTE(review): on ARM/AArch64 targets GNU as treats the .align operand as a
 * power of two, so this requests 2^16-byte (64 KiB) alignment, not 16 bytes.
 * If 16-byte alignment is what is wanted, ".balign 16" says so explicitly -
 * confirm intent.
 */
.align 16
/* lowest address of the boot stack; .fill with one operand emits that many bytes */
__boot_stack:
.fill BOOT_STACK_SIZE
.text
__start:
/* disable FP and SIMD traps */
mov x0, #CPACR_EL1_FPEN
msr cpacr_el1, x0
/* set stack: sp = __boot_stack + BOOT_STACK_SIZE, i.e. one past the highest byte */
adr x0, __boot_stack
add sp, x0, #BOOT_STACK_SIZE
/* call the Rust entry point; bl leaves the address of __halt in x30 */
bl __boot
__halt:
/* halt CPU: wait for an interrupt, and loop again whenever wfi completes */
wfi
b __halt
boot.rs:
/// Boot entry point, called from `__start` in start.s.
///
/// Writes the single byte `!` to the device register at address `0x9000000`
/// (the serial port in this setup) and returns.
#[no_mangle]
pub extern fn __boot() {
    // Data register of the serial port.
    const SERIAL_DATA: *mut u8 = 0x9000000 as *mut u8;
    unsafe {
        // Device registers must be accessed with volatile operations: a plain
        // `*ptr = value` is an ordinary memory store that the optimiser is
        // free to merge, reorder or drop, whereas a volatile write is always
        // emitted exactly once, in program order relative to other volatile
        // accesses.
        core::ptr::write_volatile(SERIAL_DATA, b'!');
    }
}
With opt-level=3 the resulting code outputs a single '!' to the serial port (as intended). With opt-level=0 I get a strange infinite loop of output (e.g. '!!!!!!!!!....'). Here is the disassembly of the problematic code:
0000000000000000 <__kernel_begin>:
0: d2a00600 mov x0, #0x300000 // #3145728
4: d5181040 msr cpacr_el1, x0
8: 100007c0 adr x0, 100 <__boot_stack>
c: 9140081f add sp, x0, #0x2, lsl #12
10: 94000003 bl 1c <__boot>
0000000000000014 <__halt>:
14: d503207f wfi
18: 17ffffff b 14 <__halt>
000000000000001c <__boot>:
1c: a9bf7bfd stp x29, x30, [sp,#-16]!
20: 910003fd mov x29, sp
24: 94000003 bl 30 <aarch64::boot::__boot::__rust_abi>
28: a8c17bfd ldp x29, x30, [sp],#16
2c: d65f03c0 ret
0000000000000030 <aarch64::boot::__boot::__rust_abi>:
30: d10043ff sub sp, sp, #0x10
34: 52a12008 mov w8, #0x9000000 // #150994944
38: 2a0803e9 mov w9, w8
3c: f90007e9 str x9, [sp,#8]
40: 52800428 mov w8, #0x21 // #33
44: 39000128 strb w8, [x9]
48: 910043ff add sp, sp, #0x10
4c: d65f03c0 ret
The code is tested using qemu-system-aarch64. I don't see serious problems with it (except redundancy). Can you suggest a possible cause of such abnormal behaviour?
P.S. This is the optimised version which works properly:
0000000000000000 <__kernel_begin>:
0: d2a00600 mov x0, #0x300000 // #3145728
4: d5181040 msr cpacr_el1, x0
8: 1007ffc0 adr x0, 10000 <__boot_stack>
c: 9140081f add sp, x0, #0x2, lsl #12
10: 94000003 bl 1c <__boot>
0000000000000014 <__halt>:
14: d503207f wfi
18: 17ffffff b 14 <__halt>
000000000000001c <__boot>:
1c: 52a12008 mov w8, #0x9000000 // #150994944
20: 52800429 mov w9, #0x21 // #33
24: 39000109 strb w9, [x8]
28: d65f03c0 ret

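I've managed to run the non-optimised code without abnormalities, thanks to Notlikethat for the idea: my stack was simply mapped into read-only memory. The opt-level=0 build saves x29/x30 on the stack in __boot and reloads them before returning, so with an unwritable stack the reloaded return address is wrong; the opt-level=3 build never touches the stack, which is why it worked.
So I added an offset statement to my linker script (". = 1024M;") so that all the symbols start at 1 GiB (where RAM begins). After this modification the code works properly.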

Related

Distinguish the existence of two separate strings

I have an assembly source file, named: helloworld.s:
// helloworld.s - AArch64 Linux program: one write system call followed by exit.
// Each svc 0 takes the system call number from x8 and its arguments from x0..x2.
.global _start
_start: mov X0, #1 // x0 = 1: file descriptor (stdout)
ldr X1, =helloworld // x1 = address of the string, loaded from a literal pool entry
mov X2, #13 // x2 = 13: byte count; only the first 13 bytes of the data are written
mov X8, #64 // x8 = 64: write
svc 0
mov X0, #0 // x0 = 0: exit status
mov X8, #93 // x8 = 93: exit
svc 0
.data
// .ascii emits the characters with no terminating NUL, so the two strings
// below are laid out back to back with nothing between them.
helloworld: .ascii "Hello nice warm World"
.ascii "Hello nice warm World2"
I created an executable file:
/usr/aarch64-linux-gnu/bin/as -o helloworld.o helloworld.s
/usr/aarch64-linux-gnu/bin/ld -o helloworld helloworld.o
and then, created an objdump output of the executable file:
/usr/aarch64-linux-gnu/bin/objdump -s -D helloworld > objdum_output_helloworld.txt
This gives:
helloworld: file format elf64-littleaarch64
Contents of section .text:
...
4000b0 200080d2 e1000058 a20180d2 080880d2 ......X........
4000c0 010000d4 000080d2 a80b80d2 010000d4 ................
4000d0 d8004100 00000000 ..A.....
...
Contents of section .data:
...
4100d8 48656c6c 6f206e69 63652077 61726d20 Hello nice warm
4100e8 576f726c 6448656c 6c6f206e 69636520 WorldHello nice
4100f8 7761726d 20576f72 6c6432 warm World2
...
Disassembly of section .text:
00000000004000b0 <_start>:
...
4000b0: d2800020 mov x0, #0x1 // #1
4000b4: 580000e1 ldr x1, 4000d0 <_start+0x20>
4000b8: d28001a2 mov x2, #0xd // #13
4000bc: d2800808 mov x8, #0x40 // #64
4000c0: d4000001 svc #0x0
4000c4: d2800000 mov x0, #0x0 // #0
4000c8: d2800ba8 mov x8, #0x5d // #93
4000cc: d4000001 svc #0x0
4000d0: 004100d8 .inst 0x004100d8 ; undefined
4000d4: 00000000 .inst 0x00000000 ; undefined
...
Disassembly of section .data:
00000000004100d8 <helloworld>:
...
4100d8: 6c6c6548 ldnp d8, d25, [x10, #-320]
4100dc: 696e206f ldpsw x15, x8, [x3, #-144]
4100e0: 77206563 .inst 0x77206563 ; undefined
4100e4: 206d7261 .inst 0x206d7261 ; undefined
4100e8: 6c726f57 ldnp d23, d27, [x26, #-224]
4100ec: 6c654864 ldnp d4, d18, [x3, #-432]
4100f0: 6e206f6c umin v12.16b, v27.16b, v0.16b
4100f4: 20656369 .inst 0x20656369 ; undefined
4100f8: 6d726177 ldp d23, d24, [x11, #-224]
4100fc: 726f5720 .inst 0x726f5720 ; undefined
410100: Address 0x0000000000410100 is out of bounds.
The question:
How can I see from the objdump output only, the existence of two separate strings:
"Hello nice warm World"
and
"Hello nice warm World2" ?
Thanks

Identify syscall in arm

When we look at assembly for an x86 CPU, a syscall looks like this:
0F 05 syscall ; LINUX - sys_nanosleep
48 3D 01 F0 FF FF cmp rax, 0FFFFFFFFFFFFF001h
For an ARM CPU, what is the convention: what does a syscall look like in assembly?
The source code for the musl libc library may help: all supported architectures have a small header file implementing the 'syscalls'.
x86_64:
/* Issue a system call that takes no arguments.
 * n is placed in rax (the "a" input constraint) and the value left in rax by
 * the "syscall" instruction is returned (the "=a" output constraint).
 * rcx, r11 and memory are declared as clobbered. */
static __inline long __syscall0(long n)
{
unsigned long ret;
__asm__ __volatile__ ("syscall" : "=a"(ret) : "a"(n) : "rcx", "r11", "memory");
return ret;
}
Arm:
#ifdef __thumb__
/* Avoid use of r7 in asm constraints when producing thumb code,
* since it's reserved as frame pointer and might not be supported. */
#define __ASM____R7__
/* Thumb variant: the asm template copies r7 into a scratch register (%1),
 * moves operand %2 (the first input supplied through __VA_ARGS__) into r7,
 * executes "svc 0", then restores r7 from the scratch register.
 * The macro ends by returning r0 from the enclosing function, so it must be
 * the last statement there and a variable named r0 must be in scope. */
#define __asm_syscall(...) do { \
__asm__ __volatile__ ( "mov %1,r7 ; mov r7,%2 ; svc 0 ; mov r7,%1" \
: "=r"(r0), "=&r"((int){0}) : __VA_ARGS__ : "memory"); \
return r0; \
} while (0)
#else
/* Non-thumb variant: __ASM____R7__ expands to an __asm__("r7") register
 * binding, so the asm template is just "svc 0"; as above, the macro returns
 * r0 from the enclosing function. */
#define __ASM____R7__ __asm__("r7")
#define __asm_syscall(...) do { \
__asm__ __volatile__ ( "svc 0" \
: "=r"(r0) : __VA_ARGS__ : "memory"); \
return r0; \
} while (0)
#endif
Aarch64:
/* AArch64 variant: executes "svc 0" with the caller-supplied input operands
 * (__VA_ARGS__), takes the result from the variable x0, and returns it from
 * the enclosing function. Memory and the condition flags are declared as
 * clobbered. A variable named x0 must be in scope where the macro is used. */
#define __asm_syscall(...) do { \
__asm__ __volatile__ ( "svc 0" \
: "=r"(x0) : __VA_ARGS__ : "memory", "cc"); \
return x0; \
} while (0)
Example for generated code:
/* syscall.c */
/* Executes "svc 0" with the given input operands and returns the variable x0
 * from the enclosing function; x0 must be declared by the function using it. */
#define __asm_syscall(...) do { \
__asm__ __volatile__ ( "svc 0" \
: "=r"(x0) : __VA_ARGS__ : "memory", "cc"); \
return x0; \
} while (0)
/* Zero-argument system call: n is bound to register x8 and the result is read
 * from register x0. The return statement comes from the macro expansion. */
static inline long __syscall0(long n)
{
register long x8 __asm__("x8") = n;
register long x0 __asm__("x0");
__asm_syscall("r"(x8));
}
/* Calls __syscall0 with system call number 1 and discards the result. */
void test(void) {
__syscall0(1);
}
/opt/arm/9/gcc-arm-9.2-2019.12-x86_64-aarch64-none-linux-gnu/bin/aarch64-none-linux-gnu-objdump -D syscall.o
/opt/arm/9/gcc-arm-9.2-2019.12-x86_64-aarch64-none-linux-gnu/bin/aarch64-none-linux-gnu-gcc -c -o syscall.o syscall.c
Disassembly of section .text:
0000000000000000 <__syscall0>:
0: d10043ff sub sp, sp, #0x10
4: f90007e0 str x0, [sp, #8]
8: f94007e8 ldr x8, [sp, #8]
c: d4000001 svc #0x0
10: 910043ff add sp, sp, #0x10
14: d65f03c0 ret
0000000000000018 <test>:
18: a9bf7bfd stp x29, x30, [sp, #-16]!
1c: 910003fd mov x29, sp
20: d2800020 mov x0, #0x1 // #1
24: 97fffff7 bl 0 <__syscall0>
28: d503201f nop
2c: a8c17bfd ldp x29, x30, [sp], #16
30: d65f03c0 ret
This being said, the Arm documentation cannot really be called garbage, even though you found it difficult to find the exact information you were looking for: the Exploration Tools section of their web site is excellent IMHO.
You can find the pseudo-code for the SVC instructions and their exact encodings here and here, and you even could simulate the instructions: Alastair Reid wrote a couple of fascinating articles here regarding ISA formal specifications.

Implement bubble sort by ARM assembly

I want to write an ARM assembly program for my Raspberry Pi terminal: read in several numbers, bubble-sort them in the program, and output the sorted result.
It assembles and links without problems, but "Illegal instruction" appears on the screen when I run the resulting executable (ARM GNU syntax):
@ Bubble-sort attempt over the 32-bit words at src (ARM, GNU as syntax).
@ Register roles:
@   r1 = pointer to the current element
@   r6 = end pointer for the inner loop
@   r4 = outer-loop counter, advanced by 4 per pass
@   r2, r3 = the adjacent pair being compared
.globl _start
_start:
mov r4,#0
ldr r6,=src
add r6,r6,#len @ r6 = src + len
outer:
ldr r1,=src @ every pass starts again at the first element
inner:
ldr r2,[r1]
ldr r3,[r1,#4]
cmp r2,r3
strgt r3,[r1] @ if r2 > r3 (signed): store the pair back in swapped order
Strgt r2,[r1,#4]
add r1,r1,#4
cmp r1,r6
blt inner @ keep going while r1 < r6
add r4,r4,#4
cmp r4,#len
suble r6,r6,#4 @ if r4 <= len: shrink the inner-loop bound by one word ...
ble outer @ ... and run another pass
stop:
@ NOTE(review): r0=0x18 and r1=0x20026 with "swi 0x123456" look like the ARM
@ semihosting exit request rather than a Linux system call; this may be the
@ source of the reported "Illegal instruction" - confirm.
mov r0,#0x18
ldr r1,=0x20026
swi 0x123456
.section .data
src:
.long 2,4,10,8,14,1,20
@ NOTE(review): len is 4, while src holds seven 4-byte words (28 bytes). If len
@ is meant to be the array length in bytes, this value is too small - confirm.
.equ len, 4
Now I can implement bubble sort in ARM assembler on my Raspberry Pi, without reading any input characters; the code is as follows:
@ Bubble-sorts the characters of str in place (ascending byte order), then
@ writes the string to stdout and exits with status 0.
@ ARM (A32) GNU as syntax, Linux EABI system calls (number in r7, "svc 0").
@
@ Register roles:
@   r2 = number of characters to sort (10); later the length passed to write
@   r4 = number of comparisons in the current pass (one fewer each pass)
@   r1 = pointer to the current character
@   r5 = comparisons done so far in the current pass
@   r3, r6 = the adjacent pair being compared
@
@ Comment syntax: GNU as for ARM only accepts '@' for end-of-line comments;
@ '#' starts a comment only in the first column, so a trailing "# ..." after
@ an instruction is rejected by the assembler.
        .section .text
        .global _start
_start:
        mov     r2, #10                 @ number of characters to sort
        mov     r4, r2

loop:                                   @ outer loop: one pass per iteration
        ldr     r1, =str                @ restart at the first character
        sub     r4, r4, #1
        cmp     r4, #0                  @ no comparisons left: sorted
        beq     stop
        mov     r5, #0

loop1:                                  @ inner loop: r4 adjacent comparisons
        ldrb    r3, [r1]
        ldrb    r6, [r1, #1]
        cmp     r3, r6
        strgtb  r3, [r1, #1]            @ if r3 > r6: swap the two bytes
        strgtb  r6, [r1]
        add     r1, r1, #1              @ advance to the next character
        add     r5, r5, #1
        cmp     r5, r4
        bne     loop1                   @ more comparisons left in this pass
        b       loop                    @ pass finished: start the next one

stop:
        @ write(1, str, 11): the ten sorted characters plus the trailing "\n"
        add     r2, r2, #1
        mov     r0, #0x1
        ldr     r1, =str
        mov     r7, #0x4
        svc     0                       @ EABI ignores the immediate; 0 is the conventional value

        @ exit(0)
        mov     r0, #0x0
        mov     r7, #0x1
        svc     0

        .data
str:
        .ascii  "7543216890\n"
The result is: 0123456789 (screenshot omitted)

Can't get newline to print with ARM Linux assembly

I started with code from a Raspberry Pi assembly language book. It prints out 15 in binary as so:
00000000000000000000000000001111pi#raspberrypi:$
I wanted to add a newline at the end, so I implemented the _newline: and new: .ascii "\n" portion of the code.
I reassembled it, but the output remains the same. Did I miss something in outputting the newline?
@ Prints the 32 bits of the value in r6 as '0'/'1' characters, most significant
@ bit first, then tries to print a newline and exits.
@ ARM, GNU as syntax; Linux system calls with the number in r7 (4 = write, 1 = exit).
@ Register roles:
@   r6 = value being printed (15)
@   r9 = single-bit mask, starting at bit 31 and shifted right once per digit
@   r1 = address of the one-character output buffer "string"
.global _start
_start:
mov r6, #15
mov r10, #1
mov r9, r10, lsl #31 @ r9 = 1 << 31
ldr r1, =string
_bits:
tst r6, r9
moveq r0, #48 @ masked bit clear: '0'
movne r0, #49 @ masked bit set: '1'
@ NOTE(review): str stores a full 32-bit word, but "string" is a single byte
@ followed directly by "new". Each store therefore also writes zero bytes over
@ "new" (the strace output shows "\0" being written instead of "\n").
@ A byte store (strb) would only touch "string".
str r0, [r1]
mov r8, r6 @ keep a copy of r6 across the call
bl _write
mov r6, r8
movs r9, r9, lsr #1 @ next lower bit; Z is set once the mask has shifted out
bne _bits
_newline:
mov r0, #1
mov r2, #1
mov r7, #4
ldr r1, =new
swi 0 @ write(1, new, 1)
_exit:
@ r0 is not set here, so the exit status is whatever the last write left in r0
@ (the strace output shows exit(1)).
mov r7, #1
swi 0
_write:
@ write(1, r1, 1); the caller supplies the buffer address in r1
mov r0, #1
mov r2, #1
mov r7, #4
swi 0
bx lr
.data
string: .ascii " "
new: .ascii "\n"
The last few lines of strace output are:
write(1, "1", 11) = 1
write(1, "1", 11) = 1
write(1, "1", 11) = 1
write(1, "1", 11) = 1
write(1, "\0", 11) = 1
exit(1) =?
+++ exited with 1 +++
Your strace output is the clue: write(1, "\0", 11) = 1 shows us that you wrote a 0 byte instead of the ASCII encoding of \n.
When you str r0, [r1], you're storing 4 bytes.
The destination of that store is
.data
string: .ascii " "
new: .ascii "\n"
which is really:
.data
string: .byte ' '
new: .byte '\n'
So each time you store '0' or '1' to string, you're also writing 3 more zero bytes, clobbering your '\n' and 2 more bytes beyond the end of your data section. (It doesn't segfault because you're not right at the end of a page.)
The simplest fix is to use a single-byte store: strb r0, [r1] instead of the word-sized str.

An issue with an ARM assembly function?

I have a simple function written in ARM assembler. The first time it's run, everything works as desired (it prints BOOT\n). However, the second time the function is executed, nothing is printed.
/*
 * __printTest - builds the five characters "BOOT\n" in a stack buffer and
 * passes them to _write with r0 = 0, r1 = buffer address, r2 = 5.
 *
 * NOTE(review): "bl _write" overwrites lr, and lr is not saved beforehand, so
 * the final "bx lr" branches back to the instruction after the bl instead of
 * returning to the caller.
 * NOTE(review): every "str" below stores a 32-bit word; the stores at
 * [sp, #1], [sp, #2] and [sp, #3] are unaligned word stores. Byte stores
 * (strb) would write exactly one character each.
 */
.globl __printTest
.text
.align 2
__printTest:
sub sp, #64 /* yes, I know this is too much */
mov r0, #66 /* 'B' */
str r0, [sp]
mov r0, #79 /* 'O' */
str r0, [sp, #1]
mov r0, #79 /* 'O' */
str r0, [sp, #2]
mov r0, #84 /* 'T' */
str r0, [sp, #3]
mov r0, #10 /* '\n' */
str r0, [sp, #4]
mov r0, #0 /* first argument to _write (presumably the file descriptor) */
mov r1, sp /* second argument: start of the buffer */
mov r2, #5 /* third argument: number of bytes */
bl _write
add sp, #64 /* release the buffer */
bx lr
What could be the issue? I suspect that this somehow corrupts the buffer so that it no longer works. _write is a function that calls the write syscall on Linux using the svc instruction.
The problem is that you're not saving lr.
bl _write
add sp, #64
bx lr
bl _write will overwrite lr which then points to add sp, #64, so your bx lr will just result in an endless loop on the last two instructions.
It should work if you modify your code like this:
__printTest:
push {lr}
sub sp, #64 /* yes, I know this is too much */
....
bl _write
add sp, #64
pop {pc}
As already stated in another answer, you should also use strb instead of str for byte-stores.
This function is pushing 32-bit values into unaligned stack pointer addresses. It should be using strb to write single bytes. For unaligned str, the ARM Architecture Reference Manual says:
if UnalignedSupport() || address<1:0> == ‘00’ then
MemU[address,4] = R[t];
else // Can only occur before ARMv7
MemU[address,4] = bits(32) UNKNOWN;
So depending on your configuration, you might be getting junk in your stack if you're hitting the UNKNOWN case.

Resources