Faster assembly optimized way to convert between 8-bit grayscale and RGB32 image with SSE - visual-c++

I'm trying to find an optimized method for RGB8 (actually grayscale) to RGB32 image conversion.
Source is an 8-bit grey image; Destination should be a 32-bit grey image (BGRA) with the 4th channel (alpha) ignored. The Source address is not guaranteed to be 16-byte aligned, Count is a multiple of 16, and the Destination address is 16-byte aligned.
INPUT: 8-bit single-channel grey image
OUTPUT: 32-bit BGRA (alpha channel ignored)
COUNT: Image size is a multiple of 16
CPU: x86-32 (SSE2/SSE3 allowed; the pshufb used below additionally requires SSSE3)
Here is my optimized assembly code. Is there an even faster way of conversion?
void ConvertGreyToRgb32Assembler(__m128i* Source, __m128i* Destination, unsigned int Count) {
static unsigned int __declspec(align(64)) Masks[] = {
0x80000000, 0x80010101, 0x80020202, 0x80030303,
0x80040404, 0x80050505, 0x80060606, 0x80070707,
0x80080808, 0x80090909, 0x800a0a0a, 0x800b0b0b,
0x800c0c0c, 0x800d0d0d, 0x800e0e0e, 0x800f0f0f
};
__asm {
mov esi, Source
mov edi, Destination
mov edx, Count
xor ecx, ecx
movdqa xmm4, xmmword ptr [Masks + 0]
movdqa xmm5, xmmword ptr [Masks + 16]
movdqa xmm6, xmmword ptr [Masks + 32]
movdqa xmm7, xmmword ptr [Masks + 48]
l1:
movdqu xmm0, xmmword ptr [esi + ecx]
movdqa xmm1, xmm0
movdqa xmm2, xmm0
movdqa xmm3, xmm0
pshufb xmm0, xmm4
pshufb xmm1, xmm5
pshufb xmm2, xmm6
pshufb xmm3, xmm7
movntdq [edi + 0], xmm0
movntdq [edi + 16], xmm1
movntdq [edi + 32], xmm2
movntdq [edi + 48], xmm3
add edi, 64
add ecx, 16
cmp ecx, edx
jb l1
}
}
There is another approach using several PUNPCKLBW and PUNPCKHBW but that seems to be slightly slower.
Update: This is the basic, non-optimized algorithm:
BGRA* Destination = ...;
unsigned char* Source = ...;
for (unsigned int i = 0; i < Size; i++) {
Destination[i].Blue = Source[i];
Destination[i].Green = Source[i];
Destination[i].Red = Source[i];
}
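The BGRA struct isn't defined in the post; a layout consistent with the loop above and with the pshufb masks (blue in byte 0, alpha in byte 3) would be:
// Assumed pixel layout: the shuffle masks zero the 4th (alpha) byte,
// while this scalar loop simply leaves alpha untouched.
struct BGRA {
    unsigned char Blue;
    unsigned char Green;
    unsigned char Red;
    unsigned char Alpha;
};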
PS: I also tried using C code with the MS VS2008 SSE compiler intrinsics. It turned out that the compiler generated a lot of unnecessary memory moves, which caused the code to be 10-20% slower than pure assembly.
Update 2: This is the same code by using intrinsics only.
void ConvertGreyToRgb32Assembler(__m128i* Source, __m128i* Destination, unsigned int Count) {
static const unsigned int __declspec(align(64)) Masks[] = {
0x80000000, 0x80010101, 0x80020202, 0x80030303,
0x80040404, 0x80050505, 0x80060606, 0x80070707,
0x80080808, 0x80090909, 0x800a0a0a, 0x800b0b0b,
0x800c0c0c, 0x800d0d0d, 0x800e0e0e, 0x800f0f0f
};
register __m128i m0 = _mm_load_si128((__m128i*) (Masks + 0));
register __m128i m1 = _mm_load_si128((__m128i*) (Masks + 4));
register __m128i m2 = _mm_load_si128((__m128i*) (Masks + 8));
register __m128i m3 = _mm_load_si128((__m128i*) (Masks + 12));
for (unsigned int i = 0; i < Count / 16; i++) {
__m128i r0 = _mm_load_si128(Source + i);
_mm_stream_si128(Destination + (i * 4) + 0, _mm_shuffle_epi8(r0, m0));
_mm_stream_si128(Destination + (i * 4) + 1, _mm_shuffle_epi8(r0, m1));
_mm_stream_si128(Destination + (i * 4) + 2, _mm_shuffle_epi8(r0, m2));
_mm_stream_si128(Destination + (i * 4) + 3, _mm_shuffle_epi8(r0, m3));
}
}
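For reference, a minimal usage sketch for the intrinsics version above (the caller's function and buffer names are my own assumptions, not from the original post); _mm_sfence() is included because _mm_stream_si128/movntdq produce weakly-ordered stores that should be fenced before the destination is read elsewhere:
// Hypothetical call site. _aligned_malloc guarantees the 16-byte alignment
// the loads and streaming stores rely on.
#include <emmintrin.h>
#include <malloc.h>

void ConvertGreyToRgb32Assembler(__m128i* Source, __m128i* Destination, unsigned int Count);

void ConvertFrame(unsigned int Width, unsigned int Height) {
    unsigned int Count = Width * Height;   // assumed to be a multiple of 16
    unsigned char* Grey = (unsigned char*) _aligned_malloc(Count, 16);
    unsigned char* Bgra = (unsigned char*) _aligned_malloc(Count * 4, 16);
    // ... fill Grey ...
    ConvertGreyToRgb32Assembler((__m128i*) Grey, (__m128i*) Bgra, Count);
    _mm_sfence();                          // order the non-temporal stores
    // ... use Bgra ...
    _aligned_free(Bgra);
    _aligned_free(Grey);
}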
Update 3: This is the compiler-generated code (beautified; Visual Studio 2012, all optimizations on):
push ebp
mov ebp, esp
mov edx, dword ptr [ebp+8]
movdqa xmm1, xmmword ptr ds:[Masks + 0]
movdqa xmm2, xmmword ptr ds:[Masks + 16]
movdqa xmm3, xmmword ptr ds:[Masks + 32]
movdqa xmm4, xmmword ptr ds:[Masks + 48]
push esi
test ecx, ecx
je l2
lea esi, [ecx-1]
shr esi, 4
inc esi
l1:
mov ecx, edx
movdqu xmm0, xmmword ptr [ecx]
mov ecx, eax
movdqa xmm5, xmm0
pshufb xmm5, xmm1
movdqa xmmword ptr [ecx], xmm5
movdqa xmm5, xmm0
pshufb xmm5, xmm2
movdqa xmmword ptr [eax+10h], xmm5
movdqa xmm5, xmm0
pshufb xmm5, xmm3
movdqa xmmword ptr [eax+20h], xmm5
lea ecx, [eax+30h]
add edx, 10h
add eax, 40h
dec esi
pshufb xmm0, xmm4
movdqa xmmword ptr [ecx], xmm0
jne l1
l2:
pop esi
pop ebp
ret
It seems that interleaving movdqa with pshufb is somewhat faster.
Update 4: This seems to be the optimal hand-optimized code:
__asm {
mov esi, Source
mov edi, Destination
mov ecx, Count
movdqu xmm0, xmmword ptr [esi]
movdqa xmm4, xmmword ptr [Masks + 0]
movdqa xmm5, xmmword ptr [Masks + 16]
movdqa xmm6, xmmword ptr [Masks + 32]
movdqa xmm7, xmmword ptr [Masks + 48]
l1:
dec ecx
lea edi, [ edi + 64 ]
lea esi, [ esi + 16 ]
movdqa xmm1, xmm0
movdqa xmm2, xmm0
movdqa xmm3, xmm0
pshufb xmm0, xmm4
movdqa [edi - 64], xmm0
pshufb xmm1, xmm5
movdqa [edi - 48], xmm1
pshufb xmm2, xmm6
movdqa [edi - 32], xmm2
pshufb xmm3, xmm7
movdqa [edi - 16], xmm3
movdqu xmm0, xmmword ptr [esi]
jnz l1 // dec sets ZF but not CF, so ja must not be used here
}
Update 5: This conversion algorithm uses the punpck instructions. However, this routine is a bit slower than the mask/pshufb approach.
for (unsigned int i = 0; i < Count; i += 16) {
register __m128i r0 = _mm_load_si128(Source++);
register __m128i r1 = _mm_unpackhi_epi8(r0, r0);
register __m128i r2 = _mm_unpacklo_epi8(r0, r0);
register __m128i r3 = _mm_unpackhi_epi8(r1, r1);
register __m128i r4 = _mm_unpacklo_epi8(r1, r1);
register __m128i r5 = _mm_unpackhi_epi8(r2, r2);
register __m128i r6 = _mm_unpacklo_epi8(r2, r2);
_mm_store_si128(Destination++, r6);
_mm_store_si128(Destination++, r5);
_mm_store_si128(Destination++, r4);
_mm_store_si128(Destination++, r3);
}
Update 6: For the sake of completeness, this is the inverse method to convert from the 32-bit image back to an 8-bit grey image.
static void ConvertRgb32ToGrey(const __m128i* Source, __m128i* Destination, unsigned int Count) {
static const unsigned char __declspec(align(64)) Masks[] = {
0x00, 0x04, 0x08, 0x0c, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x00, 0x04, 0x08, 0x0c, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00, 0x04, 0x08, 0x0c, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00, 0x04, 0x08, 0x0c,
};
register __m128i m0 = _mm_load_si128((__m128i*) (Masks + 0));
register __m128i m1 = _mm_load_si128((__m128i*) (Masks + 16));
register __m128i m2 = _mm_load_si128((__m128i*) (Masks + 32));
register __m128i m3 = _mm_load_si128((__m128i*) (Masks + 48));
for (unsigned int i = 0; i < Count / 64; i++) {
__m128i a = _mm_load_si128(Source + (i * 4) + 0);
__m128i b = _mm_load_si128(Source + (i * 4) + 1);
__m128i c = _mm_load_si128(Source + (i * 4) + 2);
__m128i d = _mm_load_si128(Source + (i * 4) + 3);
a = _mm_shuffle_epi8(a, m0);
b = _mm_shuffle_epi8(b, m1);
c = _mm_shuffle_epi8(c, m2);
d = _mm_shuffle_epi8(d, m3);
__m128i e = _mm_or_si128(a, b);
__m128i f = _mm_or_si128(c, d);
__m128i g = _mm_or_si128(e, f);
_mm_stream_si128(Destination + i, g);
}
}
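For verification, here is a scalar reference of the inverse conversion (my own addition): since the forward conversion replicated the grey value into B, G and R, taking the blue byte of each pixel is exact. Count is the number of source bytes, as in the SIMD version:
// Plain C reference for ConvertRgb32ToGrey: keep byte 0 (blue) of every BGRA pixel.
static void ConvertRgb32ToGreyScalar(const unsigned char* Source, unsigned char* Destination, unsigned int Count) {
    for (unsigned int i = 0; i < Count / 4; i++) {
        Destination[i] = Source[i * 4];
    }
}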

Would try:
__asm {
mov esi, Source
mov edi, Destination
mov ecx, Count
movdqu xmm0, xmmword ptr [esi]
movdqa xmm4, xmmword ptr [Masks + 0]
movdqa xmm5, xmmword ptr [Masks + 16]
movdqa xmm6, xmmword ptr [Masks + 32]
movdqa xmm7, xmmword ptr [Masks + 48]
l1:
dec ecx // modern Intel can macro-fuse this with jnz if adjacent
lea edi, [ edi + 64 ]
lea esi, [ esi + 16 ]
movdqa xmm1, xmm0
movdqa xmm2, xmm0
movdqa xmm3, xmm0
pshufb xmm0, xmm4
pshufb xmm1, xmm5
pshufb xmm2, xmm6
pshufb xmm3, xmm7
movntdq [edi - 64], xmm0
movntdq [edi - 48], xmm1
movntdq [edi - 32], xmm2
movntdq [edi - 16], xmm3
movdqu xmm0, xmmword ptr [esi]
jnz l1
}
Haven't benchmarked it though; assumptions behind these changes:
the movdqu xmm0,... latency can be a little more hidden within the loop (your code has the load of xmm0 followed directly by an instruction using the value in that register)
the add ops on two regs as well as the cmp aren't really all necessary; address generation (lea) and the implicit zero test by dec/jnz can be used. That way, there'll be no EFLAGS dependencies caused by operations on ecx/esi/edi as the only ALU op in the loop is decrementing the loop counter.
In the end, this is likely load/store bound in any case so the arithmetics are "free game"; I therefore expect little difference, even with the arguments as given.
If the input is large, it would make sense to strip the unaligned head/tail off, i.e. handle the first/last [0..15] bytes separately (Duff's-device style) and run the main loop using movdqa; see the sketch below.
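A minimal intrinsics sketch of that idea (my own addition, not benchmarked), assuming Count is the number of source bytes and that a plain byte loop is acceptable for the 0..15-pixel head and tail. Note that realigning the source can leave the destination only 4-byte aligned, so this sketch uses unaligned stores rather than movntdq:
#include <tmmintrin.h>   // SSSE3: _mm_shuffle_epi8

static void ConvertGreyToRgb32Peeled(const unsigned char* Source, unsigned char* Destination, unsigned int Count) {
    static const unsigned int __declspec(align(16)) Masks[] = {
        0x80000000, 0x80010101, 0x80020202, 0x80030303,
        0x80040404, 0x80050505, 0x80060606, 0x80070707,
        0x80080808, 0x80090909, 0x800a0a0a, 0x800b0b0b,
        0x800c0c0c, 0x800d0d0d, 0x800e0e0e, 0x800f0f0f
    };
    __m128i m0 = _mm_load_si128((const __m128i*) (Masks + 0));
    __m128i m1 = _mm_load_si128((const __m128i*) (Masks + 4));
    __m128i m2 = _mm_load_si128((const __m128i*) (Masks + 8));
    __m128i m3 = _mm_load_si128((const __m128i*) (Masks + 12));
    // Head: convert single pixels until Source reaches a 16-byte boundary.
    while (((size_t) Source & 15) != 0 && Count != 0) {
        unsigned char g = *Source++;
        Destination[0] = g; Destination[1] = g; Destination[2] = g; Destination[3] = 0;
        Destination += 4;
        Count--;
    }
    // Main loop: aligned loads from Source; the destination may have lost its
    // 16-byte alignment, so use unaligned stores here instead of movntdq.
    for (; Count >= 16; Count -= 16) {
        __m128i r0 = _mm_load_si128((const __m128i*) Source);
        _mm_storeu_si128((__m128i*) (Destination + 0),  _mm_shuffle_epi8(r0, m0));
        _mm_storeu_si128((__m128i*) (Destination + 16), _mm_shuffle_epi8(r0, m1));
        _mm_storeu_si128((__m128i*) (Destination + 32), _mm_shuffle_epi8(r0, m2));
        _mm_storeu_si128((__m128i*) (Destination + 48), _mm_shuffle_epi8(r0, m3));
        Source += 16;
        Destination += 64;
    }
    // Tail: the remaining 0..15 pixels.
    while (Count--) {
        unsigned char g = *Source++;
        Destination[0] = g; Destination[1] = g; Destination[2] = g; Destination[3] = 0;
        Destination += 4;
    }
}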
Edit:
Running your intrinsics sources through gcc -msse4.2 -O8 -c (GCC 4.7.1) gives the following assembly:
Disassembly of section .text:
0000000000000000 <ConvertGreyToRgb32Assembler>:
0: 85 d2 test edx,edx
2: 74 76 je 7a <ConvertGreyToRgb32Assembler+0x7a>
4: 66 0f 6f 2d 00 00 00 00 movdqa xmm5,XMMWORD PTR [rip+0x0]
# c <ConvertGreyToRgb32Assembler+0xc>
c: 48 89 f8 mov rax,rdi
f: 66 0f 6f 25 00 00 00 00 movdqa xmm4,XMMWORD PTR [rip+0x0]
# 17 <ConvertGreyToRgb32Assembler+0x17>
17: 66 0f 6f 1d 00 00 00 00 movdqa xmm3,XMMWORD PTR [rip+0x0]
# 1f <ConvertGreyToRgb32Assembler+0x1f>
1f: 66 0f 6f 15 00 00 00 00 movdqa xmm2,XMMWORD PTR [rip+0x0]
# 27 <ConvertGreyToRgb32Assembler+0x27>
27: 66 0f 1f 84 00 00 00 00 00 nop WORD PTR [rax+rax*1+0x0]
30: f3 0f 6f 00 movdqu xmm0,XMMWORD PTR [rax]
34: 48 89 f1 mov rcx,rsi
37: 48 83 c0 10 add rax,0x10
3b: 66 0f 6f c8 movdqa xmm1,xmm0
3f: 66 0f 38 00 cd pshufb xmm1,xmm5
44: 66 0f e7 0e movntdq XMMWORD PTR [rsi],xmm1
48: 66 0f 6f c8 movdqa xmm1,xmm0
4c: 66 0f 38 00 cc pshufb xmm1,xmm4
51: 66 0f e7 4e 10 movntdq XMMWORD PTR [rsi+0x10],xmm1
56: 66 0f 6f c8 movdqa xmm1,xmm0
5a: 66 0f 38 00 c2 pshufb xmm0,xmm2
5f: 66 0f 38 00 cb pshufb xmm1,xmm3
64: 66 0f e7 4e 20 movntdq XMMWORD PTR [rsi+0x20],xmm1
69: 66 0f e7 41 30 movntdq XMMWORD PTR [rcx+0x30],xmm0
6e: 89 c1 mov ecx,eax
70: 29 f9 sub ecx,edi
72: 48 83 c6 40 add rsi,0x40
76: 39 ca cmp edx,ecx
78: 77 b6 ja 30 <ConvertGreyToRgb32Assembler+0x30>
7a: f3 c3 repz ret
This reminds me extremely strongly of your initial assembly code. If MSVC creates something significantly worse than that, I'd say it's a bug/limitation in the compiler (version) you used.

Related

Slow SIMD performance - no inlining

Consider the following examples for calculating the sum of an i32 array:
Example1: Simple for loop
pub fn vec_sum_for_loop_i32(src: &[i32]) -> i32 {
let mut sum = 0;
for c in src {
sum += *c;
}
sum
}
Example2: Explicit SIMD sum:
use std::arch::x86_64::*;
// #[inline]
pub fn vec_sum_simd_direct_loop(src: &[i32]) -> i32 {
#[cfg(debug_assertions)]
assert!(src.as_ptr() as u64 % 64 == 0);
#[cfg(debug_assertions)]
assert!(src.len() % (std::mem::size_of::<__m256i>() / std::mem::size_of::<i32>()) == 0);
let p_src = src.as_ptr();
let batch_size = std::mem::size_of::<__m256i>() / std::mem::size_of::<i32>();
#[cfg(debug_assertions)]
assert!(src.len() % batch_size == 0);
let result: i32;
unsafe {
let mut offset: isize = 0;
let total: isize = src.len() as isize;
let mut curr_sum = _mm256_setzero_si256();
while offset < total {
let curr = _mm256_load_epi32(p_src.offset(offset));
curr_sum = _mm256_add_epi32(curr_sum, curr);
offset += 8;
}
// this can be reduced with hadd.
let a0 = _mm256_extract_epi32::<0>(curr_sum);
let a1 = _mm256_extract_epi32::<1>(curr_sum);
let a2 = _mm256_extract_epi32::<2>(curr_sum);
let a3 = _mm256_extract_epi32::<3>(curr_sum);
let a4 = _mm256_extract_epi32::<4>(curr_sum);
let a5 = _mm256_extract_epi32::<5>(curr_sum);
let a6 = _mm256_extract_epi32::<6>(curr_sum);
let a7 = _mm256_extract_epi32::<7>(curr_sum);
result = a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7;
}
result
}
When I benchmarked the code, the first example got ~23 GB/s (which is close to the theoretical maximum for my RAM speed); the second example got ~8 GB/s.
Looking at the assembly with cargo asm, the first example translates into an unrolled, SIMD-optimized loop:
.LBB11_7:
sum += *c;
movdqu xmm2, xmmword, ptr, [rcx, +, 4*rax]
paddd xmm2, xmm0
movdqu xmm0, xmmword, ptr, [rcx, +, 4*rax, +, 16]
paddd xmm0, xmm1
movdqu xmm1, xmmword, ptr, [rcx, +, 4*rax, +, 32]
movdqu xmm3, xmmword, ptr, [rcx, +, 4*rax, +, 48]
movdqu xmm4, xmmword, ptr, [rcx, +, 4*rax, +, 64]
paddd xmm4, xmm1
paddd xmm4, xmm2
movdqu xmm2, xmmword, ptr, [rcx, +, 4*rax, +, 80]
paddd xmm2, xmm3
paddd xmm2, xmm0
movdqu xmm0, xmmword, ptr, [rcx, +, 4*rax, +, 96]
paddd xmm0, xmm4
movdqu xmm1, xmmword, ptr, [rcx, +, 4*rax, +, 112]
paddd xmm1, xmm2
add rax, 32
add r11, -4
jne .LBB11_7
.LBB11_8:
test r10, r10
je .LBB11_11
lea r11, [rcx, +, 4*rax]
add r11, 16
shl r10, 5
xor eax, eax
The second example has no loop unrolling and doesn't even inline the call to _mm256_add_epi32:
...
movaps xmmword, ptr, [rbp, +, 320], xmm7
movaps xmmword, ptr, [rbp, +, 304], xmm6
and rsp, -32
mov r12, rdx
mov rdi, rcx
lea rcx, [rsp, +, 32]
let mut curr_sum = _mm256_setzero_si256();
call core::core_arch::x86::avx::_mm256_setzero_si256
movaps xmm6, xmmword, ptr, [rsp, +, 32]
movaps xmm7, xmmword, ptr, [rsp, +, 48]
while offset < total {
test r12, r12
jle .LBB13_3
xor esi, esi
lea rbx, [rsp, +, 384]
lea r14, [rsp, +, 64]
lea r15, [rsp, +, 96]
.LBB13_2:
let curr = _mm256_load_epi32(p_src.offset(offset));
mov rcx, rbx
mov rdx, rdi
call core::core_arch::x86::avx512f::_mm256_load_epi32
curr_sum = _mm256_add_epi32(curr_sum, curr);
movaps xmmword, ptr, [rsp, +, 112], xmm7
movaps xmmword, ptr, [rsp, +, 96], xmm6
mov rcx, r14
mov rdx, r15
mov r8, rbx
call core::core_arch::x86::avx2::_mm256_add_epi32
movaps xmm6, xmmword, ptr, [rsp, +, 64]
movaps xmm7, xmmword, ptr, [rsp, +, 80]
offset += 8;
add rsi, 8
while offset < total {
add rdi, 32
cmp rsi, r12
...
This is of course a pretty trivial example, and I don't plan to use hand-crafted SIMD for a simple sum. But it still puzzles me why explicit SIMD is so slow and why using SIMD intrinsics led to such unoptimized code.
It appears you forgot to tell rustc it was allowed to use AVX2 instructions everywhere, so it couldn't inline those functions. Instead, you get a total disaster where only the wrapper functions are compiled as AVX2-using functions, or something like that.
Works fine for me with -O -C target-cpu=skylake-avx512 (https://godbolt.org/z/csY5or43T) so it can inline even the AVX512VL load you used, _mm256_load_epi32 (see footnote 1), and then optimize it into a memory source operand for vpaddd ymm0, ymm0, ymmword ptr [rdi + 4*rax] (AVX2) inside a tight loop.
In GCC / clang, you get an error like "inlining failed in call to always_inline foobar" in this case, instead of working but slow asm. (See this for details). This is something Rust should probably sort out before this is ready for prime time, either be like MSVC and actually inline the instruction into a function using the intrinsic, or refuse to compile like GCC/clang.
Footnote 1:
See How to emulate _mm256_loadu_epi32 with gcc or clang? if you didn't mean to use AVX512.
With -O -C target-cpu=skylake (just AVX2), it inlines everything else, including vpaddd ymm, but still calls out to a function that copies 32 bytes from memory to memory with AVX vmovaps. It requires AVX512VL to inline the intrinsic, but later in the optimization process it realizes that with no masking, it's just a 256-bit load it should do without a bloated AVX-512 instruction. It's kinda dumb that Intel even provided a no-masking version of _mm256_mask[z]_loadu_epi32 that requires AVX-512. Or dumb that gcc/clang/rustc consider it an AVX512 intrinsic.
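To illustrate the GCC/clang behaviour described above, here is a small C sketch (function names are mine): without AVX2 enabled for the function, the always_inline intrinsic is rejected at compile time instead of silently becoming a slow out-of-line call:
#include <immintrin.h>

// With plain `gcc -O2 -c` (no -mavx2) the following fails with an
// "inlining failed in call to 'always_inline' '_mm256_add_epi32'" error:
//
//   __m256i add8_plain(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); }

// Enabling AVX2 for just this function (or compiling the whole file with
// -mavx2 / -march=haswell) lets the intrinsic inline to a single vpaddd.
__attribute__((target("avx2")))
__m256i add8_avx2(__m256i a, __m256i b) {
    return _mm256_add_epi32(a, b);
}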

Why does this SIMD code run slower than scalar equivalent?

This is one of those n00b questions where I'm doing something wrong but I don't fully understand yet.
The xxhash32 algorithm has a nice 16 byte inner loop that can be made faster with SIMD, so, as an exercise to myself, this is what I'm trying to do.
The body of the loop looks like this (numBytes is some multiple of 16):
// C# that gets auto-vectorized. uint4 is a vector of 4 elements
uint4 state = new uint4(Prime1 + Prime2, Prime2, 0, (uint)-Prime1) + seed;
int count = numBytes >> 4;
for (int i = 0; i < count; ++i) {
state += *p++ * Prime2;
state = (state << 13) | (state >> 19);
state *= Prime1;
}
hash = rol(state.x, 1) + rol(state.y, 7) + rol(state.z, 12) + rol(state.w, 18);
I've translated this into the following SSE2/SSE4.1 intrinsics:
auto prime1 = _mm_set1_epi32(kPrime1);
auto prime2 = _mm_set1_epi32(kPrime2);
auto state = _mm_set_epi32(seed + kPrime1 + kPrime2, seed + kPrime2, seed, seed - kPrime1);
int32_t count = size >> 4; // = size / 16
for (int32_t i = 0; i < count; i++) {
state = _mm_add_epi32(state, _mm_mullo_epi32(_mm_loadu_si128(p128++), prime2));
state = _mm_or_si128(_mm_sll_epi32(state, _mm_cvtsi32_si128(13)), _mm_srl_epi32(state, _mm_cvtsi32_si128(19)));
state = _mm_mullo_epi32(state, prime1);
}
uint32_t temp[4];
_mm_storeu_si128((__m128i*) temp, state);
hash = _lrotl(temp[0], 1) + _lrotl(temp[1], 7) + _lrotl(temp[2], 12) + _lrotl(temp[3], 18);
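One detail worth trying (my suggestion, not from the original post): the rotate can be written with the immediate-count shift intrinsics, so the shift counts don't have to be materialised in an xmm register:
// Same rotate-left-by-13, using the SSE2 immediate-count shifts.
state = _mm_or_si128(_mm_slli_epi32(state, 13), _mm_srli_epi32(state, 19));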
Here's the disassembly of the inner loop body:
mov rax,qword ptr [p128]
mov qword ptr [rsp+88h],rax
mov rax,qword ptr [rsp+88h]
movdqu xmm0,xmmword ptr [rax]
movdqa xmmword ptr [rsp+90h],xmm0
movdqa xmm0,xmmword ptr [rsp+90h]
movdqa xmmword ptr [rsp+120h],xmm0
mov rax,qword ptr [p128]
add rax,10h
mov qword ptr [p128],rax
movdqa xmm0,xmmword ptr [prime2]
movdqa xmmword ptr [rsp+140h],xmm0
movdqa xmm0,xmmword ptr [rsp+120h]
movdqa xmmword ptr [rsp+130h],xmm0
movdqa xmm0,xmmword ptr [rsp+130h]
pmulld xmm0,xmmword ptr [rsp+140h]
movdqa xmmword ptr [rsp+150h],xmm0
movdqa xmm0,xmmword ptr [rsp+150h]
movdqa xmmword ptr [rsp+160h],xmm0
movdqa xmm0,xmmword ptr [rsp+160h]
movdqa xmmword ptr [rsp+170h],xmm0
movdqa xmm0,xmmword ptr [rsp+20h]
movdqa xmmword ptr [rsp+100h],xmm0
movdqa xmm0,xmmword ptr [rsp+100h]
paddd xmm0,xmmword ptr [rsp+170h]
movdqa xmmword ptr [rsp+180h],xmm0
movdqa xmm0,xmmword ptr [rsp+180h]
movdqa xmmword ptr [rsp+190h],xmm0
movdqa xmm0,xmmword ptr [rsp+190h]
movdqa xmmword ptr [rsp+20h],xmm0
movdqa xmm0,xmmword ptr [rsp+20h]
movdqa xmmword ptr [rsp+1A0h],xmm0
mov eax,13h
movd xmm0,eax
movdqa xmmword ptr [rsp+1B0h],xmm0
movdqa xmm0,xmmword ptr [rsp+1A0h]
psrld xmm0,xmmword ptr [rsp+1B0h]
movdqa xmmword ptr [rsp+1C0h],xmm0
movdqa xmm0,xmmword ptr [rsp+1C0h]
movdqa xmmword ptr [rsp+200h],xmm0
movdqa xmm0,xmmword ptr [rsp+20h]
movdqa xmmword ptr [rsp+1D0h],xmm0
mov eax,0Dh
movd xmm0,eax
movdqa xmmword ptr [rsp+1E0h],xmm0
movdqa xmm0,xmmword ptr [rsp+1D0h]
pslld xmm0,xmmword ptr [rsp+1E0h]
movdqa xmmword ptr [rsp+1F0h],xmm0
movdqa xmm0,xmmword ptr [rsp+1F0h]
movdqa xmmword ptr [rsp+210h],xmm0
movdqa xmm0,xmmword ptr [rsp+200h]
movdqa xmmword ptr [rsp+230h],xmm0
movdqa xmm0,xmmword ptr [rsp+210h]
movdqa xmmword ptr [rsp+220h],xmm0
movdqa xmm0,xmmword ptr [rsp+220h]
por xmm0,xmmword ptr [rsp+230h]
movdqa xmmword ptr [rsp+240h],xmm0
movdqa xmm0,xmmword ptr [rsp+240h]
movdqa xmmword ptr [rsp+250h],xmm0
movdqa xmm0,xmmword ptr [rsp+250h]
movdqa xmmword ptr [rsp+20h],xmm0
movdqa xmm0,xmmword ptr [prime1]
movdqa xmmword ptr [rsp+280h],xmm0
movdqa xmm0,xmmword ptr [rsp+20h]
movdqa xmmword ptr [rsp+270h],xmm0
movdqa xmm0,xmmword ptr [rsp+270h]
pmulld xmm0,xmmword ptr [rsp+280h]
movdqa xmmword ptr [rsp+290h],xmm0
movdqa xmm0,xmmword ptr [rsp+290h]
movdqa xmmword ptr [rsp+2A0h],xmm0
movdqa xmm0,xmmword ptr [rsp+2A0h]
movdqa xmmword ptr [rsp+20h],xmm0
Some questions about the disassembly:
Why so many movdqa instructions? (I thought the purpose of intrinsics was that they map to specific hardware instructions.)
Why is only xmm0 used? It looks like data is being shuffled in and out of memory (I expected more xmmN registers to be used).
This is compiled with Visual C++ 2017; I haven't enabled additional optimizations.
When I run these two snippets over a block of 64 MiB, many times over, the scalar code is about 3 times faster. This is not what I expected to happen; what have I missed?
Okay, this has everything to do with compiler optimization flags and is totally Visual C++ specific.
As I enable additional compiler optimization switches, the code gets much faster.
The inner loop turns into this:
pmulld xmm0,xmm5
paddd xmm0,xmm3
movdqa xmm3,xmm0
pslld xmm3,xmm2
psrld xmm0,xmm1
por xmm3,xmm0
pmulld xmm3,xmm4
While the documentation says that /Ox is equivalent to some other switches, it wasn't until I actually compiled with /Ox or /O2 that the code ended up looking like that.
Edit: the SIMD result ended up being just 8% faster. The xxhash32 algorithm is very good superscalar code, so while I expected more, this is what I got. There are some notes about this in the original source.
Some numbers from my computer (Ryzen 1700).
memcpy 11.334895 GiB/s
SIMD 5.737743 GiB/s
Scalar 5.286924 GiB/s
I was hoping to make the xxhash32 algorithm almost as fast as memcpy. I've seen some benchmarks that suggest this could be improved, but it's difficult to compare without a comparable baseline; that's why I benchmark against my computer's memcpy performance.

NASM Segmentation fault

I'm using a 64-bit Ubuntu 18.04.3 LTS VM and I'm trying to write a simple x64 assembly code that will print "Owned!!!".
Because I don't want any 0x00 or 0x0a bytes and I want the code to be position independent (because I'm learning how to write shellcodes), I wrote it this way:
;hello4.asm attempts to make the code position independent
section .text
global _start
_start:
;clear out the registers we are going to need
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
;write(int fd, char *msg, unsigned int len)
mov al, 4
mov bl, 1
;Owned!!! = 4f,77,6e,65,64,21,21,21
;push !,!,!,d
push 0x21212164
;push e,n,w,O
push 0x656e774f
mov rcx, rsp
mov dl, 8
int 0x80
;exit(int ret)
mov al,1
xor rbx, rbx
int 0x80
This is the output that I'm getting:
user@PC:~/Desktop/exploitsclass/hello_shellcode$ nasm -f elf64 hello4.asm
user@PC:~/Desktop/exploitsclass/hello_shellcode$ ld hello4.o -o hello4
user@PC:~/Desktop/exploitsclass/hello_shellcode$ objdump -d hello4 -M intel
hello4: file format elf64-x86-64
Disassembly of section .text:
0000000000400080 <_start>:
400080: 48 31 c0 xor rax,rax
400083: 48 31 db xor rbx,rbx
400086: 48 31 c9 xor rcx,rcx
400089: 48 31 d2 xor rdx,rdx
40008c: b0 04 mov al,0x4
40008e: b3 01 mov bl,0x1
400090: 68 64 21 21 21 push 0x21212164
400095: 68 4f 77 6e 65 push 0x656e774f
40009a: 48 89 e1 mov rcx,rsp
40009d: b2 08 mov dl,0x8
40009f: cd 80 int 0x80
4000a1: b0 01 mov al,0x1
4000a3: 48 31 db xor rbx,rbx
4000a6: cd 80 int 0x80
user@PC:~/Desktop/exploitsclass/hello_shellcode$ ./hello4
Segmentation fault (core dumped)
How do I fix this?
UPDATE:
I've understood that int 0x80 is intended for 32-bit programs, that I should use syscall instead, and that syscall uses different IDs for each system call.
The new code is:
;hello4.asm attempts to make the code position independent
section .text
global _start
_start:
;clear out the registers we are going to need
xor rax, rax
xor rsi, rsi
xor rdi, rdi
xor rdx, rdx
;write(int fd, char *msg, unsigned int len)
mov al, 1
add di, 1
;Owned!!! = 4f,77,6e,65,64,21,21,21
;push !,!,!,d
push 0x21212164
;push e,n,w,O
push 0x656e774f
mov rsi, rsp
mov dl, 8
syscall
;exit(int ret)
mov al, 60
xor rdi, rdi
syscall
The output is Owne% instead of Owned!!! now (each push on x86-64 writes 8 bytes, so the two dwords end up 8 bytes apart with zero padding between them, and write with a length of 8 prints only "Owne" plus four NUL bytes).
It still needs to be fixed.
With the help of @CertainLach I've written the correct code:
;hello4.asm attempts to make the code position independent
section .text
global _start
_start:
;clear out the registers we are going to need
xor rax, rax
xor rsi, rsi
xor rdi, rdi
xor rdx, rdx
;write(int fd, char *msg, unsigned int len)
mov al, 1
add di, 1
;Owned!!! = 4f,77,6e,65,64,21,21,21
mov rsi, 0x21212164656e774f
push rsi
mov rsi, rsp
mov dl, 8
syscall
;exit(int ret)
mov al, 60
xor rdi, rdi
syscall
This code contains no null bytes or 0x0a bytes and it is position independent, as shown below:
user@PC:~/Desktop/exploitsclass/hello_shellcode$ objdump -d hello4 -M intel
hello4: file format elf64-x86-64
Disassembly of section .text:
0000000000400080 <_start>:
400080: 48 31 c0 xor rax,rax
400083: 48 31 f6 xor rsi,rsi
400086: 48 31 ff xor rdi,rdi
400089: 48 31 d2 xor rdx,rdx
40008c: b0 01 mov al,0x1
40008e: 66 83 c7 01 add di,0x1
400092: 48 be 4f 77 6e 65 64 movabs rsi,0x21212164656e774f
400099: 21 21 21
40009c: 56 push rsi
40009d: 48 89 e6 mov rsi,rsp
4000a0: b2 08 mov dl,0x8
4000a2: 0f 05 syscall
4000a4: b0 3c mov al,0x3c
4000a6: 48 31 ff xor rdi,rdi
4000a9: 0f 05 syscall
This is also a correct way of implementing the solution; it is one byte of code shorter, but it uses 16 bytes of stack and writes 16 bytes (the string plus the zero padding) instead of 8:
user@PC:~/Desktop/exploitsclass/hello_shellcode$ cat hello4.asm
;hello4.asm attempts to make the code position independent
section .text
global _start
_start:
;clear out the registers we are going to need
xor rax, rax
xor rsi, rsi
xor rdi, rdi
xor rdx, rdx
;write(int fd, char *msg, unsigned int len)
mov al, 1
add di, 1
;Owned!!! = 4f,77,6e,65,64,21,21,21
;push !,!,!,d
push 0x21212164
;push e,n,w,O
push 0x656e774f
mov rsi, rsp
mov dl, 16
syscall
;exit(int ret)
mov al, 60
xor rdi, rdi
syscall
user@PC:~/Desktop/exploitsclass/hello_shellcode$ objdump -d hello4 -M intel
hello4: file format elf64-x86-64
Disassembly of section .text:
0000000000400080 <_start>:
400080: 48 31 c0 xor rax,rax
400083: 48 31 f6 xor rsi,rsi
400086: 48 31 ff xor rdi,rdi
400089: 48 31 d2 xor rdx,rdx
40008c: b0 01 mov al,0x1
40008e: 66 83 c7 01 add di,0x1
400092: 68 64 21 21 21 push 0x21212164
400097: 68 4f 77 6e 65 push 0x656e774f
40009c: 48 89 e6 mov rsi,rsp
40009f: b2 10 mov dl,0x10
4000a1: 0f 05 syscall
4000a3: b0 3c mov al,0x3c
4000a5: 48 31 ff xor rdi,rdi
4000a8: 0f 05 syscall
Thank you so much!
Can't answer your comment, but you can't just change int 0x80 to syscall to make it work; the system call numbers differ, e.g. sys_write, which you have here, has id 4 with int 0x80 and id 1 with syscall.
Here you can see numbers for syscall
And here for int 0x80

Assembly - Passing parameters to a function call

I am currently practicing assembly reading by disassembling C programs and trying to understand what they do.
I am stuck with a trivial one: a simple hello world program.
#include <stdio.h>
#include <stdlib.h>
int main() {
printf("Hello, world!");
return(0);
}
When I disassemble main:
(gdb) disassemble main
Dump of assembler code for function main:
0x0000000000400526 <+0>: push rbp
0x0000000000400527 <+1>: mov rbp,rsp
0x000000000040052a <+4>: mov edi,0x4005c4
0x000000000040052f <+9>: mov eax,0x0
0x0000000000400534 <+14>: call 0x400400 <printf@plt>
0x0000000000400539 <+19>: mov eax,0x0
0x000000000040053e <+24>: pop rbp
0x000000000040053f <+25>: ret
I understand the first two lines: the base pointer is saved on the stack (by push rbp, which causes the value of the stack pointer to be decreased by 8, because the stack has "grown") and the value of the stack pointer is saved in the base pointer (so that parameters and local variables can be easily reached through positive and negative offsets, respectively, while the stack can keep "growing").
The third line presents the first issue: why is 0x4005c4 (the address of the "Hello, World!" string) moved into the edi register instead of being pushed onto the stack? Shouldn't the printf function take the address of that string as a parameter? From what I know, functions take parameters from the stack (but here it looks like the parameter is put in that register, edi).
On another post here on StackOverflow I read that "printf@plt" is like a stub function that calls the real printf function. I tried to disassemble that function, but it gets even more confusing:
(gdb) disassemble printf
Dump of assembler code for function __printf:
0x00007ffff7a637b0 <+0>: sub rsp,0xd8
0x00007ffff7a637b7 <+7>: test al,al
0x00007ffff7a637b9 <+9>: mov QWORD PTR [rsp+0x28],rsi
0x00007ffff7a637be <+14>: mov QWORD PTR [rsp+0x30],rdx
0x00007ffff7a637c3 <+19>: mov QWORD PTR [rsp+0x38],rcx
0x00007ffff7a637c8 <+24>: mov QWORD PTR [rsp+0x40],r8
0x00007ffff7a637cd <+29>: mov QWORD PTR [rsp+0x48],r9
0x00007ffff7a637d2 <+34>: je 0x7ffff7a6380b <__printf+91>
0x00007ffff7a637d4 <+36>: movaps XMMWORD PTR [rsp+0x50],xmm0
0x00007ffff7a637d9 <+41>: movaps XMMWORD PTR [rsp+0x60],xmm1
0x00007ffff7a637de <+46>: movaps XMMWORD PTR [rsp+0x70],xmm2
0x00007ffff7a637e3 <+51>: movaps XMMWORD PTR [rsp+0x80],xmm3
0x00007ffff7a637eb <+59>: movaps XMMWORD PTR [rsp+0x90],xmm4
0x00007ffff7a637f3 <+67>: movaps XMMWORD PTR [rsp+0xa0],xmm5
0x00007ffff7a637fb <+75>: movaps XMMWORD PTR [rsp+0xb0],xmm6
0x00007ffff7a63803 <+83>: movaps XMMWORD PTR [rsp+0xc0],xmm7
0x00007ffff7a6380b <+91>: lea rax,[rsp+0xe0]
0x00007ffff7a63813 <+99>: mov rsi,rdi
0x00007ffff7a63816 <+102>: lea rdx,[rsp+0x8]
0x00007ffff7a6381b <+107>: mov QWORD PTR [rsp+0x10],rax
0x00007ffff7a63820 <+112>: lea rax,[rsp+0x20]
0x00007ffff7a63825 <+117>: mov DWORD PTR [rsp+0x8],0x8
0x00007ffff7a6382d <+125>: mov DWORD PTR [rsp+0xc],0x30
0x00007ffff7a63835 <+133>: mov QWORD PTR [rsp+0x18],rax
0x00007ffff7a6383a <+138>: mov rax,QWORD PTR [rip+0x36d70f] # 0x7ffff7dd0f50
0x00007ffff7a63841 <+145>: mov rdi,QWORD PTR [rax]
0x00007ffff7a63844 <+148>: call 0x7ffff7a5b130 <_IO_vfprintf_internal>
0x00007ffff7a63849 <+153>: add rsp,0xd8
0x00007ffff7a63850 <+160>: ret
End of assembler dump.
The two mov operations on eax (mov eax, 0x0) bother me a little as well, since I don't get their role here (but I am more concerned with what I have just described).
Thank you in advance.
gcc is targeting the x86-64 System V ABI, used by all x86-64 systems other than Windows (for various historical reasons). Its calling convention passes the first few args in registers before falling back to the stack. (See also the Wikipedia basic summary of this calling convention.)
And yes, this is different from the crusty old 32-bit calling conventions that use the stack for everything. This is a Good Thing. See also the x86 tag wiki for more links to ABI docs, and tons of other stuff.
0x0000000000400526: push rbp
0x0000000000400527: mov rbp,rsp # stack-frame boilerplate
0x000000000040052a: mov edi,0x4005c4 # first arg
0x000000000040052f: mov eax,0x0 # 0 FP args in vector registers
0x0000000000400534: call 0x400400 <printf@plt>
0x0000000000400539: mov eax,0x0 # return 0. If you'd compiled with optimization, this and the previous mov would be xor eax,eax
0x000000000040053e: pop rbp # clean up stack frame
0x000000000040053f: ret
Pointers to static data fit into 32 bits, which is why it can use mov edi, imm32 instead of movabs rdi, imm64.
Floating-point args are passed in SSE registers (xmm0-xmm7), even to var-args functions. al indicates how many FP args are in vector registers. (Note that C's type promotion rules mean that float args to variadic functions are always promoted to double, which is why printf doesn't have any format specifiers for float, only double and long double).
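As a small illustration (a hypothetical snippet, not part of the question): passing a double to printf makes the compiler load it into xmm0 and set al before the call:
#include <stdio.h>

int main(void) {
    // With gcc -O2 on x86-64 Linux this typically compiles to something like:
    //   movsd  xmm0, QWORD PTR .LC0[rip]   ; the double goes in the 1st vector reg
    //   mov    edi, OFFSET .LC1            ; format string in the 1st integer reg
    //   mov    eax, 1                      ; al = number of vector regs used
    //   call   printf
    printf("%f\n", 3.14);
    return 0;
}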
printf@plt is like a stub function that calls the real printf function.
Yes, that's right. The Procedure Linking Table entry starts out as a jmp to a dynamic linker routine that resolves the symbol and modifies the code in the PLT to turn it into a jmp directly to the address where libc's printf definition is mapped. printf is a weak alias for __printf, which is why gdb chooses the __printf label for that address, after you asked for disassembly of printf.
Dump of assembler code for function __printf:
0x00007ffff7a637b0 <+0>: sub rsp,0xd8 # reserve space
0x00007ffff7a637b7 <+7>: test al,al # check if there were any FP args
0x00007ffff7a637b9 <+9>: mov QWORD PTR [rsp+0x28],rsi # store the integer arg-passing registers to local scratch space
0x00007ffff7a637be <+14>: mov QWORD PTR [rsp+0x30],rdx
0x00007ffff7a637c3 <+19>: mov QWORD PTR [rsp+0x38],rcx
0x00007ffff7a637c8 <+24>: mov QWORD PTR [rsp+0x40],r8
0x00007ffff7a637cd <+29>: mov QWORD PTR [rsp+0x48],r9
0x00007ffff7a637d2 <+34>: je 0x7ffff7a6380b <__printf+91> # skip storing the FP arg-passing regs if there were no FP args
0x00007ffff7a637d4 <+36>: movaps XMMWORD PTR [rsp+0x50],xmm0
0x00007ffff7a637d9 <+41>: movaps XMMWORD PTR [rsp+0x60],xmm1
0x00007ffff7a637de <+46>: movaps XMMWORD PTR [rsp+0x70],xmm2
0x00007ffff7a637e3 <+51>: movaps XMMWORD PTR [rsp+0x80],xmm3
0x00007ffff7a637eb <+59>: movaps XMMWORD PTR [rsp+0x90],xmm4
0x00007ffff7a637f3 <+67>: movaps XMMWORD PTR [rsp+0xa0],xmm5
0x00007ffff7a637fb <+75>: movaps XMMWORD PTR [rsp+0xb0],xmm6
0x00007ffff7a63803 <+83>: movaps XMMWORD PTR [rsp+0xc0],xmm7
branch_target_from_test_je:
0x00007ffff7a6380b <+91>: lea rax,[rsp+0xe0] # some more stuff
So printf's implementation keeps the var-args handling simple by storing all the arg-passing registers (except the first one, which holds the format string) in order into local scratch space. It can walk a pointer through them instead of needing switch-like code to extract the right integer or FP arg. It still needs to keep track of the first 5 integer and first 8 FP args, because they aren't contiguous with the rest of the args pushed by the caller onto the stack.
The Windows 64-bit calling convention's shadow space simplifies this by providing space for a function to dump its register args to the stack contiguous with the args already on the stack, but that's not worth wasting 32 bytes of stack on every call, IMO. (See my answer and comments on other answers on Why does Windows64 use a different calling convention from all other OSes on x86-64?)
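Back on the SysV side, the register save area printf builds corresponds to the ABI's va_list layout; a sketch of one element of that structure (field names follow the ABI document):
// va_arg fetches integer args from reg_save_area while gp_offset < 48
// (6 GP registers * 8 bytes) and FP args while fp_offset < 176
// (48 + 8 XMM registers * 16 bytes); after that it falls back to
// overflow_arg_area, which points at the caller-pushed stack args.
typedef struct {
    unsigned int gp_offset;
    unsigned int fp_offset;
    void *overflow_arg_area;
    void *reg_save_area;
} va_list_tag;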
There is nothing trivial about printf; it's not the first choice for what you are trying to do, but it turned out to be not overly complicated.
Something simpler:
extern unsigned int more_fun ( unsigned int );
unsigned int fun ( unsigned int x )
{
return(more_fun(x)+7);
}
0000000000000000 <fun>:
0: 48 83 ec 08 sub $0x8,%rsp
4: e8 00 00 00 00 callq 9 <fun+0x9>
9: 48 83 c4 08 add $0x8,%rsp
d: 83 c0 07 add $0x7,%eax
10: c3 retq
and the stack is used; eax is used for the return.
now use a pointer
extern unsigned int more_fun ( unsigned int * );
unsigned int fun ( unsigned int x )
{
return(more_fun(&x)+7);
}
0000000000000000 <fun>:
0: 48 83 ec 18 sub $0x18,%rsp
4: 89 7c 24 0c mov %edi,0xc(%rsp)
8: 48 8d 7c 24 0c lea 0xc(%rsp),%rdi
d: e8 00 00 00 00 callq 12 <fun+0x12>
12: 48 83 c4 18 add $0x18,%rsp
16: 83 c0 07 add $0x7,%eax
19: c3 retq
and there you go: edi is used, as in your case.
two pointers
extern unsigned int more_fun ( unsigned int *, unsigned int * );
unsigned int fun ( unsigned int x, unsigned int y )
{
return(more_fun(&x,&y)+7);
}
0000000000000000 <fun>:
0: 48 83 ec 18 sub $0x18,%rsp
4: 89 7c 24 0c mov %edi,0xc(%rsp)
8: 89 74 24 08 mov %esi,0x8(%rsp)
c: 48 8d 7c 24 0c lea 0xc(%rsp),%rdi
11: 48 8d 74 24 08 lea 0x8(%rsp),%rsi
16: e8 00 00 00 00 callq 1b <fun+0x1b>
1b: 48 83 c4 18 add $0x18,%rsp
1f: 83 c0 07 add $0x7,%eax
22: c3 retq
now edi and esi are used. all looking like it is the calling convention to me...
a string
extern unsigned int more_fun ( const char * );
unsigned int fun ( void )
{
return(more_fun("Hello World")+7);
}
0000000000000000 <fun>:
0: 48 83 ec 08 sub $0x8,%rsp
4: bf 00 00 00 00 mov $0x0,%edi
9: e8 00 00 00 00 callq e <fun+0xe>
e: 48 83 c4 08 add $0x8,%rsp
12: 83 c0 07 add $0x7,%eax
15: c3 retq
eax is not prepped as in printf, so perhaps eax has something to do with the number of parameters that follow; try putting more parameters on your printf and see if eax going in changes.
if I add -m32 on my command line then edi is not used.
00000000 <fun>:
0: 83 ec 18 sub $0x18,%esp
3: 68 00 00 00 00 push $0x0
8: e8 fc ff ff ff call 9 <fun+0x9>
d: 83 c4 1c add $0x1c,%esp
10: 83 c0 07 add $0x7,%eax
13: c3 ret
I suspect the push is a placeholder for the linker to push the address of the string when the linker patches up the binary (this was just an object). So my guess is that when you have a 64-bit pointer, the first one or two go into registers, and then the stack is used after it runs out of registers.
Obviously the compiler works, so this is conforming to the compiler's calling convention.
extern unsigned int more_fun ( unsigned int );
unsigned int fun ( unsigned int x )
{
return(more_fun(x+5)+7);
}
0000000000000000 <fun>:
0: 48 83 ec 08 sub $0x8,%rsp
4: 83 c7 05 add $0x5,%edi
7: e8 00 00 00 00 callq c <fun+0xc>
c: 48 83 c4 08 add $0x8,%rsp
10: 83 c0 07 add $0x7,%eax
13: c3 retq
Correction based on Peter's comment: yeah, it does appear that registers are being used here.
And since he mentioned 6 parameters, let's try 7.
extern unsigned int more_fun
(
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int
);
unsigned int fun (
unsigned int a,
unsigned int b,
unsigned int c,
unsigned int d,
unsigned int e,
unsigned int f,
unsigned int g
)
{
return(more_fun(a+1,b+2,c+3,d+4,e+5,f+6,g+7)+17);
}
0000000000000000 <fun>:
0: 48 83 ec 10 sub $0x10,%rsp
4: 83 c1 04 add $0x4,%ecx
7: 83 c2 03 add $0x3,%edx
a: 8b 44 24 18 mov 0x18(%rsp),%eax
e: 83 c6 02 add $0x2,%esi
11: 83 c7 01 add $0x1,%edi
14: 41 83 c1 06 add $0x6,%r9d
18: 41 83 c0 05 add $0x5,%r8d
1c: 83 c0 07 add $0x7,%eax
1f: 50 push %rax
20: e8 00 00 00 00 callq 25 <fun+0x25>
25: 48 83 c4 18 add $0x18,%rsp
29: 83 c0 11 add $0x11,%eax
2c: c3 retq
and sure enough the 7th parameter was pulled from the stack, modified, and put back on the stack before the call. The other 6 are in registers.

Linux syscall in vmlinux and virtual memory

I have found the sys_open code in the vmlinux binary:
c1143c20: 55 push ebp
c1143c21: 89 e5 mov ebp,esp
c1143c23: 83 ec 10 sub esp,0x10
c1143c26: 89 5d f4 mov DWORD PTR [ebp-0xc],ebx
c1143c29: 89 75 f8 mov DWORD PTR [ebp-0x8],esi
c1143c2c: 89 7d fc mov DWORD PTR [ebp-0x4],edi
**c1143c2f: e8 74 bb 46 00 call 0xc15af7a8**
c1143c34: b8 9c ff ff ff mov eax,0xffffff9c
c1143c39: 8b 7d 08 mov edi,DWORD PTR [ebp+0x8]
c1143c3c: 8b 75 0c mov esi,DWORD PTR [ebp+0xc]
c1143c3f: 8b 5d 10 mov ebx,DWORD PTR [ebp+0x10]
c1143c42: 89 fa mov edx,edi
c1143c44: 89 f1 mov ecx,esi
c1143c46: 89 1c 24 mov DWORD PTR [esp],ebx
c1143c49: e8 e2 fd ff ff call 0xc1143a30 // same as above here
c1143c4e: 8b 5d f4 mov ebx,DWORD PTR [ebp-0xc]
c1143c51: 8b 75 f8 mov esi,DWORD PTR [ebp-0x8]
c1143c54: 8b 7d fc mov edi,DWORD PTR [ebp-0x4]
c1143c57: 89 ec mov esp,ebp
c1143c59: 5d pop ebp
c1143c5a: c3 ret
c1143c5b: 90 nop
and here is the same code as seen in virtual memory:
.data:0x00000000 55 push ebp
.data:0x00000001 89e5 mov ebp,esp
.data:0x00000003 83ec10 sub esp,0x10
.data:0x00000006 895df4 mov DWORD PTR [ebp-0xc],ebx
.data:0x00000009 8975f8 mov DWORD PTR [ebp-0x8],esi
.data:0x0000000c 897dfc mov DWORD PTR [ebp-0x4],edi
**.data:0x0000000f 3e8d742600 lea esi,ds:[esi+eiz*1+0x0] **
**.data:0x00000014 b89cffffff mov eax,0xffffff9c**
.data:0x00000019 8b7d08 mov edi,DWORD PTR [ebp+0x8]
.data:0x0000001c 8b750c mov esi,DWORD PTR [ebp+0xc]
.data:0x0000001f 8b5d10 mov ebx,DWORD PTR [ebp+0x10]
.data:0x00000022 89fa mov edx,edi
.data:0x00000024 89f1 mov ecx,esi
.data:0x00000026 891c24 mov DWORD PTR [esp],ebx
.data:0x00000029 e8e2fdffff call func_fffffe10 // same
.data:0x0000002e 8b5df4 mov ebx,DWORD PTR [ebp-0xc]
.data:0x00000031 8b75f8 mov esi,DWORD PTR [ebp-0x8]
.data:0x00000034 8b7dfc mov edi,DWORD PTR [ebp-0x4]
.data:0x00000037 89ec mov esp,ebp
.data:0x00000039 5d pop ebp
.data:0x0000003a c3 ret
I don't understand why e8 74 bb 46 00 becomes 3e 8d 74 26 00 when loaded in memory. The address 0xc15af7a8 contains a simple ret.
c15af7a8: c3 ret
0xc15af7a8 is called 26500 times in the vmlinux file. Why do we call a simple ret instruction?
My kernel is 3.2.0-23 with a default configuration (no KASLR).
The useless ret is a stub that is replaced by the right code once the memory has been mapped.
The code of system calls may be located at different places depending on some non-deterministic choices, and once the memory address is known, the stub is replaced.

Resources