Consider the following examples for calculating the sum of an i32 array.
Example 1: simple for loop
pub fn vec_sum_for_loop_i32(src: &[i32]) -> i32 {
    let mut sum = 0;
    for c in src {
        sum += *c;
    }
    sum
}
Example 2: explicit SIMD sum:
use std::arch::x86_64::*;

// #[inline]
pub fn vec_sum_simd_direct_loop(src: &[i32]) -> i32 {
    #[cfg(debug_assertions)]
    assert!(src.as_ptr() as u64 % 64 == 0);
    #[cfg(debug_assertions)]
    assert!(src.len() % (std::mem::size_of::<__m256i>() / std::mem::size_of::<i32>()) == 0);

    let p_src = src.as_ptr();
    let batch_size = std::mem::size_of::<__m256i>() / std::mem::size_of::<i32>();

    #[cfg(debug_assertions)]
    assert!(src.len() % batch_size == 0);

    let result: i32;
    unsafe {
        let mut offset: isize = 0;
        let total: isize = src.len() as isize;
        let mut curr_sum = _mm256_setzero_si256();

        while offset < total {
            let curr = _mm256_load_epi32(p_src.offset(offset));
            curr_sum = _mm256_add_epi32(curr_sum, curr);
            offset += 8;
        }

        // this can be reduced with hadd.
        let a0 = _mm256_extract_epi32::<0>(curr_sum);
        let a1 = _mm256_extract_epi32::<1>(curr_sum);
        let a2 = _mm256_extract_epi32::<2>(curr_sum);
        let a3 = _mm256_extract_epi32::<3>(curr_sum);
        let a4 = _mm256_extract_epi32::<4>(curr_sum);
        let a5 = _mm256_extract_epi32::<5>(curr_sum);
        let a6 = _mm256_extract_epi32::<6>(curr_sum);
        let a7 = _mm256_extract_epi32::<7>(curr_sum);
        result = a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7;
    }
    result
}
When I benchmarked the code, the first example reached ~23 GB/s (which is close to the theoretical maximum for my RAM speed). The second example only reached 8 GB/s.
Looking at the assembly with cargo asm, the first example is translated into an unrolled, SIMD-optimized loop:
.LBB11_7:
        sum += *c;
        movdqu  xmm2, xmmword ptr [rcx + 4*rax]
        paddd   xmm2, xmm0
        movdqu  xmm0, xmmword ptr [rcx + 4*rax + 16]
        paddd   xmm0, xmm1
        movdqu  xmm1, xmmword ptr [rcx + 4*rax + 32]
        movdqu  xmm3, xmmword ptr [rcx + 4*rax + 48]
        movdqu  xmm4, xmmword ptr [rcx + 4*rax + 64]
        paddd   xmm4, xmm1
        paddd   xmm4, xmm2
        movdqu  xmm2, xmmword ptr [rcx + 4*rax + 80]
        paddd   xmm2, xmm3
        paddd   xmm2, xmm0
        movdqu  xmm0, xmmword ptr [rcx + 4*rax + 96]
        paddd   xmm0, xmm4
        movdqu  xmm1, xmmword ptr [rcx + 4*rax + 112]
        paddd   xmm1, xmm2
        add     rax, 32
        add     r11, -4
        jne     .LBB11_7
.LBB11_8:
        test    r10, r10
        je      .LBB11_11
        lea     r11, [rcx + 4*rax]
        add     r11, 16
        shl     r10, 5
        xor     eax, eax
The second example has no loop unrolling and doesn't even inline the call to _mm256_add_epi32:
...
        movaps  xmmword ptr [rbp + 320], xmm7
        movaps  xmmword ptr [rbp + 304], xmm6
        and     rsp, -32
        mov     r12, rdx
        mov     rdi, rcx
        lea     rcx, [rsp + 32]
        let mut curr_sum = _mm256_setzero_si256();
        call    core::core_arch::x86::avx::_mm256_setzero_si256
        movaps  xmm6, xmmword ptr [rsp + 32]
        movaps  xmm7, xmmword ptr [rsp + 48]
        while offset < total {
        test    r12, r12
        jle     .LBB13_3
        xor     esi, esi
        lea     rbx, [rsp + 384]
        lea     r14, [rsp + 64]
        lea     r15, [rsp + 96]
.LBB13_2:
        let curr = _mm256_load_epi32(p_src.offset(offset));
        mov     rcx, rbx
        mov     rdx, rdi
        call    core::core_arch::x86::avx512f::_mm256_load_epi32
        curr_sum = _mm256_add_epi32(curr_sum, curr);
        movaps  xmmword ptr [rsp + 112], xmm7
        movaps  xmmword ptr [rsp + 96], xmm6
        mov     rcx, r14
        mov     rdx, r15
        mov     r8, rbx
        call    core::core_arch::x86::avx2::_mm256_add_epi32
        movaps  xmm6, xmmword ptr [rsp + 64]
        movaps  xmm7, xmmword ptr [rsp + 80]
        offset += 8;
        add     rsi, 8
        while offset < total {
        add     rdi, 32
        cmp     rsi, r12
...
This is of course a pretty trivial example, and I don't plan to use hand-crafted SIMD for a simple sum. But it still puzzles me why the explicit SIMD is so slow and why using SIMD intrinsics led to such unoptimized code.
It appears you forgot to tell rustc it was allowed to use AVX2 instructions everywhere, so it couldn't inline those functions. Instead, you get a total disaster where only the wrapper functions are compiled as AVX2-using functions, or something like that.
Works fine for me with -O -C target-cpu=skylake-avx512 (https://godbolt.org/z/csY5or43T), so it can inline even the AVX-512VL load you used, _mm256_load_epi32 (see footnote 1), and then optimize it into a memory source operand for vpaddd ymm0, ymm0, ymmword ptr [rdi + 4*rax] (AVX2) inside a tight loop.
In GCC/clang, you get an error like "inlining failed in call to always_inline foobar" in this case, instead of working-but-slow asm. (See this for details.) This is something Rust should probably sort out before this is ready for prime time: either be like MSVC and actually inline the instruction into a function using the intrinsic, or refuse to compile like GCC/clang.
Footnote 1:
See How to emulate _mm256_loadu_epi32 with gcc or clang? if you didn't mean to use AVX512.
With -O -C target-cpu=skylake (just AVX2), it inlines everything else, including vpaddd ymm, but still calls out to a function that copies 32 bytes from memory to memory with AVX vmovaps. It requires AVX512VL to inline the intrinsic, but later in the optimization process it realizes that with no masking, it's just a 256-bit load it should do without a bloated AVX-512 instruction. It's kinda dumb that Intel even provided a no-masking version of _mm256_mask[z]_loadu_epi32 that requires AVX-512. Or dumb that gcc/clang/rustc consider it an AVX512 intrinsic.
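For reference, a minimal sketch of the per-function alternative (my own illustration, not code from the question or answer): mark only the hot function with #[target_feature(enable = "avx2")] so the intrinsics can inline there, use the plain AVX2 unaligned load instead of the AVX-512VL _mm256_load_epi32, and dispatch at runtime.

use std::arch::x86_64::*;

// Illustrative sketch, not the original code: the hot loop opts into AVX2 at the
// function level, so rustc can inline the intrinsics without a global target-cpu bump.
#[target_feature(enable = "avx2")]
unsafe fn vec_sum_avx2(src: &[i32]) -> i32 {
    let mut acc = _mm256_setzero_si256();
    let chunks = src.chunks_exact(8);
    let tail = chunks.remainder();
    for chunk in chunks {
        // Unaligned 256-bit load: plain AVX, no AVX-512VL required.
        let v = _mm256_loadu_si256(chunk.as_ptr() as *const __m256i);
        acc = _mm256_add_epi32(acc, v);
    }
    // Horizontal reduction of the 8 lanes (could also be done with hadd/shuffles).
    let mut lanes = [0i32; 8];
    _mm256_storeu_si256(lanes.as_mut_ptr() as *mut __m256i, acc);
    lanes.iter().sum::<i32>() + tail.iter().sum::<i32>()
}

pub fn vec_sum(src: &[i32]) -> i32 {
    // Runtime dispatch keeps the binary usable on CPUs without AVX2.
    if is_x86_feature_detected!("avx2") {
        unsafe { vec_sum_avx2(src) }
    } else {
        src.iter().sum()
    }
}

Building the whole crate with -C target-cpu=... as above is simpler if you control the machines you deploy to; the attribute-plus-dispatch version trades a feature check per call for a binary that still runs elsewhere.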
Related
I have tried my best to get the vectorclass library to generate AVX2 instructions, but I can't get it to.
I'm using MSVC 2019. Here are the compile options:
/permissive- /ifcOutput "x64\Release" /GS /Qpar /GL /W3 /Gy /Zc:wchar_t /I"D:\Tools\vectorclass" /I"D:\Tools\libzmq/include" /I"D:\Tools\boost\boost_1_79_0" /Zi /Gm- /O2 /Ob2 /sdl /Fd"x64\Release\vc142.pdb" /Zc:inline /D "__AVX2__" /D "ZMQ_STATIC" /D "FILE_INPUT" /D "NDEBUG" /D "WIN32" /D "_CRT_SECURE_NO_WARNINGS" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /errorReport:prompt /WX- /Zc:forScope /std:c17 /arch:AVX2 /Gd /Oi /MT /std:c++17 /FC /Fa"x64\Release" /EHsc /nologo /Fo"x64\Release" /Ot /Fp"x64\Release\RtnLink_MSVC.pch" /diagnostics:column
In addition, I've tried to force it with the macro definitions __AVX2__ and INSTRSET, but no luck.
#define INSTRSET (8)
#define __AVX2__
#pragma warning(disable : 4984) // warning C4984: 'if constexpr' is a C++17 language extension
#include "vectorclass.h"

size_t test(size_t size) {
    Vec8ui incr(8);
    Vec8ui accum(0, 1, 2, 3, 4, 5, 6, 7);
    for (size_t i = 8; i < size; i += 8) {
        accum = accum + accum;
    }
    size_t result = horizontal_max(accum);

    const __m256i incr2 = _mm256_set1_epi32(8);
    __m256i accum2 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    for (size_t i = 8; i < size; i += 8) {
        accum2 = _mm256_add_epi32(accum2, incr2);
    }
    __declspec(align(32)) int32_t values_array[8];
    _mm256_store_si256((__m256i*)values_array, accum2);
    size_t result2 = values_array[0];
    for (int i = 1; i < 8; i++) {
        if (values_array[i] > result2) {
            result2 = values_array[i];
        }
    }
    return result;
}
This compiles to the following 2 loops:
Using vectorclass (no AVX2 instructions):
Vec8ui incr(8);
00007FF7A9BC2E5A mov edx,8
00007FF7A9BC2E5F lea rcx,[incr]
00007FF7A9BC2E63 call Vec8ui::Vec8ui (07FF7A9B58BFEh)
Vec8ui accum(0, 1, 2, 3, 4, 5, 6, 7);
00007FF7A9BC2E68 mov dword ptr [rsp+40h],7
00007FF7A9BC2E70 mov dword ptr [rsp+38h],6
00007FF7A9BC2E78 mov dword ptr [rsp+30h],5
00007FF7A9BC2E80 mov dword ptr [rsp+28h],4
00007FF7A9BC2E88 mov dword ptr [rsp+20h],3
00007FF7A9BC2E90 mov r9d,2
00007FF7A9BC2E96 mov r8d,1
00007FF7A9BC2E9C xor edx,edx
00007FF7A9BC2E9E lea rcx,[accum]
00007FF7A9BC2EA2 call Vec8ui::Vec8ui (07FF7A9B54B99h)
for (size_t i = 8; i < size; i += 8) {
00007FF7A9BC2EA7 mov qword ptr [rbp+98h],8
00007FF7A9BC2EB2 jmp __$EncStackInitStart+0A2h (07FF7A9BC2EC6h)
00007FF7A9BC2EB4 mov rax,qword ptr [rbp+98h]
00007FF7A9BC2EBB add rax,8
00007FF7A9BC2EBF mov qword ptr [rbp+98h],rax
00007FF7A9BC2EC6 mov rax,qword ptr [size]
00007FF7A9BC2ECD cmp qword ptr [rbp+98h],rax
00007FF7A9BC2ED4 jae __$EncStackInitStart+10Fh (07FF7A9BC2F33h)
accum = accum + accum;
00007FF7A9BC2ED6 lea rax,[rbp+4C0h]
00007FF7A9BC2EDD lea rcx,[accum]
00007FF7A9BC2EE1 mov rdi,rax
00007FF7A9BC2EE4 mov rsi,rcx
00007FF7A9BC2EE7 mov ecx,20h
00007FF7A9BC2EEC rep movs byte ptr [rdi],byte ptr [rsi]
00007FF7A9BC2EEE lea rax,[rbp+480h]
00007FF7A9BC2EF5 lea rcx,[accum]
00007FF7A9BC2EF9 mov rdi,rax
00007FF7A9BC2EFC mov rsi,rcx
00007FF7A9BC2EFF mov ecx,20h
00007FF7A9BC2F04 rep movs byte ptr [rdi],byte ptr [rsi]
00007FF7A9BC2F06 lea r8,[rbp+4C0h]
00007FF7A9BC2F0D lea rdx,[rbp+480h]
00007FF7A9BC2F14 lea rcx,[rbp+380h]
00007FF7A9BC2F1B call operator+ (07FF7A9BC29C0h)
00007FF7A9BC2F20 lea rcx,[accum]
00007FF7A9BC2F24 mov rdi,rcx
00007FF7A9BC2F27 mov rsi,rax
00007FF7A9BC2F2A mov ecx,20h
00007FF7A9BC2F2F rep movs byte ptr [rdi],byte ptr [rsi]
}
00007FF7A9BC2F31 jmp __$EncStackInitStart+90h (07FF7A9BC2EB4h)
size_t result = horizontal_max(accum);
00007FF7A9BC2F33 lea rax,[rbp+500h]
00007FF7A9BC2F3A lea rcx,[accum]
00007FF7A9BC2F3E mov rdi,rax
00007FF7A9BC2F41 mov rsi,rcx
00007FF7A9BC2F44 mov ecx,20h
00007FF7A9BC2F49 rep movs byte ptr [rdi],byte ptr [rsi]
00007FF7A9BC2F4B lea rcx,[rbp+500h]
00007FF7A9BC2F52 call horizontal_max<Vec8ui> (07FF7A9B54FB3h)
00007FF7A9BC2F57 mov eax,eax
00007FF7A9BC2F59 mov qword ptr [result],rax
Using intrinsics (we get AVX2 instructions):
const __m256i incr2 = _mm256_set1_epi32(8);
00007FF7A9BC2F60 vmovdqu ymm0,ymmword ptr [__ymm#0000000800000008000000080000000800000008000000080000000800000008 (07FF7A9E87940h)]
00007FF7A9BC2F68 vmovdqu ymmword ptr [rbp+3C0h],ymm0
00007FF7A9BC2F70 vmovdqu ymm0,ymmword ptr [rbp+3C0h]
00007FF7A9BC2F78 vmovdqu ymmword ptr [incr2],ymm0
__m256i accum2 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
00007FF7A9BC2F80 vmovdqu ymm0,ymmword ptr [__ymm#0000000700000006000000050000000400000003000000020000000100000000 (07FF7A9E87900h)]
00007FF7A9BC2F88 vmovdqu ymmword ptr [rbp+400h],ymm0
00007FF7A9BC2F90 vmovdqu ymm0,ymmword ptr [rbp+400h]
00007FF7A9BC2F98 vmovdqu ymmword ptr [accum2],ymm0
for (size_t i = 8; i < size; i += 8) {
00007FF7A9BC2FA0 mov qword ptr [rbp+158h],8
00007FF7A9BC2FAB jmp __$EncStackInitStart+19Bh (07FF7A9BC2FBFh)
00007FF7A9BC2FAD mov rax,qword ptr [rbp+158h]
00007FF7A9BC2FB4 add rax,8
00007FF7A9BC2FB8 mov qword ptr [rbp+158h],rax
00007FF7A9BC2FBF mov rax,qword ptr [size]
00007FF7A9BC2FC6 cmp qword ptr [rbp+158h],rax
00007FF7A9BC2FCD jae __$EncStackInitStart+1D5h (07FF7A9BC2FF9h)
accum2 = _mm256_add_epi32(accum2, incr2);
00007FF7A9BC2FCF vmovdqu ymm0,ymmword ptr [accum2]
00007FF7A9BC2FD7 vpaddd ymm0,ymm0,ymmword ptr [incr2]
00007FF7A9BC2FDF vmovdqu ymmword ptr [rbp+440h],ymm0
00007FF7A9BC2FE7 vmovdqu ymm0,ymmword ptr [rbp+440h]
00007FF7A9BC2FEF vmovdqu ymmword ptr [accum2],ymm0
}
00007FF7A9BC2FF7 jmp __$EncStackInitStart+189h (07FF7A9BC2FADh)
__declspec(align(32)) int32_t values_array[8];
_mm256_store_si256((__m256i*)values_array, accum2);
00007FF7A9BC2FF9 vmovdqu ymm0,ymmword ptr [accum2]
00007FF7A9BC3001 vmovdqa ymmword ptr [values_array],ymm0
size_t result2 = values_array[0];
00007FF7A9BC3009 mov eax,4
00007FF7A9BC300E imul rax,rax,0
00007FF7A9BC3012 movsxd rax,dword ptr values_array[rax]
00007FF7A9BC301A mov qword ptr [result2],rax
for (int i = 1; i < 8; i++) {
00007FF7A9BC3021 mov dword ptr [rbp+1D4h],1
00007FF7A9BC302B jmp __$EncStackInitStart+217h (07FF7A9BC303Bh)
00007FF7A9BC302D mov eax,dword ptr [rbp+1D4h]
00007FF7A9BC3033 inc eax
00007FF7A9BC3035 mov dword ptr [rbp+1D4h],eax
00007FF7A9BC303B cmp dword ptr [rbp+1D4h],8
00007FF7A9BC3042 jge __$EncStackInitStart+250h (07FF7A9BC3074h)
if (values_array[i] > result2) {
00007FF7A9BC3044 movsxd rax,dword ptr [rbp+1D4h]
00007FF7A9BC304B movsxd rax,dword ptr values_array[rax*4]
00007FF7A9BC3053 cmp rax,qword ptr [result2]
00007FF7A9BC305A jbe __$EncStackInitStart+24Eh (07FF7A9BC3072h)
result2 = values_array[i];
00007FF7A9BC305C movsxd rax,dword ptr [rbp+1D4h]
00007FF7A9BC3063 movsxd rax,dword ptr values_array[rax*4]
00007FF7A9BC306B mov qword ptr [result2],rax
This question already has answers here:
After entering _start, is rsp aligned? (1 answer)
Should %rsp be aligned to 16-byte boundary before calling a function in NASM? (1 answer)
Why does the x86-64 / AMD64 System V ABI mandate a 16 byte stack alignment? (1 answer)
I have the following code snippet (https://godbolt.org/z/cE1qE9fvv) which contains a naive and a vectorized version of a dot product.
I decided to make the vectorized version compile as a standalone asm file, as follows:
extern exit

section .text
global _start

_start:
    mov     rax, 8589934593
    mov     QWORD [rsp-72], rax
    mov     rax, 17179869187
    mov     QWORD [rsp-64], rax
    mov     rax, 25769803781
    mov     QWORD [rsp-56], rax
    mov     rax, 34359738375
    mov     QWORD [rsp-48], rax
    mov     rax, 85899345930
    mov     QWORD [rsp-40], rax
    mov     rax, 171798691870
    mov     QWORD [rsp-32], rax
    mov     rax, 257698037810
    mov     QWORD [rsp-24], rax
    mov     rax, 343597383750
    mov     QWORD [rsp-16], rax

    movdqa  xmm1, [rsp-72]
    movdqa  xmm0, [rsp-24]
    pmulld  xmm1, [rsp-40]
    pmulld  xmm0, [rsp-56]
    paddd   xmm0, xmm1
    movdqa  xmm1, xmm0
    psrldq  xmm1, 8
    paddd   xmm0, xmm1
    movdqa  xmm1, xmm0
    psrldq  xmm1, 4
    paddd   xmm0, xmm1
    movd    eax, xmm0

.exit:
    call    exit
I use the following to build: nasm -f elf64 dot_product.asm && gcc -g -no-pie -nostartfiles -o dot_product dot_product.o
The above code segfaults at movdqa xmm0, XMMWORD PTR [rsp-72], which probably means the data is not 16-byte aligned. However, the addresses I see in the debugger seem to indicate the opposite.
Am I misunderstanding something?
This is one of those n00b questions where I'm doing something wrong but don't fully understand what yet.
The xxhash32 algorithm has a nice 16-byte inner loop that can be made faster with SIMD, so, as an exercise, that's what I'm trying to do.
The body of the loop looks like this (numBytes is some multiple of 16):
// C# that gets auto-vectorized. uint4 is a vector of 4 elements.
uint4 state = new uint4(Prime1 + Prime2, Prime2, 0, (uint)-Prime1) + seed;
int count = numBytes >> 4;
for (int i = 0; i < count; ++i) {
    state += *p++ * Prime2;
    state = (state << 13) | (state >> 19);
    state *= Prime1;
}
hash = rol(state.x, 1) + rol(state.y, 7) + rol(state.z, 12) + rol(state.w, 18);
I've translated this into the following SSE2/SSE4.1 intrinsics:
auto prime1 = _mm_set1_epi32(kPrime1);
auto prime2 = _mm_set1_epi32(kPrime2);
auto state = _mm_set_epi32(seed + kPrime1 + kPrime2, seed + kPrime2, seed, seed - kPrime1);
int32_t count = size >> 4; // = size / 16
for (int32_t i = 0; i < count; i++) {
    state = _mm_add_epi32(state, _mm_mullo_epi32(_mm_loadu_si128(p128++), prime2));
    state = _mm_or_si128(_mm_sll_epi32(state, _mm_cvtsi32_si128(13)),
                         _mm_srl_epi32(state, _mm_cvtsi32_si128(19)));
    state = _mm_mullo_epi32(state, prime1);
}
uint32_t temp[4];
_mm_storeu_si128((__m128i*)temp, state);
hash = _lrotl(temp[0], 1) + _lrotl(temp[1], 7) + _lrotl(temp[2], 12) + _lrotl(temp[3], 18);
Here's the disassembly of the inner loop body:
mov rax,qword ptr [p128]
mov qword ptr [rsp+88h],rax
mov rax,qword ptr [rsp+88h]
movdqu xmm0,xmmword ptr [rax]
movdqa xmmword ptr [rsp+90h],xmm0
movdqa xmm0,xmmword ptr [rsp+90h]
movdqa xmmword ptr [rsp+120h],xmm0
mov rax,qword ptr [p128]
add rax,10h
mov qword ptr [p128],rax
movdqa xmm0,xmmword ptr [prime2]
movdqa xmmword ptr [rsp+140h],xmm0
movdqa xmm0,xmmword ptr [rsp+120h]
movdqa xmmword ptr [rsp+130h],xmm0
movdqa xmm0,xmmword ptr [rsp+130h]
pmulld xmm0,xmmword ptr [rsp+140h]
movdqa xmmword ptr [rsp+150h],xmm0
movdqa xmm0,xmmword ptr [rsp+150h]
movdqa xmmword ptr [rsp+160h],xmm0
movdqa xmm0,xmmword ptr [rsp+160h]
movdqa xmmword ptr [rsp+170h],xmm0
movdqa xmm0,xmmword ptr [rsp+20h]
movdqa xmmword ptr [rsp+100h],xmm0
movdqa xmm0,xmmword ptr [rsp+100h]
paddd xmm0,xmmword ptr [rsp+170h]
movdqa xmmword ptr [rsp+180h],xmm0
movdqa xmm0,xmmword ptr [rsp+180h]
movdqa xmmword ptr [rsp+190h],xmm0
movdqa xmm0,xmmword ptr [rsp+190h]
movdqa xmmword ptr [rsp+20h],xmm0
movdqa xmm0,xmmword ptr [rsp+20h]
movdqa xmmword ptr [rsp+1A0h],xmm0
mov eax,13h
movd xmm0,eax
movdqa xmmword ptr [rsp+1B0h],xmm0
movdqa xmm0,xmmword ptr [rsp+1A0h]
psrld xmm0,xmmword ptr [rsp+1B0h]
movdqa xmmword ptr [rsp+1C0h],xmm0
movdqa xmm0,xmmword ptr [rsp+1C0h]
movdqa xmmword ptr [rsp+200h],xmm0
movdqa xmm0,xmmword ptr [rsp+20h]
movdqa xmmword ptr [rsp+1D0h],xmm0
mov eax,0Dh
movd xmm0,eax
movdqa xmmword ptr [rsp+1E0h],xmm0
movdqa xmm0,xmmword ptr [rsp+1D0h]
pslld xmm0,xmmword ptr [rsp+1E0h]
movdqa xmmword ptr [rsp+1F0h],xmm0
movdqa xmm0,xmmword ptr [rsp+1F0h]
movdqa xmmword ptr [rsp+210h],xmm0
movdqa xmm0,xmmword ptr [rsp+200h]
movdqa xmmword ptr [rsp+230h],xmm0
movdqa xmm0,xmmword ptr [rsp+210h]
movdqa xmmword ptr [rsp+220h],xmm0
movdqa xmm0,xmmword ptr [rsp+220h]
por xmm0,xmmword ptr [rsp+230h]
movdqa xmmword ptr [rsp+240h],xmm0
movdqa xmm0,xmmword ptr [rsp+240h]
movdqa xmmword ptr [rsp+250h],xmm0
movdqa xmm0,xmmword ptr [rsp+250h]
movdqa xmmword ptr [rsp+20h],xmm0
movdqa xmm0,xmmword ptr [prime1]
movdqa xmmword ptr [rsp+280h],xmm0
movdqa xmm0,xmmword ptr [rsp+20h]
movdqa xmmword ptr [rsp+270h],xmm0
movdqa xmm0,xmmword ptr [rsp+270h]
pmulld xmm0,xmmword ptr [rsp+280h]
movdqa xmmword ptr [rsp+290h],xmm0
movdqa xmm0,xmmword ptr [rsp+290h]
movdqa xmmword ptr [rsp+2A0h],xmm0
movdqa xmm0,xmmword ptr [rsp+2A0h]
movdqa xmmword ptr [rsp+20h],xmm0
Some questions about the disassembly:
Why so many movdqa instructions? I thought the point of intrinsics was that they map to specific hardware instructions.
Why is only xmm0 used? It looks like data is being shuffled in and out of memory rather than staying in the vector registers (I expected more of the xmmN registers to be used).
This is compiled with Visual C++ 2017; I haven't enabled additional optimizations.
When I run these two snippets over a block of 64 MiB, many times over, the scalar code is about 3 times faster. This is not what I expected to happen; what have I missed?
Okay, this has everything to do with compiler optimization flags and is totally Visual C++ specific.
Once I enabled additional compiler optimization switches, the code got much faster.
The inner loop turns into this:
pmulld xmm0,xmm5
paddd xmm0,xmm3
movdqa xmm3,xmm0
pslld xmm3,xmm2
psrld xmm0,xmm1
por xmm3,xmm0
pmulld xmm3,xmm4
While the documentation says that /Ox is equivalent to some other switches, it wasn't until I actually compiled with /Ox or /O2 that the code ended up looking like that.
Edit: the SIMD result ended up being just 8% faster. The xxhash32 algorithm is already very good superscalar code, so while I expected more, this is what I got. There are some notes about this in the original source.
Some numbers from my computer (Ryzen 1700).
memcpy 11.334895 GiB/s
SIMD 5.737743 GiB/s
Scalar 5.286924 GiB/s
I was hoping to make the xxhash32 algorithm almost as fast as memcpy. I've seen benchmarks suggesting this could be improved, but it's difficult to compare without a comparable baseline; that's why I benchmark against my computer's memcpy performance.
Does anyone know whether Linux supports the use of the x87 FPU in 64-bit mode, i.e. whether the instructions are not trapped and the registers are saved on context switch? I don't want to use it, and I know SSE is the standard in x64 mode; it's just curiosity.
Yes, it is supported. The D language uses that feature: when you use the types float or double it compiles to SSE code, but when you use real, the compiler uses the implementation's most precise type, which on x86 is x87 with its 80-bit format.
https://godbolt.org/z/50kr-H
real square(real num) {
    return num * num;
}

float square(float num) {
    return num * num;
}
compiles to
real example.square(real):
push rbp
mov rbp, rsp
fld tbyte ptr [rbp + 16]
fstp tbyte ptr [rbp - 16]
fld tbyte ptr [rbp - 16]
fmul st(0), st
pop rbp
ret
float example.square(float):
push rbp
mov rbp, rsp
movss dword ptr [rbp - 4], xmm0
movss xmm0, dword ptr [rbp - 4]
mulss xmm0, dword ptr [rbp - 4]
pop rbp
ret
I'm trying to find an optimized method for RGB8 (actually grayscale) to RGB32 image conversion.
The source is an 8-bit grey image; the destination should be a 32-bit grey image (BGRA) with the 4th channel (alpha) ignored. The source address is not guaranteed to be 16-byte aligned, Count is a multiple of 16, and the destination address is 16-byte aligned.
INPUT: 8-bit single-channel grey image
OUTPUT: 32-bit BGRA (alpha channel ignored)
COUNT: image size is a multiple of 16
CPU: x86-32 (SSE2/SSE3 allowed)
Here is my optimized assembly code. Is there an even faster way of conversion?
void ConvertGreyToRgb32Assembler(__m128i* Source, __m128i* Destination, unsigned int Count) {
    static unsigned int __declspec(align(64)) Masks[] = {
        0x80000000, 0x80010101, 0x80020202, 0x80030303,
        0x80040404, 0x80050505, 0x80060606, 0x80070707,
        0x80080808, 0x80090909, 0x800a0a0a, 0x800b0b0b,
        0x800c0c0c, 0x800d0d0d, 0x800e0e0e, 0x800f0f0f
    };

    __asm {
        mov     esi, Source
        mov     edi, Destination
        mov     edx, Count
        xor     ecx, ecx
        movdqa  xmm4, xmmword ptr [Masks + 0]
        movdqa  xmm5, xmmword ptr [Masks + 16]
        movdqa  xmm6, xmmword ptr [Masks + 32]
        movdqa  xmm7, xmmword ptr [Masks + 48]
    l1:
        movdqu  xmm0, xmmword ptr [esi + ecx]
        movdqa  xmm1, xmm0
        movdqa  xmm2, xmm0
        movdqa  xmm3, xmm0
        pshufb  xmm0, xmm4
        pshufb  xmm1, xmm5
        pshufb  xmm2, xmm6
        pshufb  xmm3, xmm7
        movntdq [edi + 0], xmm0
        movntdq [edi + 16], xmm1
        movntdq [edi + 32], xmm2
        movntdq [edi + 48], xmm3
        add     edi, 64
        add     ecx, 16
        cmp     ecx, edx
        jb      l1
    }
}
There is another approach using several PUNPCKLBW and PUNPCKHBW but that seems to be slightly slower.
Update: This is the basic, non-optimized algorithm:
BGRA* Destination = ...
unsigned char* Source = ...
for (unsigned int i = 0; i < Size; i++) {
    Destination[i].Blue = Source[i];
    Destination[i].Green = Source[i];
    Destination[i].Red = Source[i];
}
PS: I also tried using C code with the MS VS2008 SSE compiler intrinsics. It turned out that the compiler generated a lot of unnecessary memory moves, which caused the code to be 10-20% slower than pure assembly.
Update 2: This is the same code by using intrinsics only.
void ConvertGreyToRgb32Assembler(__m128i* Source, __m128i* Destination, unsigned int Count) {
    static const unsigned int __declspec(align(64)) Masks[] = {
        0x80000000, 0x80010101, 0x80020202, 0x80030303,
        0x80040404, 0x80050505, 0x80060606, 0x80070707,
        0x80080808, 0x80090909, 0x800a0a0a, 0x800b0b0b,
        0x800c0c0c, 0x800d0d0d, 0x800e0e0e, 0x800f0f0f
    };

    register __m128i m0 = _mm_load_si128((__m128i*) (Masks + 0));
    register __m128i m1 = _mm_load_si128((__m128i*) (Masks + 4));
    register __m128i m2 = _mm_load_si128((__m128i*) (Masks + 8));
    register __m128i m3 = _mm_load_si128((__m128i*) (Masks + 12));

    for (unsigned int i = 0; i < Count / 16; i++) {
        __m128i r0 = _mm_load_si128(Source + i);
        _mm_stream_si128(Destination + (i * 4) + 0, _mm_shuffle_epi8(r0, m0));
        _mm_stream_si128(Destination + (i * 4) + 1, _mm_shuffle_epi8(r0, m1));
        _mm_stream_si128(Destination + (i * 4) + 2, _mm_shuffle_epi8(r0, m2));
        _mm_stream_si128(Destination + (i * 4) + 3, _mm_shuffle_epi8(r0, m3));
    }
}
Update 3: This is the compiler-generated code, beautified (Visual Studio 2012, all optimizations on):
push ebp
mov ebp, esp
mov edx, dword ptr [ebp+8]
movdqa xmm1, xmmword ptr ds:[Masks + 0]
movdqa xmm2, xmmword ptr ds:[Masks + 16]
movdqa xmm3, xmmword ptr ds:[Masks + 32]
movdqa xmm4, xmmword ptr ds:[Masks + 48]
push esi
test ecx, ecx
je l2
lea esi, [ecx-1]
shr esi, 4
inc esi
l1:
mov ecx, edx
movdqu xmm0, xmmword ptr [ecx]
mov ecx, eax
movdqa xmm5, xmm0
pshufb xmm5, xmm1
movdqa xmmword ptr [ecx], xmm5
movdqa xmm5, xmm0
pshufb xmm5, xmm2
movdqa xmmword ptr [eax+10h], xmm5
movdqa xmm5, xmm0
pshufb xmm5, xmm3
movdqa xmmword ptr [eax+20h], xmm5
lea ecx, [eax+30h]
add edx, 10h
add eax, 40h
dec esi
pshufb xmm0, xmm4
movdqa xmmword ptr [ecx], xmm0
jne l1
l2:
pop esi
pop ebp
ret
It seems that interleaving movdqa with pshufb is somewhat faster.
Update 4: This seems to be the optimal hand-optimized code:
__asm {
    mov     esi, Source
    mov     edi, Destination
    mov     ecx, Count
    movdqu  xmm0, xmmword ptr [esi]
    movdqa  xmm4, xmmword ptr [Masks + 0]
    movdqa  xmm5, xmmword ptr [Masks + 16]
    movdqa  xmm6, xmmword ptr [Masks + 32]
    movdqa  xmm7, xmmword ptr [Masks + 48]
l1:
    dec     ecx
    lea     edi, [edi + 64]
    lea     esi, [esi + 16]
    movdqa  xmm1, xmm0
    movdqa  xmm2, xmm0
    movdqa  xmm3, xmm0
    pshufb  xmm0, xmm4
    movdqa  [edi - 64], xmm0
    pshufb  xmm1, xmm5
    movdqa  [edi - 48], xmm1
    pshufb  xmm2, xmm6
    movdqa  [edi - 32], xmm2
    pshufb  xmm3, xmm7
    movdqa  [edi - 16], xmm3
    movdqu  xmm0, xmmword ptr [esi]
    ja      l1
}
Update 5: This conversion algorithm uses punpck instructions. However, this routine is a bit slower than using masks and pshufb.
for (unsigned int i = 0; i < Count; i += 16) {
    register __m128i r0 = _mm_load_si128(Source++);
    register __m128i r1 = _mm_unpackhi_epi8(r0, r0);
    register __m128i r2 = _mm_unpacklo_epi8(r0, r0);
    register __m128i r3 = _mm_unpackhi_epi8(r1, r1);
    register __m128i r4 = _mm_unpacklo_epi8(r1, r1);
    register __m128i r5 = _mm_unpackhi_epi8(r2, r2);
    register __m128i r6 = _mm_unpacklo_epi8(r2, r2);
    _mm_store_si128(Destination++, r6);
    _mm_store_si128(Destination++, r5);
    _mm_store_si128(Destination++, r4);
    _mm_store_si128(Destination++, r3);
}
Update 6: For the sake of completeness this is the inverse method to convert from 32 bits back to 8 bits grey image.
static void ConvertRgb32ToGrey(const __m128i* Source, __m128i* Destination, unsigned int Count) {
    static const unsigned char __declspec(align(64)) Masks[] = {
        0x00, 0x04, 0x08, 0x0c, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
        0x80, 0x80, 0x80, 0x80, 0x00, 0x04, 0x08, 0x0c, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00, 0x04, 0x08, 0x0c, 0x80, 0x80, 0x80, 0x80,
        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00, 0x04, 0x08, 0x0c,
    };

    register __m128i m0 = _mm_load_si128((__m128i*) (Masks + 0));
    register __m128i m1 = _mm_load_si128((__m128i*) (Masks + 16));
    register __m128i m2 = _mm_load_si128((__m128i*) (Masks + 32));
    register __m128i m3 = _mm_load_si128((__m128i*) (Masks + 48));

    for (unsigned int i = 0; i < Count / 64; i++) {
        __m128i a = _mm_load_si128(Source + (i * 4) + 0);
        __m128i b = _mm_load_si128(Source + (i * 4) + 1);
        __m128i c = _mm_load_si128(Source + (i * 4) + 2);
        __m128i d = _mm_load_si128(Source + (i * 4) + 3);
        a = _mm_shuffle_epi8(a, m0);
        b = _mm_shuffle_epi8(b, m1);
        c = _mm_shuffle_epi8(c, m2);
        d = _mm_shuffle_epi8(d, m3);
        __m128i e = _mm_or_si128(a, b);
        __m128i f = _mm_or_si128(c, d);
        __m128i g = _mm_or_si128(e, f);
        _mm_stream_si128(Destination + i, g);
    }
}
Would try:
__asm {
    mov     esi, Source
    mov     edi, Destination
    mov     ecx, Count
    movdqu  xmm0, xmmword ptr [esi]
    movdqa  xmm4, xmmword ptr [Masks + 0]
    movdqa  xmm5, xmmword ptr [Masks + 16]
    movdqa  xmm6, xmmword ptr [Masks + 32]
    movdqa  xmm7, xmmword ptr [Masks + 48]
l1:
    dec     ecx             // modern Intel can macro-fuse this with jnz if adjacent
    lea     edi, [edi + 64]
    lea     esi, [esi + 16]
    movdqa  xmm1, xmm0
    movdqa  xmm2, xmm0
    movdqa  xmm3, xmm0
    pshufb  xmm0, xmm4
    pshufb  xmm1, xmm5
    pshufb  xmm2, xmm6
    pshufb  xmm3, xmm7
    movntdq [edi - 64], xmm0
    movntdq [edi - 48], xmm1
    movntdq [edi - 32], xmm2
    movntdq [edi - 16], xmm3
    movdqu  xmm0, xmmword ptr [esi]
    jnz     l1
}
Haven't benchmarked it though; assumptions behind these changes:
the movdqu xmm0,... latency can be a little more hidden within the loop (your code has the load of xmm0 followed directly by an instruction using the value in that register)
the add ops on two regs as well as the cmp aren't really all necessary; address generation (lea) and the implicit zero test by dec/jnz can be used. That way, there'll be no EFLAGS dependencies caused by operations on ecx/esi/edi as the only ALU op in the loop is decrementing the loop counter.
In the end, this is likely load/store bound in any case so the arithmetics are "free game"; I therefore expect little difference, even with the arguments as given.
If the input is large, it'd also make sense to strip the "unaligned head/tail" off, i.e. handle the first/last [0..15] bytes with a Duff's device, and use movdqa for the main loop.
Edit:
Running your intrinsics sources through gcc -msse4.2 -O8 -c (GCC 4.7.1) gives the following assembly:
Disassembly of section .text:
0000000000000000 <ConvertGreyToRgb32Assembler>:
0: 85 d2 test edx,edx
2: 74 76 je 7a <ConvertGreyToRgb32Assembler+0x7a>
4: 66 0f 6f 2d 00 00 00 00 movdqa xmm5,XMMWORD PTR [rip+0x0]
# c <ConvertGreyToRgb32Assembler+0xc>
c: 48 89 f8 mov rax,rdi
f: 66 0f 6f 25 00 00 00 00 movdqa xmm4,XMMWORD PTR [rip+0x0]
# 17 <ConvertGreyToRgb32Assembler+0x17>
17: 66 0f 6f 1d 00 00 00 00 movdqa xmm3,XMMWORD PTR [rip+0x0]
# 1f <ConvertGreyToRgb32Assembler+0x1f>
1f: 66 0f 6f 15 00 00 00 00 movdqa xmm2,XMMWORD PTR [rip+0x0]
# 27 <ConvertGreyToRgb32Assembler+0x27>
27: 66 0f 1f 84 00 00 00 00 00 nop WORD PTR [rax+rax*1+0x0]
30: f3 0f 6f 00 movdqu xmm0,XMMWORD PTR [rax]
34: 48 89 f1 mov rcx,rsi
37: 48 83 c0 10 add rax,0x10
3b: 66 0f 6f c8 movdqa xmm1,xmm0
3f: 66 0f 38 00 cd pshufb xmm1,xmm5
44: 66 0f e7 0e movntdq XMMWORD PTR [rsi],xmm1
48: 66 0f 6f c8 movdqa xmm1,xmm0
4c: 66 0f 38 00 cc pshufb xmm1,xmm4
51: 66 0f e7 4e 10 movntdq XMMWORD PTR [rsi+0x10],xmm1
56: 66 0f 6f c8 movdqa xmm1,xmm0
5a: 66 0f 38 00 c2 pshufb xmm0,xmm2
5f: 66 0f 38 00 cb pshufb xmm1,xmm3
64: 66 0f e7 4e 20 movntdq XMMWORD PTR [rsi+0x20],xmm1
69: 66 0f e7 41 30 movntdq XMMWORD PTR [rcx+0x30],xmm0
6e: 89 c1 mov ecx,eax
70: 29 f9 sub ecx,edi
72: 48 83 c6 40 add rsi,0x40
76: 39 ca cmp edx,ecx
78: 77 b6 ja 30 <ConvertGreyToRgb32Assembler+0x30>
7a: f3 c3 repz ret
This reminds me extremely strongly of your initial assembly code. If MSVC creates something significantly worse than that, I'd say it's a bug/limitation in the compiler (version) you used.