Search over an array of 14 integers, build a mask and return the match on ARMv8-A using NEON - Linux

For my open source project cachegrand we are implementing AArch64 support, and although most of the port is completed, we are sorting out a feature to perform an accelerated array search using NEON instructions.
The logic we use is pretty simple:
- the input is an array of 14 uint32 elements, the value to find, and a mask to ignore certain matches
- the code has to find any value that matches a specific uint32
- and build a bitmask, where the least significant bits of the bitmask correspond to the beginning of the array
- the bitmask is then ANDed with the inverse of the skip indexes mask
- and then the trailing zeros are counted to determine the index of the first occurrence
It's very rare for the skip indexes mask to actually be used; I would say that in 99.9% of cases it will be zero.
I have come up with the following implementation, but I have no experience with ARMv8 NEON instructions and it feels a bit clunky, so I was wondering if there is a way to make it faster and/or better.
For reference, currently the code is compiled only with GCC.
uint8_t hashtable_mcmp_support_hash_search_armv8a_neon_14(
        uint32_t hash,
        volatile uint32_t* hashes,
        uint32_t skip_indexes_mask) {
    uint32x4_t tmp;
    uint32_t compacted_result_mask = 0;
    uint32_t skip_indexes_mask_inv = ~skip_indexes_mask;
    static const int32x4_t shift = {0, 1, 2, 3};
    uint32x4_t cmp_vector = vdupq_n_u32(hash);

    uint32x4_t ring_vector_0_3 = vld1q_u32((hashtable_hash_half_t*)hashes + 0);
    uint32x4_t cmp_vector_0_3 = vceqq_u32(ring_vector_0_3, cmp_vector);
    tmp = vshrq_n_u32(cmp_vector_0_3, 31);
    compacted_result_mask |= vaddvq_u32(vshlq_u32(tmp, shift)) << 0;

    uint32x4_t ring_vector_4_7 = vld1q_u32((hashtable_hash_half_t*)hashes + 4);
    uint32x4_t cmp_vector_4_7 = vceqq_u32(ring_vector_4_7, cmp_vector);
    tmp = vshrq_n_u32(cmp_vector_4_7, 31);
    compacted_result_mask |= vaddvq_u32(vshlq_u32(tmp, shift)) << 4;

    uint32x4_t ring_vector_8_11 = vld1q_u32((hashtable_hash_half_t*)hashes + 8);
    uint32x4_t cmp_vector_8_11 = vceqq_u32(ring_vector_8_11, cmp_vector);
    tmp = vshrq_n_u32(cmp_vector_8_11, 31);
    compacted_result_mask |= vaddvq_u32(vshlq_u32(tmp, shift)) << 8;

    uint32x4_t ring_vector_10_13 = vld1q_u32((hashtable_hash_half_t*)hashes + 10);
    uint32x4_t cmp_vector_10_13 = vceqq_u32(ring_vector_10_13, cmp_vector);
    tmp = vshrq_n_u32(cmp_vector_10_13, 31);
    compacted_result_mask |= vaddvq_u32(vshlq_u32(tmp, shift)) << 10;

    return __builtin_ctz(compacted_result_mask & skip_indexes_mask_inv);
}
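For clarity, this is the plain scalar logic the NEON version is meant to replicate (just a reference sketch for this question, not the optimized code):

// Scalar reference: bit i of the mask is set when hashes[i] == hash;
// clear the skipped indexes, then count trailing zeros.
// Note: like the vectorized versions, the result is undefined when
// there is no match at all (ctz of 0).
static inline uint8_t hashtable_mcmp_support_hash_search_scalar_14(
        uint32_t hash,
        volatile uint32_t* hashes,
        uint32_t skip_indexes_mask) {
    uint32_t compacted_result_mask = 0;
    for (uint8_t index = 0; index < 14; index++) {
        if (hashes[index] == hash) {
            compacted_result_mask |= 1u << index;
        }
    }
    return __builtin_ctz(compacted_result_mask & ~skip_indexes_mask);
}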
Just for reference, here is the AVX2 code:
static inline uint8_t hashtable_mcmp_support_hash_search_avx2_14(
        uint32_t hash,
        volatile uint32_t* hashes,
        uint32_t skip_indexes_mask) {
    uint32_t compacted_result_mask = 0;
    uint32_t skip_indexes_mask_inv = ~skip_indexes_mask;
    __m256i cmp_vector = _mm256_set1_epi32(hash);

    // The second iteration loads elements 6..13; _mm256_loadu_si256 always loads 8 x uint32
    for(uint8_t base_index = 0; base_index < 12; base_index += 6) {
        __m256i ring_vector = _mm256_loadu_si256((__m256i*) (hashes + base_index));
        __m256i result_mask_vector = _mm256_cmpeq_epi32(ring_vector, cmp_vector);

        // Uses _mm256_movemask_ps to reduce the bandwidth
        compacted_result_mask |= (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(result_mask_vector)) << (base_index);
    }

    return _tzcnt_u32(compacted_result_mask & skip_indexes_mask_inv);
}
On a side question, do you think it's worth implementing support for SVE2 instructions? Especially taking into account that this is a pretty simple operation, and it looks like there might not be mandatory support for 256-bit registers (which would probably be the biggest benefit of using SVE2 in this specific context).

Booleans don't need 32 bits each: shrink them to 8 bits as soon as possible with vuzp1 and vmovn before doing further operations.
uint8_t hashtable_mcmp_support_hash_search_armv8a_neon_14(
    uint32_t hash,
    volatile uint32_t* hashes,
    uint32_t skip_indexes_mask)
{
    uint16x8_t tmp16a, tmp16b;
    uint8x8_t tmp8a, tmp8b;
    uint32_t tmp;
    static const uint8x8_t mask = {1, 2, 4, 8, 16, 32, 64, 128};
    uint32x4_t cmp_vector = vdupq_n_u32(hash);
    uint32x4x3_t ring_vector_0_11 = vld1q_u32_x3((uint32_t *)hashes);
    uint32x4_t ring_vector_10_13 = vld1q_u32((uint32_t *)hashes + 10);

    ring_vector_0_11.val[0] = vceqq_u32(ring_vector_0_11.val[0], cmp_vector);
    ring_vector_0_11.val[1] = vceqq_u32(ring_vector_0_11.val[1], cmp_vector);
    ring_vector_0_11.val[2] = vceqq_u32(ring_vector_0_11.val[2], cmp_vector);
    ring_vector_10_13 = vceqq_u32(ring_vector_10_13, cmp_vector);

    // narrow the 32-bit booleans to 16 bits, then to 8 bits
    tmp16a = vuzp1q_u16(vreinterpretq_u16_u32(ring_vector_0_11.val[0]), vreinterpretq_u16_u32(ring_vector_0_11.val[1]));
    tmp16b = vuzp1q_u16(vreinterpretq_u16_u32(ring_vector_0_11.val[2]), vreinterpretq_u16_u32(ring_vector_10_13));
    tmp8a = vmovn_u16(tmp16a);
    tmp8b = vmovn_u16(tmp16b);

    tmp8a = vand_u8(tmp8a, mask);
    tmp8b = vand_u8(tmp8b, mask);
    tmp = (uint32_t)vaddv_u8(tmp8a) | ((uint32_t)vaddv_u8(tmp8b) << 8);
    // the overlapping 10..13 load leaves elements 12 and 13 at bits 14 and 15
    // (with elements 10 and 11 duplicated at bits 12 and 13), so remap them
    tmp = (tmp & 0x0FFF) | ((tmp >> 2) & 0x3000);
    return __builtin_ctz(tmp & ~skip_indexes_mask);
}
And I don't think SVE will bring a meaningful performance boost, since the performance is more or less crippled at the end anyway (vaddv and especially the transfer to general-purpose registers).
If you are dealing with thousands of 14-entry arrays, you should consider redesigning your function to write into an 8-bit array instead of returning in a general-purpose register each and every time. That will eliminate the most time-consuming pipeline hazard, caused by the NEON-to-ARM transfer.
#include <arm_neon.h>
#include <arm_acle.h>

void hashtable_mcmp_support_hash_search_armv8a_neon_14_b(
    uint8_t *pDst,
    uint32_t hash,
    volatile uint32_t* hashes,
    uint32_t skip_indexes_mask, uint32_t number_of_arrays)
{
    uint16x8_t tmp16a, tmp16b;
    uint16x4_t tmp;
    uint8x8_t tmp8a, tmp8b;
    // bit-reversed weights: element j ends up at bit 15-j, so clz yields j directly
    static const uint8x8_t mask = {128, 64, 32, 16, 8, 4, 2, 1};
    uint32x4_t cmp_vector = vdupq_n_u32(hash);
    // reverse the skip mask to match the bit-reversed layout above
    skip_indexes_mask = __rbit(skip_indexes_mask) >> 16;
    uint16x4_t index_mask = vdup_n_u16((uint16_t) skip_indexes_mask);
    uint32x4x4_t ring_vector;

    while (number_of_arrays--)
    {
        // 16 elements per array: 14 hashes plus 8 bytes of padding
        ring_vector = vld1q_u32_x4((uint32_t *)hashes);
        hashes += 16;
        ring_vector.val[0] = vceqq_u32(ring_vector.val[0], cmp_vector);
        ring_vector.val[1] = vceqq_u32(ring_vector.val[1], cmp_vector);
        ring_vector.val[2] = vceqq_u32(ring_vector.val[2], cmp_vector);
        ring_vector.val[3] = vceqq_u32(ring_vector.val[3], cmp_vector);
        tmp16a = vuzp1q_u16(vreinterpretq_u16_u32(ring_vector.val[0]), vreinterpretq_u16_u32(ring_vector.val[1]));
        tmp16b = vuzp1q_u16(vreinterpretq_u16_u32(ring_vector.val[2]), vreinterpretq_u16_u32(ring_vector.val[3]));
        tmp8a = vmovn_u16(tmp16a);
        tmp8b = vmovn_u16(tmp16b);
        tmp8a = vand_u8(tmp8a, mask);
        tmp8b = vand_u8(tmp8b, mask);
        // pack both 8-bit sums into one 16-bit lane
        tmp8a[1] = vaddv_u8(tmp8a);
        tmp8a[0] = vaddv_u8(tmp8b);
        tmp = vbic_u16(vreinterpret_u16_u8(tmp8a), index_mask);
        // leading zeros of the 16-bit mask = index of the first match (16 if none)
        tmp = vclz_u16(tmp);
        vst1_lane_u8(pDst++, vreinterpret_u8_u16(tmp), 0);
    }
}
Above is an "improved" version.
It assumes the arrays are in contiguous memory with 8 bytes of padding each, which is preferable for cache efficiency unless the memory requirement is a problem.
Instead of returning an 8-bit result, it writes the results into memory directly, avoiding the pipeline hazards caused by the NEON-to-ARM transfer.
It still suffers from the vaddv latency (8 cycles). You can unroll the loop so that it processes 2 or even 4 arrays per iteration in order to hide that latency, as sketched below.
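Here is a rough, untested sketch of what a 2x unroll of the loop body could look like (the names are mine, and handling of a leftover odd array is omitted):

// Two independent dependency chains per iteration, so the vaddv latency
// of one array can overlap with the work on the other.
while (number_of_arrays >= 2)
{
    number_of_arrays -= 2;
    uint32x4x4_t ring_a = vld1q_u32_x4((uint32_t *)hashes);
    uint32x4x4_t ring_b = vld1q_u32_x4((uint32_t *)hashes + 16);
    hashes += 32;
    ring_a.val[0] = vceqq_u32(ring_a.val[0], cmp_vector);
    ring_a.val[1] = vceqq_u32(ring_a.val[1], cmp_vector);
    ring_a.val[2] = vceqq_u32(ring_a.val[2], cmp_vector);
    ring_a.val[3] = vceqq_u32(ring_a.val[3], cmp_vector);
    ring_b.val[0] = vceqq_u32(ring_b.val[0], cmp_vector);
    ring_b.val[1] = vceqq_u32(ring_b.val[1], cmp_vector);
    ring_b.val[2] = vceqq_u32(ring_b.val[2], cmp_vector);
    ring_b.val[3] = vceqq_u32(ring_b.val[3], cmp_vector);
    uint16x8_t t16a0 = vuzp1q_u16(vreinterpretq_u16_u32(ring_a.val[0]), vreinterpretq_u16_u32(ring_a.val[1]));
    uint16x8_t t16a1 = vuzp1q_u16(vreinterpretq_u16_u32(ring_a.val[2]), vreinterpretq_u16_u32(ring_a.val[3]));
    uint16x8_t t16b0 = vuzp1q_u16(vreinterpretq_u16_u32(ring_b.val[0]), vreinterpretq_u16_u32(ring_b.val[1]));
    uint16x8_t t16b1 = vuzp1q_u16(vreinterpretq_u16_u32(ring_b.val[2]), vreinterpretq_u16_u32(ring_b.val[3]));
    uint8x8_t t8a0 = vand_u8(vmovn_u16(t16a0), mask);
    uint8x8_t t8a1 = vand_u8(vmovn_u16(t16a1), mask);
    uint8x8_t t8b0 = vand_u8(vmovn_u16(t16b0), mask);
    uint8x8_t t8b1 = vand_u8(vmovn_u16(t16b1), mask);
    uint8x8_t packed_a = t8a0;
    uint8x8_t packed_b = t8b0;
    packed_a[1] = vaddv_u8(t8a0);
    packed_a[0] = vaddv_u8(t8a1);
    packed_b[1] = vaddv_u8(t8b0);
    packed_b[0] = vaddv_u8(t8b1);
    uint16x4_t res_a = vclz_u16(vbic_u16(vreinterpret_u16_u8(packed_a), index_mask));
    uint16x4_t res_b = vclz_u16(vbic_u16(vreinterpret_u16_u8(packed_b), index_mask));
    vst1_lane_u8(pDst++, vreinterpret_u8_u16(res_a), 0);
    vst1_lane_u8(pDst++, vreinterpret_u8_u16(res_b), 0);
}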

Related

How to make VC compiler optimize my code with SIMD better?

I'm now learning SIMD and thinking about how to let the compiler optimize my code better. Now I'm playing with Visual C++ 2013 x86.
I have two input arrays and one output array, and I want to compute the element-wise sum like this:
void computeSum(float* __restrict arr, float* __restrict inp1, float* __restrict inp2, int count)
{
    __declspec(align(16)) float* p1 = inp1;
    __declspec(align(16)) float* p2 = inp2;
    __declspec(align(16)) float* ret = arr;
    while (count > 0)
    {
        ret[0] = p1[0] + p2[0];
        ret[1] = p1[1] + p2[1];
        ret[2] = p1[2] + p2[2];
        ret[3] = p1[3] + p2[3];
        p1 += 4;
        p2 += 4;
        ret += 4;
        count -= 4;
    }
}
I want to tell the compiler that the arrays are aligned to a 16-byte boundary, that none of them overlaps another, and that one loop iteration computes the sum of 4 contiguous floats.
But in the generated code, VC prefers MOVSS/ADDSS and does not use ADDPS as I hoped.
If I configure the project to use the LLVM-vs2013 toolchain, it uses ADDPS to compute the summation.
I know how to use compiler intrinsics to write SIMD code, but that's not what I want.
Are there any more hints VC needs in order to use the ADDPS instruction?
This is the full piece of code.
#include <stdio.h>
#include <stdlib.h>

void computeSum(float* __restrict arr, float* __restrict inp1, float* __restrict inp2, int count)
{
    __declspec(align(16)) float* p1 = inp1;
    __declspec(align(16)) float* p2 = inp2;
    __declspec(align(16)) float* ret = arr;
    while (count > 0)
    {
        ret[0] = p1[0] + p2[0];
        ret[1] = p1[1] + p2[1];
        ret[2] = p1[2] + p2[2];
        ret[3] = p1[3] + p2[3];
        p1 += 4;
        p2 += 4;
        ret += 4;
        count -= 4;
    }
}

int main()
{
    float* inp1 = (float*)_aligned_malloc(sizeof(float) * 128, 16);
    float* inp2 = (float*)_aligned_malloc(sizeof(float) * 128, 16);
    float* result = (float*)_aligned_malloc(sizeof(float) * 128, 16);
    for (int i = 0; i < 128; ++i)
    {
        inp1[i] = inp2[i] = i;
    }
    computeSum(result, inp1, inp2, 128);
    for (int i = 0; i < 128; ++i)
    {
        printf("%f\t", result[i]);
    }
    return 0;
}
Visual C++ 2013 or later defaults to /arch:SSE2 for x86, but you should still check the settings in your Visual Studio project to make sure it hasn't explicitly been set to something else. For x64, /arch:SSE2 is implicit.
The only time that Visual C++ automatically generates multi-lane (like ADDPS) rather than single-lane (ADDSS) instructions is due to the auto-vectorizer. See MSDN for details and pay particular attention to the /Qvec-report:2 switch; note that auto-vectorization will not happen with optimizations disabled, as is common in Debug configurations.
Most SIMD (multi-lane) codegen is better accomplished with explicit intrinsics usage. For a lot of examples of this style of coding, see DirectXMath.
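For instance, the loop in the question maps to explicit SSE intrinsics along these lines (a sketch that assumes count is a multiple of 4 and that all three pointers really are 16-byte aligned):

#include <xmmintrin.h> // SSE intrinsics

void computeSumSSE(float* __restrict arr, float* __restrict inp1,
                   float* __restrict inp2, int count)
{
    for (int i = 0; i < count; i += 4)
    {
        __m128 a = _mm_load_ps(inp1 + i);        // aligned 4-float load
        __m128 b = _mm_load_ps(inp2 + i);
        _mm_store_ps(arr + i, _mm_add_ps(a, b)); // ADDPS + aligned store
    }
}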

Generate Checksum for String

I would like to generate checksums for strings/data:
1. The same data should produce the same checksum
2. Two different data strings can't produce the same checksum. A random collision rate of 0.1% would be negligible
3. No encryption/decryption of the data
4. The checksum length need not be too huge, and it may contain letters and digits
5. It must be very fast and efficient. Imagine generating checksum(s) for 100 MB of text data in less than 5 minutes; generating 1000 checksums for segments of less than 1 KB each should take less than 10 seconds
Any algorithm or implementation references and suggestions are most appreciated.
You can write a custom hash function (C++):

#include <string>

long long hash(const std::string &s) {
    long long k = 7;
    for (size_t i = 0; i < s.length(); i++) {
        k *= 23;
        k += s[i];
        k *= 13;
        k %= 1000000009;
    }
    return k;
}
This should give you a good (collision-free for most inputs) hash value.
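A quick usage sketch:

#include <iostream>

int main() {
    std::cout << hash("some data") << "\n"; // same input -> same value
    std::cout << hash("some data") << "\n";
    std::cout << hash("some datb") << "\n"; // different input -> (almost certainly) different value
    return 0;
}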
A very common, fast checksum is the CRC-32, a 32-bit polynomial cyclic redundancy check. Here are three CRC-32 implementations in C, which vary in speed vs. complexity (this is from http://www.hackersdelight.org/hdcodetxt/crc.c.txt):
#include <stdio.h>
#include <stdlib.h>
// ---------------------------- reverse --------------------------------
// Reverses (reflects) bits in a 32-bit word.
unsigned reverse(unsigned x) {
    x = ((x & 0x55555555) << 1) | ((x >> 1) & 0x55555555);
    x = ((x & 0x33333333) << 2) | ((x >> 2) & 0x33333333);
    x = ((x & 0x0F0F0F0F) << 4) | ((x >> 4) & 0x0F0F0F0F);
    x = (x << 24) | ((x & 0xFF00) << 8) |
        ((x >> 8) & 0xFF00) | (x >> 24);
    return x;
}
// ----------------------------- crc32a --------------------------------
/* This is the basic CRC algorithm with no optimizations. It follows the
logic circuit as closely as possible. */
unsigned int crc32a(unsigned char *message) {
    int i, j;
    unsigned int byte, crc;

    i = 0;
    crc = 0xFFFFFFFF;
    while (message[i] != 0) {
        byte = message[i];           // Get next byte.
        byte = reverse(byte);        // 32-bit reversal.
        for (j = 0; j <= 7; j++) {   // Do eight times.
            if ((int)(crc ^ byte) < 0)
                crc = (crc << 1) ^ 0x04C11DB7;
            else crc = crc << 1;
            byte = byte << 1;        // Ready next msg bit.
        }
        i = i + 1;
    }
    return reverse(~crc);
}
// ----------------------------- crc32b --------------------------------
/* This is the basic CRC-32 calculation with some optimization but no
table lookup. The byte reversal is avoided by shifting the crc reg
right instead of left and by using a reversed 32-bit word to represent
the polynomial.
When compiled to Cyclops with GCC, this function executes in 8 + 72n
instructions, where n is the number of bytes in the input message. It
should be doable in 4 + 61n instructions.
If the inner loop is strung out (approx. 5*8 = 40 instructions),
it would take about 6 + 46n instructions. */
unsigned int crc32b(unsigned char *message) {
    int i, j;
    unsigned int byte, crc, mask;

    i = 0;
    crc = 0xFFFFFFFF;
    while (message[i] != 0) {
        byte = message[i];           // Get next byte.
        crc = crc ^ byte;
        for (j = 7; j >= 0; j--) {   // Do eight times.
            mask = -(crc & 1);
            crc = (crc >> 1) ^ (0xEDB88320 & mask);
        }
        i = i + 1;
    }
    return ~crc;
}
// ----------------------------- crc32c --------------------------------
/* This is derived from crc32b but does table lookup. First the table
itself is calculated, if it has not yet been set up.
Not counting the table setup (which would probably be a separate
function), when compiled to Cyclops with GCC, this function executes in
7 + 13n instructions, where n is the number of bytes in the input
message. It should be doable in 4 + 9n instructions. In any case, two
of the 13 or 9 instructions are load byte.
This is Figure 14-7 in the text. */
unsigned int crc32c(unsigned char *message) {
    int i, j;
    unsigned int byte, crc, mask;
    static unsigned int table[256];

    /* Set up the table, if necessary. */
    if (table[1] == 0) {
        for (byte = 0; byte <= 255; byte++) {
            crc = byte;
            for (j = 7; j >= 0; j--) {   // Do eight times.
                mask = -(crc & 1);
                crc = (crc >> 1) ^ (0xEDB88320 & mask);
            }
            table[byte] = crc;
        }
    }

    /* Through with table setup, now calculate the CRC. */
    i = 0;
    crc = 0xFFFFFFFF;
    while ((byte = message[i]) != 0) {
        crc = (crc >> 8) ^ table[(crc ^ byte) & 0xFF];
        i = i + 1;
    }
    return ~crc;
}
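Note that all three functions treat the input as a NUL-terminated C string, so as written they are not suitable for binary data with embedded zeros. A minimal usage sketch:

int main(void) {
    unsigned char msg[] = "The quick brown fox jumps over the lazy dog";
    // the standard CRC-32 check value for this string is 414fa339
    printf("%08x\n", crc32c(msg));
    return 0;
}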
If you simply google "CRC32", you will get more info than you could possibly absorb.

ArrayFire frame search algorithm crash

I am new to ArrayFire and CUDA development in general; I just started using ArrayFire a couple of days ago after failing miserably with Thrust.
I am building an ArrayFire-based algorithm that is supposed to search a single 32x32 pixel frame in a database of a couple hundred thousand 32x32 frames that are stored into device memory.
At first I initialize a matrix that has 1024 + 1 pixels as rows (I need an extra one to keep a frame group id) and a predefined number (in this case 1000) of frames, indexed by column.
Here's the function that performs the search; if I uncomment "pixels_uint32 = device_frame_ptr[pixel_group_idx];" the program crashes. The pointer seems to be valid, so I do not understand why this happens. Maybe there is something I do not know regarding accessing device memory in this way?
#include <iostream>
#include <stdio.h>
#include <sys/types.h>
#include <arrayfire.h>
#include "utils.h"
using namespace af;
using namespace std;
/////////////////////////// CUDA settings ////////////////////////////////
#define TEST_DEBUG false
#define MAX_NUMBER_OF_FRAMES 1000 // maximum (2499999 frames) X (1024 + 1 pixels per frame) x (2 bytes per pixel) = 5.124.997.950 bytes (~ 5GB)
#define BLOB_FINGERPRINT_SIZE 1024 //32x32
//percentage of macroblocks that should match: 0.9 means 90%
#define MACROBLOCK_COMPARISON_OVERALL_THRESHOLD 768 //1024 * 0.75
//////////////////////// End of CUDA settings ////////////////////////////
array search_frame(array d_db_vec)
{
    try {
        uint number_of_uint32_for_frame = BLOB_FINGERPRINT_SIZE / 2;
        // create one-element array to hold the result of the computation
        array frame_found(1, MAX_NUMBER_OF_FRAMES, u32);
        frame_found = 0;
        gfor (array frame_idx, MAX_NUMBER_OF_FRAMES) {
            // get the blob id; it's the last column of the matrix
            array blob_id = d_db_vec(number_of_uint32_for_frame, frame_idx); // addressing with (pixel_idx, frame_idx)
            // define some hardcoded pixel to search for
            uint8_t searched_r = 0x0;
            uint8_t searched_g = 0x3F;
            uint8_t searched_b = 0x0;
            uint8_t b1 = 0;
            uint8_t g1 = 0;
            uint8_t r1 = 0;
            uint8_t b2 = 0;
            uint8_t g2 = 0;
            uint8_t r2 = 0;
            uint32_t sum1 = 0;
            uint32_t sum2 = 0;
            uint32_t *device_frame_ptr = NULL;
            uint32_t pixels_uint32 = 0;
            uint pixel_match_counter = 0;
            array frame = d_db_vec(span, frame_idx);
            device_frame_ptr = frame.device<uint32_t>();
            for (uint pixel_group_idx = 0; pixel_group_idx < number_of_uint32_for_frame; pixel_group_idx++) {
                // test to see if the whole matrix is traversed
                // d_db_vec(pixel_group_idx, frame_idx) = 0;
                /////////////////////////////// PROBLEMATIC CODE ///////////////////////////////////
                pixels_uint32 = 0x7E007E0;
                //pixels_uint32 = device_frame_ptr[pixel_group_idx]; //why does this crash the program?
                // if I uncomment the above line the program tries to copy the u32 frame into the pixels_uint32 variable
                // something goes wrong, since the pointer device_frame_ptr is not NULL and the elements should be there judging by the lines above
                ////////////////////////////////////////////////////////////////////////////////////
                // splitting the first pixel into its components
                b1 = (pixels_uint32 & 0xF8000000) >> 27; //(input & 11111000000000000000000000000000)
                g1 = (pixels_uint32 & 0x07E00000) >> 21; //(input & 00000111111000000000000000000000)
                r1 = (pixels_uint32 & 0x001F0000) >> 16; //(input & 00000000000111110000000000000000)
                // splitting the second pixel into its components
                b2 = (pixels_uint32 & 0xF800) >> 11; //(input & 00000000000000001111100000000000)
                g2 = (pixels_uint32 & 0x07E0) >> 5;  //(input & 00000000000000000000011111100000)
                r2 = (pixels_uint32 & 0x001F);       //(input & 00000000000000000000000000011111)
                // checking if they are a match
                sum1 = abs(searched_r - r1) + abs(searched_g - g1) + abs(searched_b - b1);
                sum2 = abs(searched_r - r2) + abs(searched_g - g2) + abs(searched_b - b2);
                // if they match, increment the local counter
                pixel_match_counter = (sum1 <= 16) ? pixel_match_counter + 1 : pixel_match_counter;
                pixel_match_counter = (sum2 <= 16) ? pixel_match_counter + 1 : pixel_match_counter;
            }
            bool is_found = pixel_match_counter > MACROBLOCK_COMPARISON_OVERALL_THRESHOLD;
            // write down if the frame is a match or not
            frame_found(0, frame_idx) = is_found ? frame_found(0, frame_idx) : blob_id;
        }
        // test to see if the whole matrix is traversed - this has to print zeroes
        if (TEST_DEBUG)
            print(d_db_vec);
        // return the matches array
        return frame_found;
    } catch (af::exception& e) {
        fprintf(stderr, "%s\n", e.what());
        throw;
    }
}
// make 2 green pixels
uint32_t make_test_pixel_group() {
    uint32_t b1 = 0x0;       //11111000000000000000000000000000
    uint32_t g1 = 0x7E00000; //00000111111000000000000000000000
    uint32_t r1 = 0x0;       //00000000000111110000000000000000
    uint32_t b2 = 0x0;       //00000000000000001111100000000000
    uint32_t g2 = 0x7E0;     //00000000000000000000011111100000
    uint32_t r2 = 0x0;       //00000000000000000000000000011111
    uint32_t green_pix = b1 | g1 | r1 | b2 | g2 | r2;
    return green_pix;
}
int main(int argc, char ** argv)
{
    info();
    /////////////////////////////////////// CREATE THE DATABASE ///////////////////////////////////////
    uint number_of_uint32_for_frame = BLOB_FINGERPRINT_SIZE / 2;
    array d_db_vec(number_of_uint32_for_frame + 1, // fingerprint size + 1 extra u32 for blob id
                   MAX_NUMBER_OF_FRAMES,           // number of frames
                   u32);                           // type of elements is 32-bit unsigned integer (unsigned) with the configuration RGBRGB (565565)
    if (TEST_DEBUG == true) {
        for (uint frame_idx = 0; frame_idx < MAX_NUMBER_OF_FRAMES; frame_idx++) {
            for (uint pix_idx = 0; pix_idx < number_of_uint32_for_frame; pix_idx++) {
                d_db_vec(pix_idx, frame_idx) = make_test_pixel_group(); // fill everything with green :D
            }
        }
    } else {
        d_db_vec = rand(number_of_uint32_for_frame + 1, MAX_NUMBER_OF_FRAMES);
    }
    cout << "Setting blob ids. \n\n";
    for (uint frame_idx = 0; frame_idx < MAX_NUMBER_OF_FRAMES; frame_idx++) {
        // set the blob id to 123456
        d_db_vec(number_of_uint32_for_frame, frame_idx) = 123456; // blob_id = 123456
    }
    if (TEST_DEBUG)
        print(d_db_vec);
    cout << "Done setting blob ids. \n\n";
    //////////////////////////////////// CREATE THE SEARCHED FRAME ///////////////////////////////////
    // to be done, for now we use the hardcoded values at line 37-39 to simulate the searched pixel:
    //37    uint8_t searched_r = 0x0;
    //38    uint8_t searched_g = 0x3F;
    //39    uint8_t searched_b = 0x0;
    ///////////////////////////////////////////// SEARCH /////////////////////////////////////////////
    clock_t timer = startTimer();
    for (int i = 0; i < 1000; i++) {
        array frame_found = search_frame(d_db_vec);
        if (TEST_DEBUG)
            print(frame_found);
    }
    stopTimer(timer);
    return 0;
}
Here is the console output with the line commented:
arrayfire/examples/helloworld$ ./helloworld
ArrayFire v1.9.1 (64-bit Linux, build 9af23ea)
License: Server (27000#server.accelereyes.com)
CUDA toolkit 5.0, driver 304.54
GPU0 Tesla C2075, 5376 MB, Compute 2.0
Memory Usage: 5312 MB free (5376 MB total)
Setting blob ids.
Done setting blob ids.
Time: 0.03 seconds.
Here is the console output with the line uncommented:
arrayfire/examples/helloworld$ ./helloworld
ArrayFire v1.9.1 (64-bit Linux, build 9af23ea)
License: Server (27000#server.accelereyes.com)
CUDA toolkit 5.0, driver 304.54
GPU0 Tesla C2075, 5376 MB, Compute 2.0
Memory Usage: 5312 MB free (5376 MB total)
Setting blob ids.
Done setting blob ids.
Segmentation fault
Thanks in advance for any help on this issue. I really tried everything but without success.
Disclaimer: I am the lead developer of ArrayFire. I see that you have posted on the AccelerEyes forums as well, but I am posting here to clear up some common issues with your code.
Do not use .device(), .host(), .scalar() inside a gfor loop. This will cause divergences inside the GFOR loop, and GFOR was not designed for this.
You can not index into a device pointer. The pointer refers to a location on the GPU. When you do device_frame_ptr[pixel_group_idx], the system looks for the equivalent position on the CPU. This is the reason for your segmentation fault.
Use vectorized code. For example, you don't need the inner for loop of the gfor. Instead of doing b1 = (pixels_uint32 & 0xF8000000) >> 27; inside a for loop, you can do array B1 = (frame & 0xF8000000) >> 27;. That is, instead of getting data back to the CPU and using a for loop, you do the entire operation on the GPU.
Don't use if-else or ternary operators inside GFOR. These cause divergences again. For example, use pixel_match_counter = sum(sum1 <= 16) + sum(sum2 <= 16); and frame_found(0, frame_idx) = is_found * frame_found(0, frame_idx) + (1 - is_found) * blob_id;
I have answered the particular problem you are facing. If you have any follow-up questions, please follow up on our forums and/or our support email. Stack Overflow is good for asking a specific question, but not for debugging your entire program.

OpenCL image2d_t writing mostly zeros

I am trying to use OpenCL and image2d_t objects to speed up image convolution. When I noticed that the output was a blank image of all zeros, I simplified the OpenCL kernel to a basic read from the input and write to the output (shown below). With a little bit of tweaking, I got it to write a few scattered pixels of the image into the output image.
I have verified that the image is intact up until the call to read_imageui() in the OpenCL kernel. I wrote the image to GPU memory with CommandQueue::enqueueWriteImage() and immediately read it back into a brand new buffer in CPU memory with CommandQueue::enqueueReadImage(). The result of this call matched the original input image. However, when I retrieve the pixels with read_imageui() in the kernel, the vast majority of the pixels are set to 0.
C++ source:
int height = 112;
int width = 9216;
unsigned int numPixels = height * width;
unsigned int numInputBytes = numPixels * sizeof(uint16_t);
unsigned int numDuplicatedInputBytes = numInputBytes * 4;
unsigned int numOutputBytes = numPixels * sizeof(int32_t);
cl::size_t<3> origin;
origin.push_back(0);
origin.push_back(0);
origin.push_back(0);
cl::size_t<3> region;
region.push_back(width);
region.push_back(height);
region.push_back(1);
std::ifstream imageFile("hri_vis_scan.dat", std::ifstream::binary);
checkErr(imageFile.is_open() ? CL_SUCCESS : -1, "hri_vis_scan.dat");
uint16_t *image = new uint16_t[numPixels];
imageFile.read((char *) image, numInputBytes);
imageFile.close();
// duplicate our single channel image into all 4 channels for Image2D
cl_ushort4 *imageDuplicated = new cl_ushort4[numPixels];
for (int i = 0; i < numPixels; i++)
    for (int j = 0; j < 4; j++)
        imageDuplicated[i].s[j] = image[i];
cl::Buffer imageBufferOut(context, CL_MEM_WRITE_ONLY, numOutputBytes, NULL, &err);
checkErr(err, "Buffer::Buffer()");
cl::ImageFormat inFormat;
inFormat.image_channel_data_type = CL_UNSIGNED_INT16;
inFormat.image_channel_order = CL_RGBA;
cl::Image2D bufferIn(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, inFormat, width, height, 0, imageDuplicated, &err);
checkErr(err, "Image2D::Image2D()");
cl::ImageFormat outFormat;
outFormat.image_channel_data_type = CL_UNSIGNED_INT16;
outFormat.image_channel_order = CL_RGBA;
cl::Image2D bufferOut(context, CL_MEM_WRITE_ONLY, outFormat, width, height, 0, NULL, &err);
checkErr(err, "Image2D::Image2D()");
int32_t *imageResult = new int32_t[numPixels];
memset(imageResult, 0, numOutputBytes);
cl_int4 *imageResultDuplicated = new cl_int4[numPixels];
for (int i = 0; i < numPixels; i++)
    for (int j = 0; j < 4; j++)
        imageResultDuplicated[i].s[j] = 0;
std::ifstream kernelFile("convolutionKernel.cl");
checkErr(kernelFile.is_open() ? CL_SUCCESS : -1, "convolutionKernel.cl");
std::string imageProg(std::istreambuf_iterator<char>(kernelFile), (std::istreambuf_iterator<char>()));
cl::Program::Sources imageSource(1, std::make_pair(imageProg.c_str(), imageProg.length() + 1));
cl::Program imageProgram(context, imageSource);
err = imageProgram.build(devices, "");
checkErr(err, "Program::build()");
cl::Kernel basic(imageProgram, "basic", &err);
checkErr(err, "Kernel::Kernel()");
basic.setArg(0, bufferIn);
basic.setArg(1, bufferOut);
basic.setArg(2, imageBufferOut);
queue.finish();
cl_ushort4 *imageDuplicatedTest = new cl_ushort4[numPixels];
for (int i = 0; i < numPixels; i++)
{
    imageDuplicatedTest[i].s[0] = 0;
    imageDuplicatedTest[i].s[1] = 0;
    imageDuplicatedTest[i].s[2] = 0;
    imageDuplicatedTest[i].s[3] = 0;
}
double gpuTimer = clock();
err = queue.enqueueReadImage(bufferIn, CL_FALSE, origin, region, 0, 0, imageDuplicatedTest, NULL, NULL);
checkErr(err, "CommandQueue::enqueueReadImage()");
// Output from above matches input image
err = queue.enqueueNDRangeKernel(basic, cl::NullRange, cl::NDRange(height, width), cl::NDRange(1, 1), NULL, NULL);
checkErr(err, "CommandQueue::enqueueNDRangeKernel()");
queue.flush();
err = queue.enqueueReadImage(bufferOut, CL_TRUE, origin, region, 0, 0, imageResultDuplicated, NULL, NULL);
checkErr(err, "CommandQueue::enqueueReadImage()");
queue.flush();
err = queue.enqueueReadBuffer(imageBufferOut, CL_TRUE, 0, numOutputBytes, imageResult, NULL, NULL);
checkErr(err, "CommandQueue::enqueueReadBuffer()");
queue.finish();
OpenCL kernel:
__kernel void basic(__read_only image2d_t input, __write_only image2d_t output, __global int *result)
{
    const sampler_t smp = CLK_NORMALIZED_COORDS_TRUE | //Natural coordinates
                          CLK_ADDRESS_NONE |           //Clamp to zeros
                          CLK_FILTER_NEAREST;          //Don't interpolate
    int2 coord = (get_global_id(1), get_global_id(0));
    uint4 pixel = read_imageui(input, smp, coord);
    result[coord.s0 + coord.s1 * 9216] = pixel.s0;
    write_imageui(output, coord, pixel);
}
The coordinates in the kernel are currently mapped to (x, y) = (width, height).
The input image is a single channel greyscale image with 16 bits per pixel, which is why I had to duplicate the channels to fit into OpenCL's Image2D. The output after convolution will be 32 bits per pixel, which is why numOutputBytes is set to that. Also, although the width and height appear weird, the input image's dimensions are 9216x7824, so I'm only taking a portion of it to test the code first, so it doesn't take forever.
I added in a write to global memory after reading from the image in the kernel to see if the issue was reading the image or writing the image. After the kernel executes, this section of global memory also contains mostly zeros.
Any help would be greatly appreciated!
The documentation for read_imageui states that
Furthermore, the read_imagei and read_imageui calls that take integer coordinates must use a sampler with normalized coordinates set to CLK_NORMALIZED_COORDS_FALSE and addressing mode set to CLK_ADDRESS_CLAMP_TO_EDGE, CLK_ADDRESS_CLAMP or CLK_ADDRESS_NONE; otherwise the values returned are undefined.
But you're creating a sampler with CLK_NORMALIZED_COORDS_TRUE (while apparently passing in non-normalized coordinates), so the reads are undefined.
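So a sampler along these lines should give defined results (a sketch of the kernel-side fix; note the (int2) cast on the coordinates, which the original kernel is also missing):

const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | // integer coordinates
                      CLK_ADDRESS_CLAMP_TO_EDGE |   // defined out-of-range behavior
                      CLK_FILTER_NEAREST;           // no interpolation

// (int2)(x, y) is a vector literal; plain (x, y) is a comma expression
// that would broadcast only the last value into both components
int2 coord = (int2)(get_global_id(1), get_global_id(0));
uint4 pixel = read_imageui(input, smp, coord);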

Parallel optimization of a checksum algorithm?

The below code sample is an implementation of CRC-CCITT that I'm using in one of my projects.
public static unsafe ushort CRC(byte * it, byte * end)
{
    unchecked
    {
        ushort crc = 0xFFFF;
        ushort quick = 0;

        for (;;)
        {
            ushort tmp = (ushort)((crc >> 8) ^ (*it));
            crc <<= 8;
            quick = (ushort)(tmp ^ (tmp >> 4));
            crc ^= quick;
            quick <<= 5;
            crc ^= quick;
            quick <<= 7;
            crc ^= quick;
            if (it == end)
                break;
            it++;
        }
        return crc;
    }
}
The CRC-CCITT uses the following polynomial:
(X^16 + X^12 + X^5 + 1)
Q: The above polynomial is nothing more than a series of add/multiply operations. The basic laws of mathematics state that add/multiply ops are interchangeable, so expressions like
SUM(from 1 to 10) == SUM(from 1 to 5) + SUM(from 6 to 10) are true.
I need to optimize the above code; it is probably the most frequently called thing in my project (120 times/sec at least). Having considered the above, would this be doable with a CRC checksum? I'm considering using Parallel.For(...) to do the trick; does that even make sense? Anyone have any suggestions?
Update:
120 times per connection, actually. I'm handling at least 15 simultaneous incoming connections with data rates of 120 Hz etc. Byte arrays can vary - theoretical max = 65k bytes, but that's rarely the case; most often it's circa 1k bytes.
Could this solve your problem?
(Sorry, I don't know how to circumvent a deleted post!)
// inspired by http://automationwiki.com/index.php?title=CRC-16-CCITT
static ushort[] crc_table = {
0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50a5,
0x60c6, 0x70e7, 0x8108, 0x9129, 0xa14a, 0xb16b,
0xc18c, 0xd1ad, 0xe1ce, 0xf1ef, 0x1231, 0x0210,
0x3273, 0x2252, 0x52b5, 0x4294, 0x72f7, 0x62d6,
0x9339, 0x8318, 0xb37b, 0xa35a, 0xd3bd, 0xc39c,
0xf3ff, 0xe3de, 0x2462, 0x3443, 0x0420, 0x1401,
0x64e6, 0x74c7, 0x44a4, 0x5485, 0xa56a, 0xb54b,
0x8528, 0x9509, 0xe5ee, 0xf5cf, 0xc5ac, 0xd58d,
0x3653, 0x2672, 0x1611, 0x0630, 0x76d7, 0x66f6,
0x5695, 0x46b4, 0xb75b, 0xa77a, 0x9719, 0x8738,
0xf7df, 0xe7fe, 0xd79d, 0xc7bc, 0x48c4, 0x58e5,
0x6886, 0x78a7, 0x0840, 0x1861, 0x2802, 0x3823,
0xc9cc, 0xd9ed, 0xe98e, 0xf9af, 0x8948, 0x9969,
0xa90a, 0xb92b, 0x5af5, 0x4ad4, 0x7ab7, 0x6a96,
0x1a71, 0x0a50, 0x3a33, 0x2a12, 0xdbfd, 0xcbdc,
0xfbbf, 0xeb9e, 0x9b79, 0x8b58, 0xbb3b, 0xab1a,
0x6ca6, 0x7c87, 0x4ce4, 0x5cc5, 0x2c22, 0x3c03,
0x0c60, 0x1c41, 0xedae, 0xfd8f, 0xcdec, 0xddcd,
0xad2a, 0xbd0b, 0x8d68, 0x9d49, 0x7e97, 0x6eb6,
0x5ed5, 0x4ef4, 0x3e13, 0x2e32, 0x1e51, 0x0e70,
0xff9f, 0xefbe, 0xdfdd, 0xcffc, 0xbf1b, 0xaf3a,
0x9f59, 0x8f78, 0x9188, 0x81a9, 0xb1ca, 0xa1eb,
0xd10c, 0xc12d, 0xf14e, 0xe16f, 0x1080, 0x00a1,
0x30c2, 0x20e3, 0x5004, 0x4025, 0x7046, 0x6067,
0x83b9, 0x9398, 0xa3fb, 0xb3da, 0xc33d, 0xd31c,
0xe37f, 0xf35e, 0x02b1, 0x1290, 0x22f3, 0x32d2,
0x4235, 0x5214, 0x6277, 0x7256, 0xb5ea, 0xa5cb,
0x95a8, 0x8589, 0xf56e, 0xe54f, 0xd52c, 0xc50d,
0x34e2, 0x24c3, 0x14a0, 0x0481, 0x7466, 0x6447,
0x5424, 0x4405, 0xa7db, 0xb7fa, 0x8799, 0x97b8,
0xe75f, 0xf77e, 0xc71d, 0xd73c, 0x26d3, 0x36f2,
0x0691, 0x16b0, 0x6657, 0x7676, 0x4615, 0x5634,
0xd94c, 0xc96d, 0xf90e, 0xe92f, 0x99c8, 0x89e9,
0xb98a, 0xa9ab, 0x5844, 0x4865, 0x7806, 0x6827,
0x18c0, 0x08e1, 0x3882, 0x28a3, 0xcb7d, 0xdb5c,
0xeb3f, 0xfb1e, 0x8bf9, 0x9bd8, 0xabbb, 0xbb9a,
0x4a75, 0x5a54, 0x6a37, 0x7a16, 0x0af1, 0x1ad0,
0x2ab3, 0x3a92, 0xfd2e, 0xed0f, 0xdd6c, 0xcd4d,
0xbdaa, 0xad8b, 0x9de8, 0x8dc9, 0x7c26, 0x6c07,
0x5c64, 0x4c45, 0x3ca2, 0x2c83, 0x1ce0, 0x0cc1,
0xef1f, 0xff3e, 0xcf5d, 0xdf7c, 0xaf9b, 0xbfba,
0x8fd9, 0x9ff8, 0x6e17, 0x7e36, 0x4e55, 0x5e74,
0x2e93, 0x3eb2, 0x0ed1, 0x1ef0
};
public static unsafe ushort CRC(byte* it, byte* end)
{
    ushort crc = 0;
    ushort temp;
    do
    {
        temp = (ushort)((*it ^ (crc >> 8)) & 0xff);
        crc = (ushort)(crc_table[temp] ^ (crc << 8));
    }
    while (it++ != end);
    return crc;
}
