SENSOR_FRAME_DURATION not changing to 1/60 - android-ndk

I am trying to change SENSOR_FRAME_DURATION from 1/30 to 1/60.
First I set CONTROL_AE_MODE to CONTROL_AE_MODE_OFF:
uint8_t aeMode = ACAMERA_CONTROL_AE_MODE_OFF;
ACaptureRequest_setEntry_u8(capture_request, ACAMERA_CONTROL_AE_MODE, 1, &aeMode);
Then I update the sensor frame duration, exposure time and sensitivity:
int64_t framedur = 16666666;
ACaptureRequest_setEntry_i64(capture_request, ACAMERA_SENSOR_FRAME_DURATION, 1, &framedur);
int64_t expTime = 83222;
ACaptureRequest_setEntry_i64(capture_request, ACAMERA_SENSOR_EXPOSURE_TIME, 1, &expTime);
int32_t sensitivity = 56;
ACaptureRequest_setEntry_i32(capture_request, ACAMERA_SENSOR_SENSITIVITY, 1, &sensitivity);
If I read these entries back from the request immediately afterwards, I get the values I just set.
But in the onCaptureCompleted callback, SENSOR_FRAME_DURATION is 33323892 and not 16666666.
Can anyone suggest why this can happen?
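For reference, the check in onCaptureCompleted looks roughly like this (a minimal sketch; the callback registration and error handling live elsewhere, and the local names are illustrative):
#include <camera/NdkCameraCaptureSession.h>
#include <camera/NdkCameraMetadata.h>

// Capture-result callback: read back the frame duration the HAL actually applied.
static void onCaptureCompleted(void* context, ACameraCaptureSession* session,
                               ACaptureRequest* request, const ACameraMetadata* result) {
    ACameraMetadata_const_entry entry = {0};
    if (ACameraMetadata_getConstEntry(result, ACAMERA_SENSOR_FRAME_DURATION, &entry) == ACAMERA_OK
            && entry.count > 0) {
        int64_t appliedFrameDur = entry.data.i64[0]; // reports 33323892 here instead of 16666666
    }
}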

Related

Search over an array of 14 integers, build a mask and return the match on ARMv8a using NEON

For my open source project cachegrand we are implementing AARCH64 support and, although most of the port is complete, we are still sorting out a feature that performs an accelerated array search using NEON instructions.
The logic we use is pretty simple:
- the input is an array of 14 uint32 elements, the value to find and a mask to ignore certain matches
- the code has to find any value that matches a specific uint32
- build a bitmask
- the least significant bits of the bitmask match the beginning of the array
- the bitmask is then ANDed with the skip indices mask
- and then the trailing zeros are counted to determine the index of the first occurrence
It's very rare that the skip indices mask is actually used; I would say that in 99.9% of cases it will be zero.
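For clarity, here is a plain scalar version of that logic (a sketch with illustrative names; like the vectorized versions below, it assumes there is always at least one non-skipped match, since __builtin_ctz(0) is undefined):
static inline uint8_t hash_search_14_scalar(
        uint32_t hash,
        const uint32_t *hashes,
        uint32_t skip_indexes_mask) {
    uint32_t match_mask = 0;
    for (int index = 0; index < 14; index++) {
        if (hashes[index] == hash) {
            match_mask |= 1u << index;  // least significant bit = first element of the array
        }
    }
    // drop the matches we were told to ignore, then take the index of the first remaining one
    return (uint8_t)__builtin_ctz(match_mask & ~skip_indexes_mask);
}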
I have come up with the following implementation, but I have no experience with ARMv8 NEON instructions and it feels a bit clunky, so I was wondering if there is a way to make it faster and/or better.
For reference, currently the code is compiled only with GCC.
uint8_t hashtable_mcmp_support_hash_search_armv8a_neon_14(
        uint32_t hash,
        volatile uint32_t* hashes,
        uint32_t skip_indexes_mask) {
    uint32x4_t tmp;
    uint32_t compacted_result_mask = 0;
    uint32_t skip_indexes_mask_inv = ~skip_indexes_mask;
    static const int32x4_t shift = {0, 1, 2, 3};
    uint32x4_t cmp_vector = vdupq_n_u32(hash);

    uint32x4_t ring_vector_0_3 = vld1q_u32((hashtable_hash_half_t*)hashes + 0);
    uint32x4_t cmp_vector_0_3 = vceqq_u32(ring_vector_0_3, cmp_vector);
    tmp = vshrq_n_u32(cmp_vector_0_3, 31);
    compacted_result_mask |= vaddvq_u32(vshlq_u32(tmp, shift)) << 0;

    uint32x4_t ring_vector_4_7 = vld1q_u32((hashtable_hash_half_t*)hashes + 4);
    uint32x4_t cmp_vector_4_7 = vceqq_u32(ring_vector_4_7, cmp_vector);
    tmp = vshrq_n_u32(cmp_vector_4_7, 31);
    compacted_result_mask |= vaddvq_u32(vshlq_u32(tmp, shift)) << 4;

    uint32x4_t ring_vector_8_11 = vld1q_u32((hashtable_hash_half_t*)hashes + 8);
    uint32x4_t cmp_vector_8_11 = vceqq_u32(ring_vector_8_11, cmp_vector);
    tmp = vshrq_n_u32(cmp_vector_8_11, 31);
    compacted_result_mask |= vaddvq_u32(vshlq_u32(tmp, shift)) << 8;

    uint32x4_t ring_vector_10_13 = vld1q_u32((hashtable_hash_half_t*)hashes + 10);
    uint32x4_t cmp_vector_10_13 = vceqq_u32(ring_vector_10_13, cmp_vector);
    tmp = vshrq_n_u32(cmp_vector_10_13, 31);
    compacted_result_mask |= vaddvq_u32(vshlq_u32(tmp, shift)) << 10;

    return __builtin_ctz(compacted_result_mask & skip_indexes_mask_inv);
}
Just for reference, here is the AVX2 code:
static inline uint8_t hashtable_mcmp_support_hash_search_avx2_14(
        uint32_t hash,
        volatile uint32_t* hashes,
        uint32_t skip_indexes_mask) {
    uint32_t compacted_result_mask = 0;
    uint32_t skip_indexes_mask_inv = ~skip_indexes_mask;
    __m256i cmp_vector = _mm256_set1_epi32(hash);

    // The second load, load from the 6th uint32 to the 14th uint32, _mm256_loadu_si256 always loads 8 x uint32
    for(uint8_t base_index = 0; base_index < 12; base_index += 6) {
        __m256i ring_vector = _mm256_loadu_si256((__m256i*) (hashes + base_index));
        __m256i result_mask_vector = _mm256_cmpeq_epi32(ring_vector, cmp_vector);

        // Uses _mm256_movemask_ps to reduce the bandwidth
        compacted_result_mask |= (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(result_mask_vector)) << (base_index);
    }

    return _tzcnt_u32(compacted_result_mask & skip_indexes_mask_inv);
}
On a side question, do you think it's worth implementing support for SVE2 instructions? Especially taking into account that this is a pretty simple operation, and it looks like there might not be mandatory support for 256-bit registers (which would probably be the biggest benefit of using SVE2 in this specific context).
Booleans don't need 32 bits each: shrink them to 8 bits ASAP with vuzp1 and vmovn prior to doing further operations.
uint8_t hashtable_mcmp_support_hash_search_armv8a_neon_14(
    uint32_t hash,
    volatile uint32_t* hashes,
    uint32_t skip_indexes_mask)
{
    uint16x8_t tmp16a, tmp16b;
    uint8x8_t tmp8a, tmp8b;
    uint32_t tmp;
    static const uint8x8_t mask = {1, 2, 4, 8, 16, 32, 64, 128};
    uint32x4_t cmp_vector = vdupq_n_u32(hash);
    uint32x4x3_t ring_vector_0_11 = vld1q_u32_x3((uint32_t *)hashes);
    uint32x4_t ring_vector_10_13 = vld1q_u32((uint32_t *)hashes + 10);

    ring_vector_0_11.val[0] = vceqq_u32(ring_vector_0_11.val[0], cmp_vector);
    ring_vector_0_11.val[1] = vceqq_u32(ring_vector_0_11.val[1], cmp_vector);
    ring_vector_0_11.val[2] = vceqq_u32(ring_vector_0_11.val[2], cmp_vector);
    ring_vector_10_13 = vceqq_u32(ring_vector_10_13, cmp_vector);

    tmp16a = vuzp1q_u16(vreinterpretq_u16_u32(ring_vector_0_11.val[0]), vreinterpretq_u16_u32(ring_vector_0_11.val[1]));
    tmp16b = vuzp1q_u16(vreinterpretq_u16_u32(ring_vector_0_11.val[2]), vreinterpretq_u16_u32(ring_vector_10_13));
    tmp8a = vmovn_u16(tmp16a);
    tmp8b = vmovn_u16(tmp16b);
    tmp8a = vand_u8(tmp8a, mask);
    tmp8b = vand_u8(tmp8b, mask);
    tmp = (uint32_t)vaddv_u8(tmp8a) | ((uint32_t)vaddv_u8(tmp8b) << 8);

    return __builtin_ctz(tmp & ~skip_indexes_mask);
}
And I don't think SVE will bring a meaningful performance boost, since the performance is more or less crippled at the end anyway (vaddv and especially the transfer to general-purpose ARM registers).
If you are dealing with thousands of 14-entry arrays, you should consider redesigning your function to write into an 8-bit array instead of returning in an ARM register each and every time. That will eliminate the most time-consuming pipeline hazard, caused by the NEON-to-ARM transfer.
#include <arm_neon.h>
#include <arm_acle.h>
void hashtable_mcmp_support_hash_search_armv8a_neon_14_b(
    uint8_t *pDst,
    uint32_t hash,
    volatile uint32_t* hashes,
    uint32_t skip_indexes_mask, uint32_t number_of_arrays)
{
    uint16x8_t tmp16a, tmp16b;
    uint16x4_t tmp;
    uint8x8_t tmp8a, tmp8b;
    static const uint8x8_t mask = {128, 64, 32, 16, 8, 4, 2, 1};
    uint32x4_t cmp_vector = vdupq_n_u32(hash);
    skip_indexes_mask = __rbit(skip_indexes_mask) >> 16;
    uint16x4_t index_mask = vdup_n_u16((uint16_t) skip_indexes_mask);
    uint32x4x4_t ring_vector;

    while (number_of_arrays--)
    {
        ring_vector = vld1q_u32_x4((uint32_t *)hashes);
        hashes += 16;

        ring_vector.val[0] = vceqq_u32(ring_vector.val[0], cmp_vector);
        ring_vector.val[1] = vceqq_u32(ring_vector.val[1], cmp_vector);
        ring_vector.val[2] = vceqq_u32(ring_vector.val[2], cmp_vector);
        ring_vector.val[3] = vceqq_u32(ring_vector.val[3], cmp_vector);

        tmp16a = vuzp1q_u16(vreinterpretq_u16_u32(ring_vector.val[0]), vreinterpretq_u16_u32(ring_vector.val[1]));
        tmp16b = vuzp1q_u16(vreinterpretq_u16_u32(ring_vector.val[2]), vreinterpretq_u16_u32(ring_vector.val[3]));
        tmp8a = vmovn_u16(tmp16a);
        tmp8b = vmovn_u16(tmp16b);
        tmp8a = vand_u8(tmp8a, mask);
        tmp8b = vand_u8(tmp8b, mask);
        tmp8a[1] = vaddv_u8(tmp8a);
        tmp8a[0] = vaddv_u8(tmp8b);

        tmp = vbic_u16(vreinterpret_u16_u8(tmp8a), index_mask);
        tmp = vclz_u16(tmp);
        vst1_lane_u8(pDst++, vreinterpret_u8_u16(tmp), 0);
    }
}
Above is an "improved" version.
It assumes the arrays are laid out contiguously in memory with 8 bytes of padding each, which is preferable for cache efficiency unless the memory requirement is a problem.
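As an illustration, a hypothetical per-array layout matching that assumption (14 entries plus two padding slots, 64 bytes per block) could look like this:
typedef struct {
    uint32_t hashes[14];   // the 14 entries searched by the function
    uint32_t padding[2];   // 8 bytes of padding so each block spans a full 64 bytes
} hashes_block_t;          // illustrative name, not part of the original code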
Instead of returning an 8-bit result, it writes the results into memory directly, avoiding the pipeline hazards caused by the NEON-to-ARM transfer.
It still suffers from the vaddv latency (8 cycles). You can unroll the loop so that it processes 2 or even 4 arrays per iteration in order to hide that latency, as sketched below.
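A sketch of such a 2x unroll (the function name is made up, and it assumes number_of_arrays is even; an odd tail would need the single-array loop above):
#include <arm_neon.h>
#include <arm_acle.h>

void hashtable_mcmp_support_hash_search_armv8a_neon_14_b_x2(
    uint8_t *pDst,
    uint32_t hash,
    volatile uint32_t* hashes,
    uint32_t skip_indexes_mask, uint32_t number_of_arrays)
{
    uint16x8_t tmp16a, tmp16b, tmp16c, tmp16d;
    uint16x4_t tmp_a, tmp_b;
    uint8x8_t tmp8a, tmp8b, tmp8c, tmp8d;
    static const uint8x8_t mask = {128, 64, 32, 16, 8, 4, 2, 1};
    uint32x4_t cmp_vector = vdupq_n_u32(hash);
    skip_indexes_mask = __rbit(skip_indexes_mask) >> 16;
    uint16x4_t index_mask = vdup_n_u16((uint16_t) skip_indexes_mask);

    // Two arrays (2 x 16 uint32, padding included) per iteration.
    while (number_of_arrays >= 2)
    {
        number_of_arrays -= 2;
        uint32x4x4_t ring_a = vld1q_u32_x4((uint32_t *)hashes);
        uint32x4x4_t ring_b = vld1q_u32_x4((uint32_t *)hashes + 16);
        hashes += 32;

        ring_a.val[0] = vceqq_u32(ring_a.val[0], cmp_vector);
        ring_a.val[1] = vceqq_u32(ring_a.val[1], cmp_vector);
        ring_a.val[2] = vceqq_u32(ring_a.val[2], cmp_vector);
        ring_a.val[3] = vceqq_u32(ring_a.val[3], cmp_vector);
        ring_b.val[0] = vceqq_u32(ring_b.val[0], cmp_vector);
        ring_b.val[1] = vceqq_u32(ring_b.val[1], cmp_vector);
        ring_b.val[2] = vceqq_u32(ring_b.val[2], cmp_vector);
        ring_b.val[3] = vceqq_u32(ring_b.val[3], cmp_vector);

        tmp16a = vuzp1q_u16(vreinterpretq_u16_u32(ring_a.val[0]), vreinterpretq_u16_u32(ring_a.val[1]));
        tmp16b = vuzp1q_u16(vreinterpretq_u16_u32(ring_a.val[2]), vreinterpretq_u16_u32(ring_a.val[3]));
        tmp16c = vuzp1q_u16(vreinterpretq_u16_u32(ring_b.val[0]), vreinterpretq_u16_u32(ring_b.val[1]));
        tmp16d = vuzp1q_u16(vreinterpretq_u16_u32(ring_b.val[2]), vreinterpretq_u16_u32(ring_b.val[3]));

        tmp8a = vand_u8(vmovn_u16(tmp16a), mask);
        tmp8b = vand_u8(vmovn_u16(tmp16b), mask);
        tmp8c = vand_u8(vmovn_u16(tmp16c), mask);
        tmp8d = vand_u8(vmovn_u16(tmp16d), mask);

        // The two vaddv chains below are independent, so their latency can overlap.
        tmp8a[1] = vaddv_u8(tmp8a);
        tmp8a[0] = vaddv_u8(tmp8b);
        tmp8c[1] = vaddv_u8(tmp8c);
        tmp8c[0] = vaddv_u8(tmp8d);

        tmp_a = vclz_u16(vbic_u16(vreinterpret_u16_u8(tmp8a), index_mask));
        tmp_b = vclz_u16(vbic_u16(vreinterpret_u16_u8(tmp8c), index_mask));
        vst1_lane_u8(pDst++, vreinterpret_u8_u16(tmp_a), 0);
        vst1_lane_u8(pDst++, vreinterpret_u8_u16(tmp_b), 0);
    }
}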

Vulkan: Vertex Buffer doesn't get sent to vertex shader

I am learning Vulkan and started having a problem where no vertices would get displayed.
After analyzing my program with RenderDoc (https://renderdoc.org/builds),
I realized that the buffer containing the vertex and index information contains the right values,
with the index data at the end of the same buffer.
The problem is that when I check the data that is transmitted to the vertex shader, it is empty.
Here is the command buffer section where it is supposed to send the data to the shader:
VkDeviceSize indicesOffset = sizeof(Vertex) * this->nbVertices;
VkDeviceSize offsets[] = {0};
vkCmdBindVertexBuffers(commandBuffers[i], 0, 1, &this->vertexBuffer, offsets);
vkCmdBindIndexBuffer(commandBuffers[i], this->vertexBuffer, indicesOffset, VK_INDEX_TYPE_UINT32);

for(size_t j = 0 ; j < this->models.size() ; j++){
    Model *model = this->models[j];
    uint32_t modelDynamicOffset = j * static_cast<uint32_t>(this->uniformDynamicAlignment);
    VkDescriptorSet* modelDescriptorSet = model->getDescriptorSet(i);

    vkCmdBindDescriptorSets(this->commandBuffers[i], VK_PIPELINE_BIND_POINT_GRAPHICS, pipelineLayout, 0, 1, modelDescriptorSet, 1, &modelDynamicOffset);
    vkCmdDrawIndexed(commandBuffers[i], this->nbIndices, 1, 0, indicesOffset, 0);
}
Also, here is how I create the vertex buffer:
void Application::createVertexBuffers() {
    for(Model *model : this->models){
        for(Vertex vertex : model->getVertices()){
            vertices.push_back(vertex);
        }

        for(uint32_t index : model->getIndices()){
            indices.push_back(index);
        }
    }

    VkDeviceSize vertexBufferSize = sizeof(vertices[0]) * vertices.size();
    VkDeviceSize indexBufferSize = sizeof(uint32_t) * indices.size();

    this->nbVertices = vertices.size();
    this->nbIndices = indices.size();

    VkBuffer stagingBuffer;
    VkDeviceMemory stagingBufferMemory;

    //To CPU
    this->createBuffer(vertexBufferSize + indexBufferSize,
        VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
        VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
        stagingBuffer,
        stagingBufferMemory);

    void *data;
    vkMapMemory(device, stagingBufferMemory, 0, vertexBufferSize, 0, &data);
    memcpy(data, vertices.data(), (size_t)vertexBufferSize);
    vkUnmapMemory(device, stagingBufferMemory);

    //Add the index data after the vertex data
    vkMapMemory(device, stagingBufferMemory, vertexBufferSize, indexBufferSize, 0, &data);
    memcpy(data, indices.data(), (size_t)indexBufferSize);
    vkUnmapMemory(device, stagingBufferMemory);

    //To GPU
    this->createBuffer(vertexBufferSize + indexBufferSize,
        VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT,
        VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
        this->vertexBuffer,
        this->vertexBufferMemory);

    this->copyBuffer(stagingBuffer, this->vertexBuffer, vertexBufferSize + indexBufferSize);

    vkDestroyBuffer(device, stagingBuffer, nullptr);
    vkFreeMemory(device, stagingBufferMemory, nullptr);
}
If you need more information to help me solve my problem, please tell me.
Thank you.
The indices that RenderDoc reports for the render are a bit high.
You pass indicesOffset as vertexOffset in your draw command, which is:
vertexOffset is the value added to the vertex index before indexing into the vertex buffer.
So replace that with 0 and you should get your proper vertices again.
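In code, only the vertexOffset argument changes; the index-buffer offset stays in vkCmdBindIndexBuffer (a sketch based on the loop above):
// indicesOffset is already applied by vkCmdBindIndexBuffer, so vertexOffset must be 0
vkCmdDrawIndexed(commandBuffers[i], this->nbIndices, 1, 0, 0, 0);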

XLL command (not UDF): xlSet fails to set multiple values

I'm pretty new to XLL programming but I believe I've done my homework on this little problem. I'm trying to set multiple cell values using xlSet but xlSet just duplicates the first value of my array, as if I had passed it a single reference. If instead I consecutively call xlSet for each cell individually, it works. But it's ugly and, no doubt, slow.
I know xlSet can't be called from a UDF. This isn't a UDF. Elsewhere I saw that somebody had a struct alignment problem in the xlMulti, but I'm sending the same xlMulti back to Excel, so that's not a problem. (I tried the /Zp8 compiler switch, anyway.)
I started with the SDK Framework stuff, e.g., TempActiveRef(1,1,0,2), and then replaced those calls with the more direct XLCALL.H stuff, mainly to increase my chances of getting responses here.
Office 2010, VSTO 2010, Win7 64bit SP1.
Here's what works and what doesn't:
__declspec(dllexport) int WINAPI testCmd(void)
{
    XLOPER12 ref, xValsInMulti, xResSet;
    XLMREF12 mref;
    int i, res;

    // Build an xltypeRef XLOPER12 that points to three cells, A2:C2
    res = Excel12(xlSheetId, &ref, 0);
    if (res != xlretSuccess) return 0;
    ref.xltype = xltypeRef;
    ref.val.mref.lpmref = &mref;
    mref.count = 1;
    mref.reftbl[0].rwFirst = mref.reftbl[0].rwLast = 1;
    mref.reftbl[0].colFirst = 0;
    mref.reftbl[0].colLast = 2;

    // Fetch the cell values into an xltypeMulti.
    // This works. Returns 0. And xValsInMulti.type becomes xlTypeMulti
    res = Excel12(xlCoerce, &xValsInMulti, 1, &ref);

    // Change cell reference to the next row (A3:C3)
    mref.reftbl[0].rwFirst = mref.reftbl[0].rwLast = 2;

    // Attempt to set the values. Doesn't work. All cells become value of A2.
    Excel12(xlSet, &xResSet, 2, &ref, xValsInMulti);
    Excel12(xlcAlert, 0, 1, &xResSet); // Displays "TRUE"

    // Try again (in the next row) setting each cell individually. This works.
    mref.reftbl[0].rwFirst = mref.reftbl[0].rwLast = 3;
    for (i = 0; i < 3; i++)
    {
        mref.reftbl[0].colFirst = mref.reftbl[0].colLast = i;
        Excel12(
            xlSet, &xResSet, 2, &ref, xValsInMulti.val.array.lparray + i
        );
        Excel12(xlcAlert, 0, 1, &xResSet); // Displays "TRUE"
    }

    Excel12(xlFree, 0, 1, &xValsInMulti);
    return 1;
}

A simple Vertex Buffer Object (C++) that doesn't render

I'm trying to use VBOs to render just a normal 2D textured square onto an FBO. Immediate mode functions work flawlessly, but not this VBO. GL_TEXTURE_2D is already enabled for the code. What is wrong with it?
unsigned int VBOid = 0;
unsigned int Iid = 0;
float *geometry;
unsigned int *indices;
int num_geometry = 1;
int num_vertices = 4;
int num_indices = num_geometry*num_vertices;
geometry = new float[num_geometry*num_vertices*4];
indices = new unsigned int[num_indices];
indices[0] = 0;
indices[1] = 1;
indices[2] = 2;
indices[3] = 3;
/* Fill geometry: 0, 1, = vertex_xy
* 2, 3 = tex_coord_uv
*/
geometry[0] = 0.0f;
geometry[1] = 0.0f;
geometry[2] = 0.0f;
geometry[3] = 0.0f;
geometry[4] = 50.0f;
geometry[5] = 0.0f;
geometry[6] = 1.0f;
geometry[7] = 0.0f;
geometry[8] = 50.0f;
geometry[9] = 50.0f;
geometry[10] = 1.0f;
geometry[11] = 1.0f;
geometry[12] = 0.0f;
geometry[13] = 50.0f;
geometry[14] = 0.0f;
geometry[15] = 1.0f;
glGenBuffers(1, &VBOid);
glBindBuffer(GL_ARRAY_BUFFER, VBOid);
glBufferData(GL_ARRAY_BUFFER, sizeof(geometry), geometry, GL_STATIC_DRAW);
glGenBuffers(1, &Iid);
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, Iid);
glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(indices), indices, GL_STATIC_DRAW);
//GL_TEXTURE_2D is already enabled here
//Buffers are already bound from above
glBindTexture( GL_TEXTURE_2D, 2); //I used 2 just to test whether it renders a texture correctly. Yes, 2 does exist in my program, that's why I arbitrarily used it
//glClientActiveTexture(GL_TEXTURE0); I don't know what this is for or where to put it
glEnableClientState(GL_TEXTURE_COORD_ARRAY);
//glActiveTexture(GL_TEXTURE0); same here, I don't know what this is for or where to put it
glVertexPointer(2, GL_FLOAT, sizeof(GLfloat)*4, 0);
glTexCoordPointer(2, GL_FLOAT, sizeof(GLfloat)*4, (float*)(sizeof(GLfloat)*2));
glDrawElements(GL_QUADS, num_indices, GL_UNSIGNED_INT, indices);
glDisableClientState(GL_VERTEX_ARRAY);
glDisableClientState(GL_TEXTURE_COORD_ARRAY);
glBindBuffer(GL_ARRAY_BUFFER, 0);
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
The problem is your usage of sizeof(geometry) (and the same for indices) inside the glBufferData calls. Those variables are actually just pointers, regardless of the fact that they point to dynamically allocated arrays (which the compiler doesn't know about). So you will always get the size of a pointer (4 or 8 bytes, depending on the platform).
Replace sizeof(geometry) with num_geometry*num_vertices*4*sizeof(float) and sizeof(indices) with num_indices*sizeof(unsigned int). In fact you don't need any indices here at all and can just draw the whole thing with a simple
glDrawArrays(GL_QUADS, 0, 4);
Always be aware of the differences between an actual (compile-time sized) array and a mere pointer to a dynamically allocated array; the result of the sizeof operator is one of those differences (and the requirement to free the latter with delete[] at some later point in time is another, no less important, difference).
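Concretely, the two upload calls from the question would become something like this (a sketch using the element counts already defined above):
glBufferData(GL_ARRAY_BUFFER, num_geometry * num_vertices * 4 * sizeof(float), geometry, GL_STATIC_DRAW);
glBufferData(GL_ELEMENT_ARRAY_BUFFER, num_indices * sizeof(unsigned int), indices, GL_STATIC_DRAW);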

OpenCL image2d_t writing mostly zeros

I am trying to use OpenCL and image2d_t objects to speed up image convolution. When I noticed that the output was a blank image of all zeros, I simplified the OpenCL kernel to a basic read from the input and write to the output (shown below). With a little bit of tweaking, I got it to write a few scattered pixels of the image into the output image.
I have verified that the image is intact up until the call to read_imageui() in the OpenCL kernel. I wrote the image to GPU memory with CommandQueue::enqueueWriteImage() and immediately read it back into a brand new buffer in CPU memory with CommandQueue::enqueueReadImage(). The result of this call matched the original input image. However, when I retrieve the pixels with read_imageui() in the kernel, the vast majority of the pixels are set to 0.
C++ source:
int height = 112;
int width = 9216;
unsigned int numPixels = height * width;
unsigned int numInputBytes = numPixels * sizeof(uint16_t);
unsigned int numDuplicatedInputBytes = numInputBytes * 4;
unsigned int numOutputBytes = numPixels * sizeof(int32_t);
cl::size_t<3> origin;
origin.push_back(0);
origin.push_back(0);
origin.push_back(0);
cl::size_t<3> region;
region.push_back(width);
region.push_back(height);
region.push_back(1);
std::ifstream imageFile("hri_vis_scan.dat", std::ifstream::binary);
checkErr(imageFile.is_open() ? CL_SUCCESS : -1, "hri_vis_scan.dat");
uint16_t *image = new uint16_t[numPixels];
imageFile.read((char *) image, numInputBytes);
imageFile.close();
// duplicate our single channel image into all 4 channels for Image2D
cl_ushort4 *imageDuplicated = new cl_ushort4[numPixels];
for (int i = 0; i < numPixels; i++)
    for (int j = 0; j < 4; j++)
        imageDuplicated[i].s[j] = image[i];
cl::Buffer imageBufferOut(context, CL_MEM_WRITE_ONLY, numOutputBytes, NULL, &err);
checkErr(err, "Buffer::Buffer()");
cl::ImageFormat inFormat;
inFormat.image_channel_data_type = CL_UNSIGNED_INT16;
inFormat.image_channel_order = CL_RGBA;
cl::Image2D bufferIn(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, inFormat, width, height, 0, imageDuplicated, &err);
checkErr(err, "Image2D::Image2D()");
cl::ImageFormat outFormat;
outFormat.image_channel_data_type = CL_UNSIGNED_INT16;
outFormat.image_channel_order = CL_RGBA;
cl::Image2D bufferOut(context, CL_MEM_WRITE_ONLY, outFormat, width, height, 0, NULL, &err);
checkErr(err, "Image2D::Image2D()");
int32_t *imageResult = new int32_t[numPixels];
memset(imageResult, 0, numOutputBytes);
cl_int4 *imageResultDuplicated = new cl_int4[numPixels];
for (int i = 0; i < numPixels; i++)
    for (int j = 0; j < 4; j++)
        imageResultDuplicated[i].s[j] = 0;
std::ifstream kernelFile("convolutionKernel.cl");
checkErr(kernelFile.is_open() ? CL_SUCCESS : -1, "convolutionKernel.cl");
std::string imageProg(std::istreambuf_iterator<char>(kernelFile), (std::istreambuf_iterator<char>()));
cl::Program::Sources imageSource(1, std::make_pair(imageProg.c_str(), imageProg.length() + 1));
cl::Program imageProgram(context, imageSource);
err = imageProgram.build(devices, "");
checkErr(err, "Program::build()");
cl::Kernel basic(imageProgram, "basic", &err);
checkErr(err, "Kernel::Kernel()");
basic.setArg(0, bufferIn);
basic.setArg(1, bufferOut);
basic.setArg(2, imageBufferOut);
queue.finish();
cl_ushort4 *imageDuplicatedTest = new cl_ushort4[numPixels];
for (int i = 0; i < numPixels; i++)
{
    imageDuplicatedTest[i].s[0] = 0;
    imageDuplicatedTest[i].s[1] = 0;
    imageDuplicatedTest[i].s[2] = 0;
    imageDuplicatedTest[i].s[3] = 0;
}
double gpuTimer = clock();
err = queue.enqueueReadImage(bufferIn, CL_FALSE, origin, region, 0, 0, imageDuplicatedTest, NULL, NULL);
checkErr(err, "CommandQueue::enqueueReadImage()");
// Output from above matches input image
err = queue.enqueueNDRangeKernel(basic, cl::NullRange, cl::NDRange(height, width), cl::NDRange(1, 1), NULL, NULL);
checkErr(err, "CommandQueue::enqueueNDRangeKernel()");
queue.flush();
err = queue.enqueueReadImage(bufferOut, CL_TRUE, origin, region, 0, 0, imageResultDuplicated, NULL, NULL);
checkErr(err, "CommandQueue::enqueueReadImage()");
queue.flush();
err = queue.enqueueReadBuffer(imageBufferOut, CL_TRUE, 0, numOutputBytes, imageResult, NULL, NULL);
checkErr(err, "CommandQueue::enqueueReadBuffer()");
queue.finish();
OpenCL kernel:
__kernel void basic(__read_only image2d_t input, __write_only image2d_t output, __global int *result)
{
    const sampler_t smp = CLK_NORMALIZED_COORDS_TRUE | //Natural coordinates
                          CLK_ADDRESS_NONE | //Clamp to zeros
                          CLK_FILTER_NEAREST; //Don't interpolate

    int2 coord = (get_global_id(1), get_global_id(0));
    uint4 pixel = read_imageui(input, smp, coord);

    result[coord.s0 + coord.s1 * 9216] = pixel.s0;
    write_imageui(output, coord, pixel);
}
The coordinates in the kernel are currently mapped to (x, y) = (width, height).
The input image is a single-channel greyscale image with 16 bits per pixel, which is why I had to duplicate the channels to fit into OpenCL's Image2D. The output after convolution will be 32 bits per pixel, which is why numOutputBytes is set to that. Also, although the width and height appear weird, the input image's dimensions are 9216x7824, so I'm only taking a portion of it to test the code first so that it doesn't take forever.
I added in a write to global memory after reading from the image in the kernel to see if the issue was reading the image or writing the image. After the kernel executes, this section of global memory also contains mostly zeros.
Any help would be greatly appreciated!
The documentation for read_imageui states that
Furthermore, the read_imagei and read_imageui calls that take integer coordinates must use a sampler with normalized coordinates set to CLK_NORMALIZED_COORDS_FALSE and addressing mode set to CLK_ADDRESS_CLAMP_TO_EDGE, CLK_ADDRESS_CLAMP or CLK_ADDRESS_NONE; otherwise the values returned are undefined.
But you're creating a sampler with CLK_NORMALIZED_COORDS_TRUE (while seemingly passing in non-normalized coords).
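A sampler that satisfies that requirement for the integer coordinates used in the kernel would look roughly like this (a sketch; the clamp-to-edge addressing mode is just one of the allowed options):
const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | // integer (non-normalized) coordinates
                      CLK_ADDRESS_CLAMP_TO_EDGE |   // an addressing mode allowed with integer coords
                      CLK_FILTER_NEAREST;           // don't interpolate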
