Is there a way to perform signed division in eBPF? - linux

I am trying to perform signed division in eBPF but llvm is throwing unsupported error. Is there a way to perform signed division in any other way (direct/indirect) in eBPF?

eBPF doesn't have a signed division instruction in its instruction set.
You can still work around it though. Signed division is nothing more than preserving the XOR of the two sides. Meaning, a output is negative if one or the other is negative, but dividing a negative by a negative number gives a positive back.
This is what I came up with:
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
int32_t sdiv(int32_t a, int32_t b) {
bool aneg = a < 0;
bool bneg = b < 0;
// get the absolute positive value of both
uint32_t adiv = aneg ? -a : a;
uint32_t bdiv = bneg ? -b : b;
// Do udiv
uint32_t out = adiv / bdiv;
// Make output negative if one or the other is negative, not both
return aneg != bneg ? -out : out;
}
int main()
{
printf("%d\n", sdiv(100, 5));
printf("%d\n", sdiv(-100, 5));
printf("%d\n", sdiv(100, -5));
printf("%d\n", sdiv(-100, -5));
return 0;
}
I am sure there are better ways to do it, but this seems to work.

Related

How to edit ELF files in Linux?

I have written two similar C programs. How can I make the outputs of both code same by editing one of the ELF files not the actual code?
/**
* prg1.c
*/
#include <stdio.h>
#include <stdlib.h>
int main()
{
int a = 5;
int b = 10;
int sum;
sum = a + b;
printf("sum is %d\n", sum);
return(0);
}
/**
* prg2.c
*/
#include <stdio.h>
#include <stdlib.h>
int main()
{
int a = 5;
int b = 20;
int sum;
sum = a + b;
printf("sum is %d\n", sum);
return(0);
}
In your second program's elf file find the occurrence of 20 and change it to 10.
To do that you can do something like this -
Find 14 (hex of 20) in your elf file and change it to A and making sure length is same by adding extra 0.
To do this you can use any elf editor, I use 'Hex Fiend' for mac.

CUDA Programming: Compilation Error

I am making a CUDA program that implements the data parallel prefix sum calculation operating upon N numbers. My code is also supposed to generate the numbers on the host using a random number generator. However, I seem to always run into a "unrecognized token" and "expected a declaration" error on the ending bracket of int main when attempting to compile. I am running the code on Linux.
#include <stdio.h>
#include <cuda.h>
#include <stdlib.h>
#include <math.h>
__global__ void gpu_cal(int *a,int i, int n) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if(tid>=i && tid < n) {
a[tid] = a[tid]+a[tid-i];
}
}
int main(void)
{
int key;
int *dev_a;
int N=10;//size of 1D array
int B=1;//blocks in the grid
int T=10;//threads in a block
do{
printf ("Some limitations:\n");
printf (" Maximum number of threads per block = 1024\n");
printf (" Maximum sizes of x-dimension of thread block = 1024\n");
printf (" Maximum size of each dimension of grid of thread blocks = 65535\n");
printf (" N<=B*T\n");
do{
printf("Enter size of array in one dimension, currently %d\n",N);
scanf("%d",&N);
printf("Enter size of blocks in the grid, currently %d\n",B);
scanf("%d",&B);
printf("Enter size of threads in a block, currently %d\n",T);
scanf("%d",&T);
if(N>B*T)
printf("N>B*T, this will result in an incorrect result generated by GPU, please try again\n");
if(T>1024)
printf("T>1024, this will result in an incorrect result generated by GPU, please try again\n");
}while((N>B*T)||(T>1024));
cudaEvent_t start, stop; // using cuda events to measure time
float elapsed_time_ms1, elapsed_time_ms3;
int a[N],gpu_result[N];//for result generated by GPU
int cpu_result[N];//CPU result
cudaMalloc((void**)&dev_a,N * sizeof(int));//allocate memory on GPU
int i,j;
srand(1); //initialize random number generator
for (i=0; i < N; i++) // load array with some numbers
a[i] = (int)rand() ;
cudaMemcpy(dev_a, a , N*sizeof(int),cudaMemcpyHostToDevice);//load data from host to device
cudaEventCreate(&start); // instrument code to measure start time
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
//GPU computation
for(j=0;j<log(N)/log(2);j++){
gpu_cal<<<B,T>>>(dev_a,pow(2,j),N);
cudaThreadSynchronize();
}
cudaMemcpy(gpu_result,dev_a,N*sizeof(int),cudaMemcpyDeviceToHost);
cudaEventRecord(stop, 0); // instrument code to measue end time
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed_time_ms1, start, stop );
printf("\n\n\nTime to calculate results on GPU: %f ms.\n", elapsed_time_ms1); // print out execution time
//CPU computation
cudaEventRecord(start, 0);
for(i=0;i<N;i++)
{
cpu_result[i]=0;
for(j=0;j<=i;j++)
{
cpu_result[i]=cpu_result[i]+a[j];
}
}
cudaEventRecord(stop, 0); // instrument code to measue end time
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed_time_ms3, start, stop );
printf("Time to calculate results on CPU: %f ms.\n\n", elapsed_time_ms3); // print out execution time
//Error check
for(i=0;i < N;i++) {
if (gpu_result[i] != cpu_result[i] ) {
printf("ERROR!!! CPU and GPU create different answers\n");
break;
}
}
//Calculate speedup
printf("Speedup on GPU compared to CPU= %f\n", (float) elapsed_time_ms3 / (float) elapsed_time_ms1);
printf("\nN=%d",N);
printf("\nB=%d",B);
printf("\nT=%d",T);
printf("\n\n\nEnter '1' to repeat, or other integer to terminate\n");
scanf("%d",&key);
}while(key == 1);
cudaFree(dev_a);//deallocation
return 0;
}​
The very last } in your code is a Unicode character. If you delete this entire line, and retype the }, the error will be gone.
There are two compile errors in your code.
First, Last ending bracket is a unicode character, so you should resave your code as unicode or delete and rewrite the last ending bracket.
Second, int type variable N which used at this line - int a[N],gpu_result[N];//for result generated by GPU
was declared int type, but it's not allowed in c or c++ compiler, so you should change the N declaration as const int N.

C++ 11 std::thread strange behavior

I am experimenting a bit with std::thread and C++11, and I am encountering strange behaviour.
Please have a look at the following code:
#include <cstdlib>
#include <thread>
#include <vector>
#include <iostream>
void thread_sum_up(const size_t n, size_t& count) {
size_t i;
for (i = 0; i < n; ++i);
count = i;
}
class A {
public:
A(const size_t x) : x_(x) {}
size_t sum_up(const size_t num_threads) const {
size_t i;
std::vector<std::thread> threads;
std::vector<size_t> data_vector;
for (i = 0; i < num_threads; ++i) {
data_vector.push_back(0);
threads.push_back(std::thread(thread_sum_up, x_, std::ref(data_vector[i])));
}
std::cout << "Threads started ...\n";
for (i = 0; i < num_threads; ++i)
threads[i].join();
size_t sum = 0;
for (i = 0; i < num_threads; ++i)
sum += data_vector[i];
return sum;
}
private:
const size_t x_;
};
int main(int argc, char* argv[]) {
const size_t x = atoi(argv[1]);
const size_t num_threads = atoi(argv[2]);
A a(x);
std::cout << a.sum_up(num_threads) << std::endl;
return 0;
}
The main idea here is that I want to specify a number of threads which do independent computations (in this case, simple increments).
After all threads are finished, the results should be merged in order to obtain an overall result.
Just to clarify: This is only for testing purposes, in order to get me understand how
C++11 threads work.
However, when compiling this code using the command
g++ -o threads threads.cpp -pthread -O0 -std=c++0x
on a Ubuntu box, I get very strange behaviour, when I execute the resulting binary.
For example:
$ ./threads 1000 4
Threads started ...
Segmentation fault (core dumped)
(should yield the output: 4000)
$ ./threads 100000 4
Threads started ...
200000
(should yield the output: 400000)
Does anybody has an idea what is going on here?
Thank you in advance!
Your code has many problems (see even thread_sum_up for about 2-3 bugs) but the main bug I found by glancing your code is here:
data_vector.push_back(0);
threads.push_back(std::thread(thread_sum_up, x_, std::ref(data_vector[i])));
See, when you push_back into a vector (I'm talking about data_vector), it can move all previous data around in memory. But then you take the address of (reference to) a cell for your thread, and then push back again (making the previous reference invalid)
This will cause you to crash.
For an easy fix - add data_vector.reserve(num_threads); just after creating it.
Edit at your request - some bugs in thread_sum_up
void thread_sum_up(const size_t n, size_t& count) {
size_t i;
for (i = 0; i < n; ++i); // see that last ';' there? means this loop is empty. it shouldn't be there
count = i; // You're just setting count to be i. why do that in a loop? Did you mean +=?
}
The cause of your crash might be that std::ref(data_vector[i]) being invalidated by the next push_back in data_vector. Since you know the number of threads, do a data_vector.reserve(num_threads) before you start spawning off the threads to keep the references from being invalidated.
As you resize the vector with the calls to push_back, it is likely to have to reallocate the storage space, causing the references to the contained values to be invalidated. This causes the thread to write to non-allocated memory, which is undefined behavior.
Your options are to pre-allocate the size you need (vector::reserve is one option), or choose a different container.

Error message: pointer to function

I get an error message saying expression must have (pointer-to-) function type. what am i doing wrong? i just started coding, i know i suck lol. I don't understand how to get the formula for the distance to work.
#include <cmath> //headerfile
#include <iomanip>
#include <iostream>
enter code here
using namespace std;
int main()
{
double d;
double t;
double g;
char choice ='y';
//output numbers to console
while (choice == 'y' || choice =='Y')
{
cout<<"Please input a value for the time"<< endl<<endl;
cin>>t;
g = 32;
d = (g)(t*t);
if (t<0)
cout<<"You cannot have a negative time"<<endl<<endl;
else
cout<<setw(8)<<fixed<<setprecision(2)<<"\n""The distance the ball has fallen is "<<d<<" feet"<<endl<<endl;
cout<<"Would you like to run this again? y for yes, any other key for no."<< endl<<endl;
cin>>choice;
cout<<endl;
}
system ("Pause");
return 0;
}
If (g)(t*t) is supposed to be a normal multiplication operation, then it should be g*t*t.
In your code, g is a double, but you are using it as if it was a pointer to a function (d = (g)(t*t);). If what you really want is to multiply t*t by g, you forgot an *:
d = (g)*(t*t);

Generating a comprehensive callgraph using GCC & Egypt

I am trying to generate a comprehensive callgraph (complete with low level calls to Linux, runtime, the lot).
I have statically compiled my source files with "-fdump-rtl-expand" and created RTL files, which I passed to a PERL script called Egypt (which I believe is Graphviz/Dot) and generated a PDF file of the callgraph. This works perfectly, no problems at all.
Except, there are calls being made into some libraries that are getting shown as built-in. I was looking to see if there is a way for the callgraph not to be printed as and instead the real calls made into the libraries ?
Please let me know if the question is unclear.
http://i.imgur.com/sp58v.jpg
Basically, I am trying to avoid the callgraph from generating < built-in >
Is there a way to do that ?
-------- CODE ---------
#include <cilk/cilk.h>
#include <stdio.h>
#include <stdlib.h>
unsigned long int t0, t5;
unsigned int NOSPAWN_THRESHOLD = 32;
int fib_nospawn(int n)
{
if (n < 2)
return n;
else
{
int x = fib_nospawn(n-1);
int y = fib_nospawn(n-2);
return x + y;
}
}
// spawning fibonacci function
int fib(long int n)
{
long int x, y;
if (n < 2)
return n;
else if (n <= NOSPAWN_THRESHOLD)
{
x = fib_nospawn(n-1);
y = fib_nospawn(n-2);
return x + y;
}
else
{
x = cilk_spawn fib(n-1);
y = cilk_spawn fib(n-2);
cilk_sync;
return x + y;
}
}
int main(int argc, char *argv[])
{
int n;
long int result;
long int exec_time;
n = atoi(argv[1]);
NOSPAWN_THRESHOLD = atoi(argv[2]);
result = fib(n);
printf("%ld\n", result);
return 0;
}
I compiled the Cilk Library from source.
I might have found the partial solution to the problem:
You need to pass the following option to egypt
--include-external
This produced a slightly more comprehensive callgraph, although there still is the " visible
http://i.imgur.com/GWPJO.jpg?1
Can anyone suggest if I get more depth in the callgraph ?
You can use the GCC VCG Plugin: A gcc plugin, which can be loaded when debugging gcc, to show internal structures graphically.
gcc -fplugin=/path/to/vcg_plugin.so -fplugin-arg-vcg_plugin-cgraph foo.c
Call-graph is place to store data needed
for inter-procedural optimization. All datastructures
are divided into three components:
local_info that is produced while analyzing
the function, global_info that is result
of global walking of the call-graph on the end
of compilation and rtl_info used by RTL
back-end to propagate data from already compiled
functions to their callers.

Resources