My aim is to solve a system of differential equations using Rcpp. Basically I want to set up a system as shown in the code below (modification of the code example found here: How to use C++ ODE solver with Rcpp in R?).
At the moment the code below integrates a set of ODEs over the time interval 0 to 10. For the entire time parms[0] is -100 and parms[1] is 10. However, my aim is to set up a system where parms[0] and parms[1] are only constant over a subset of the time interval. E.g. for the time interval 0-5 parms[0] should be set to 1, and for the remaining time parms[0] should be 10.
Actually, I have almost no experience in c++/rcpp. Thus, I have no idea how to set up such a system. Could you please give me a hint how I should construct the ode system. Thank you very much in advance for any advice how to solve this problem.
I save the code below in a .cpp file and call it with sourceCpp in R:
#include <Rcpp.h>
#include <boost/array.hpp>
#include <boost/numeric/odeint.hpp>
// [[Rcpp::depends(BH)]]
using namespace Rcpp;
using namespace std;
using namespace boost::numeric::odeint;
// State vector of the ODE system: three doubles.
typedef boost::array< double ,3 > state_type;
// Parameter vector read by rhs: two doubles.
typedef boost::array< double ,2 > parms_type;
// Upper integration limit handed to integrate_adaptive in boostExample.
// NOTE(review): the compiler log quoted further down in this file rejects this
// declaration because `time` is already declared as `time_t time(time_t*)` in <time.h>.
double time = 10;
// parms[0] feeds dxdt[0]; parms[1] feeds dxdt[1] and dxdt[2] in rhs.
parms_type parms = {-100, 10};
// Right-hand side of the ODE system: fills dxdt with the derivatives at time t.
//   x     current state (read only)
//   dxdt  output, overwritten with the three derivatives
//   t     current time; used as a divisor
// Components 1 and 2 are both built from parms[1] and x[1].
void rhs( const state_type &x , state_type &dxdt , const double t) {
    // Both denominators are shared by all three components.
    const double two_t = 2.0*t;
    const double two_t_sq = two_t*t;
    dxdt[0] = parms[0]/two_t_sq + x[0]/two_t;
    dxdt[1] = parms[1]/two_t_sq + x[1]/two_t;
    dxdt[2] = parms[1]/two_t_sq + x[1]/two_t;
}
// Observer for the integrator: prints one tab-separated row "t x0 x1 x2".
void write_cout( const state_type &x , const double t ) {
    // Write through Rcpp's stream rather than std::cout.
    Rcpp::Rcout << t << '\t';
    Rcpp::Rcout << x[0] << '\t';
    Rcpp::Rcout << x[1] << '\t';
    Rcpp::Rcout << x[2] << endl;
}
typedef runge_kutta_dopri5< state_type > stepper_type;
// Integrates the system defined by rhs from t = 1.0 up to the global `time`,
// starting at x = (1, 1, 1), and reports every accepted step through write_cout.
// Always returns true.
// [[Rcpp::export]]
bool boostExample() {
    state_type x = { 1.0 , 1.0, 1.0 }; // initial conditions

    const double abs_tol = 1E-12;
    const double rel_tol = 1E-12;
    const double t_start = 1.0;
    const double first_dt = 0.1;

    integrate_adaptive(make_controlled(abs_tol, rel_tol, stepper_type()),
                       rhs, x, t_start, time, first_dt, write_cout);
    return true;
}
Your code does not compile for me:
boost-ode.cpp:11:8: error: ‘double time’ redeclared as different kind of symbol
double time = 10.0;
^~~~
In file included from /usr/include/pthread.h:24:0,
from /usr/include/x86_64-linux-gnu/c++/6/bits/gthr-default.h:35,
from /usr/include/x86_64-linux-gnu/c++/6/bits/gthr.h:148,
from /usr/include/c++/6/ext/atomicity.h:35,
from /usr/include/c++/6/bits/basic_string.h:39,
from /usr/include/c++/6/string:52,
from /usr/include/c++/6/stdexcept:39,
from /usr/include/c++/6/array:39,
from /usr/include/c++/6/tuple:39,
from /usr/include/c++/6/unordered_map:41,
from /usr/local/lib/R/site-library/Rcpp/include/Rcpp/platform/compiler.h:153,
from /usr/local/lib/R/site-library/Rcpp/include/Rcpp/r/headers.h:48,
from /usr/local/lib/R/site-library/Rcpp/include/RcppCommon.h:29,
from /usr/local/lib/R/site-library/Rcpp/include/Rcpp.h:27,
from boost-ode.cpp:1:
/usr/include/time.h:192:15: note: previous declaration ‘time_t time(time_t*)’
extern time_t time (time_t *__timer) __THROW;
^~~~
I simply removed the global variable time and used an explicit 10.0 in its place. I also removed the namespace usage of Rcpp and std. The former was not used anyway, the latter only in one place. Generally I try to avoid importing such large namespaces, especially two at the same time.
Anyway, one simple solution would be to introduce two param vectors and select in rhs the appropriate one based on the time:
#include <Rcpp.h>
#include <boost/array.hpp>
#include <boost/numeric/odeint.hpp>
// [[Rcpp::depends(BH)]]
using namespace boost::numeric::odeint;
// State vector of the ODE system: three doubles.
typedef boost::array< double ,3 > state_type;
// Parameter vector: two doubles.
typedef boost::array< double ,2 > parms_type;
// Parameters rhs uses while t < 5.0.
parms_type parms_begin = {1, 10};
// Parameters rhs uses for every other t (t >= 5.0).
parms_type parms_end = {10, 10};
void rhs( const state_type &x , state_type &dxdt , const double t) {
if (t < 5.0) {
dxdt[0] = parms_begin[0]/(2.0*t*t) + x[0]/(2.0*t);
dxdt[1] = parms_begin[1]/(2.0*t*t) + x[1]/(2.0*t);
dxdt[2] = parms_begin[1]/(2.0*t*t) + x[1]/(2.0*t);
} else {
dxdt[0] = parms_end[0]/(2.0*t*t) + x[0]/(2.0*t);
dxdt[1] = parms_end[1]/(2.0*t*t) + x[1]/(2.0*t);
dxdt[2] = parms_end[1]/(2.0*t*t) + x[1]/(2.0*t);
}
}
void write_cout( const state_type &x , const double t ) {
// use Rcpp's stream
Rcpp::Rcout << t << '\t' << x[0] << '\t' << x[1] << '\t' << x[2] << std::endl;
}
typedef runge_kutta_dopri5< state_type > stepper_type;
// Integrates the system defined by rhs from t = 1.0 to t = 10.0, starting at
// x = (1, 1, 1), printing each step via write_cout. Always returns true.
// [[Rcpp::export]]
bool boostExample() {
    const double tolerance = 1E-12;   // used for both the absolute and the relative error
    const double t_first = 1.0;
    const double t_last = 10.0;
    const double dt_guess = 0.1;

    state_type x = { 1.0 , 1.0, 1.0 }; // initial conditions
    integrate_adaptive(make_controlled(tolerance, tolerance, stepper_type()),
                       rhs, x, t_first, t_last, dt_guess, write_cout);
    return true;
}
/*** R
boostExample()
*/
Related
In this simple example I would like to subset a matrix by row and pass it to another cpp function; the example demonstrates this works by passing an input array to the other function first.
#include "RcppArrayFire.h"
using namespace Rcpp;
// Returns `theta` with element 1 reset to 0 when that element is >= 1;
// otherwise returns it unchanged. `theta` is taken by value.
// The element is read through host<double>().
af::array theta_check_cpp( af::array theta){
    // host<double>() copies the element into a newly allocated host buffer
    // that the caller owns, so read the value and release the buffer at once
    // instead of leaking it.
    double *second = theta(1).host<double>();
    const bool needs_reset = *second >= 1;
    af::freeHost(second);

    if (needs_reset) {
        theta(1) = 0;
    }
    return theta;
}
// Exported driver for theta_check_cpp. Builds a 2 x theta_size array X whose
// row 0 is theta_check_cpp(theta) and whose row 1 is theta, then returns
// theta_check_cpp applied to row 1 of X.
// NOTE(review): X is constructed without an explicit data type; the answer that
// follows this snippet states that the constructor defaults to f32, which is
// why the host<double>() call inside theta_check_cpp fails for X.row(1).
// [[Rcpp::export]]
af::array theta_check(RcppArrayFire::typed_array<f64> theta){
// Length of the input along its first dimension.
const int theta_size = theta.dims()[0];
af::array X(2, theta_size);
// First call: theta_check_cpp receives the input array directly.
X(0, af::seq(theta_size)) = theta_check_cpp( theta );
X(1, af::seq(theta_size)) = theta;
// return X;
Rcpp::Rcout << " works till here";
// Second call: theta_check_cpp receives a row taken from X.
return theta_check_cpp( X.row(1) );
}
/*** R
theta <- c( 2, 2, 2)
theta_check(theta)
*/
The constructor you are using to create X has an argument ty for the data type, which defaults to f32. Therefore X uses 32 bit floats and you cannot extract a 64 bit host pointer from that. Either use
af::array X(2, theta_size, f64);
to create an array using 64 bit doubles, or extract a 32 bit host pointer via
if(*theta(1).host<float>() >= 1){
...
I have been experimenting with the RcppArrayFire package, mostly rewriting some cost functions from RcppArmadillo, and can't seem to get past the error "no viable conversion from 'af::array' to 'float'". I have also been getting some backend errors; the example below seems free of these.
This cov-var example is written poorly just to use all relevant coding pieces from my actual cost function. As of now it is the only addition in a package generated by, "RcppArrayFire.package.skeleton".
#include "RcppArrayFire.h"
#include <Rcpp.h>
// [[Rcpp::depends(RcppArrayFire)]]
// Element-by-element computation of
//   sum((X - mean_X) * (Y - mean_Y)) / sum((X - mean_X)^2)
// over the first Len elements of X_vect and Y_vect.
// NOTE(review): this is the version from the question. The question text reports
// "no viable conversion from 'af::array' to 'float'" for it, and the answer
// attributes that to the non-templated af::sum returning an af::array.
// [[Rcpp::export]]
float example_ols(const RcppArrayFire::typed_array<f32>& X_vect, const RcppArrayFire::typed_array<f32>& Y_vect){
int Len = X_vect.dims()[0];
int Len_Y = Y_vect.dims()[0];
// Shrink Len until it no longer exceeds Len_Y.
while( Len_Y < Len){
Len --;
}
// The sums run over the whole input arrays, while the divisor is Len.
float mean_X = af::sum(X_vect)/Len;
float mean_Y = af::sum(Y_vect)/Len;
// temp holds the cross products, temp_x the squared deviations of X.
RcppArrayFire::typed_array<f32> temp(Len);
RcppArrayFire::typed_array<f32> temp_x(Len);
for( int f = 0; f < Len; f++){
temp(f) = (X_vect(f) - mean_X)*(Y_vect(f) - mean_Y);
temp_x(f) = af::pow(X_vect(f) -mean_X, 2);
}
return af::sum(temp)/af::sum(temp_x);
}
/*** R
X <- 1:10
Y <- 2*X +rnorm(10, mean = 0, sd = 1)
example_ols(X, Y)
*/
The first thing to consider is the af::sum function, which comes in different forms: an af::sum(af::array) that returns an af::array in device memory, and a templated af::sum<T>(af::array) that returns a T in host memory. So the minimal change to your example would be to use af::sum<float>:
#include "RcppArrayFire.h"
#include <Rcpp.h>
// [[Rcpp::depends(RcppArrayFire)]]
// Element-by-element computation of
//   sum((X - mean_X) * (Y - mean_Y)) / sum((X - mean_X)^2)
// over the first Len elements, where Len is the shorter of the two input
// lengths. The templated af::sum<float> returns a float directly.
// [[Rcpp::export]]
float example_ols(const RcppArrayFire::typed_array<f32>& X_vect,
                  const RcppArrayFire::typed_array<f32>& Y_vect){
    // Work on the common length of both inputs.
    const int len_x = X_vect.dims()[0];
    const int len_y = Y_vect.dims()[0];
    const int Len = (len_y < len_x) ? len_y : len_x;

    // The sums run over the whole input arrays, while the divisor is Len.
    const float mean_X = af::sum<float>(X_vect)/Len;
    const float mean_Y = af::sum<float>(Y_vect)/Len;

    RcppArrayFire::typed_array<f32> cross_products(Len);
    RcppArrayFire::typed_array<f32> squared_dev_x(Len);
    for (int f = 0; f < Len; ++f) {
        cross_products(f) = (X_vect(f) - mean_X)*(Y_vect(f) - mean_Y);
        squared_dev_x(f) = af::pow(X_vect(f) -mean_X, 2);
    }

    const float numerator = af::sum<float>(cross_products);
    const float denominator = af::sum<float>(squared_dev_x);
    return numerator/denominator;
}
/*** R
set.seed(1)
X <- 1:10
Y <- 2*X +rnorm(10, mean = 0, sd = 1)
example_ols(X, Y)
*/
However, there are more things one can improve. In no particular order:
You don't need to include Rcpp.h.
There is an af::mean function for computing the mean of an af::array.
In general RcppArrayFire::typed_array<T> is only needed for getting arrays from R into C++. Within C++ and for the way back you can use af::array.
Even when your device does not support double, you can still use double values on the host.
In order to get good performance, you should avoid for loops and use vectorized functions, just like in R. You have to impose equal dimensions for X and Y, though.
Interestingly I get a different result when I use vectorized functions. Right now I am not sure why this is the case, but the following form makes more sense to me. You should verify that the result is what you want to get:
#include <RcppArrayFire.h>
// [[Rcpp::depends(RcppArrayFire)]]
// Vectorised computation of
//   sum((X - mean_X) * (Y - mean_Y)) / sum((X - mean_X)^2)
// using whole-array operations. The means and sums are taken as doubles.
// [[Rcpp::export]]
double example_ols(const RcppArrayFire::typed_array<f32>& X_vect,
                   const RcppArrayFire::typed_array<f32>& Y_vect){
    const double mean_X = af::mean<double>(X_vect);
    const double mean_Y = af::mean<double>(Y_vect);

    // Element-wise cross products and squared deviations of X.
    const af::array cross_products = (X_vect - mean_X) * (Y_vect - mean_Y);
    const af::array squared_dev_x = af::pow(X_vect - mean_X, 2.0);

    const double numerator = af::sum<double>(cross_products);
    const double denominator = af::sum<double>(squared_dev_x);
    return numerator/denominator;
}
/*** R
set.seed(1)
X <- 1:10
Y <- 2*X +rnorm(10, mean = 0, sd = 1)
example_ols(X, Y)
*/
BTW, an even shorter version would be:
#include <RcppArrayFire.h>
// [[Rcpp::depends(RcppArrayFire)]]
// Shortest form: the ratio af::cov(X, Y) / af::var(X), returned as an af::array.
// [[Rcpp::export]]
af::array example_ols(const RcppArrayFire::typed_array<f32>& X_vect,
                      const RcppArrayFire::typed_array<f32>& Y_vect){
    const af::array covariance_xy = af::cov(X_vect, Y_vect);
    const af::array variance_x = af::var(X_vect);
    return covariance_xy / variance_x;
}
Generally it is a good idea to use the built-in functions as much as possible.
I've recently learned about false sharing, which in my understanding stems from the CPU's attempt to create cache coherence between different cores.
However, doesn't the following example demonstrate that cache coherence is violated?
The example below launches several threads that increase a global variable x, and several threads that assign the value of x to y, and an observer that tests if y>x. The condition y>x should never happen if there was memory coherence between the cores, as y is only increased after x was increased. However, this condition does happen according to the results of running this program. I tested it on visual studio both 64 and 86, both debug and release with pretty much the same results.
So, does memory coherence only happen when it's bad and never when it's good? :)
Please explain how cache coherence works and how it doesn't work. If you can guide me to a book that explains the subject I'll be grateful.
edit: I've added mfence where ever possible, still there is no memory coherence (presumably due to stale cache).
Also, I know the program has a data race, that's the whole point. My question is: Why is there a data race if the cpu maintains cache coherence (if it wasn't maintaining cache coherence, then what is false sharing and how does it happen?). Thank you.
#include <intrin.h>
#include <windows.h>
#include <iostream>
#include <thread>
#include <atomic>
#include <list>
#include <chrono>
#include <ratio>
#define N 1000000
#define SEPARATE_CACHE_LINES 0
#define USE_ATOMIC 0
// Shared state for the experiment. x is declared with 64-byte alignment; y gets
// the same alignment only when SEPARATE_CACHE_LINES is non-zero. With the
// setting used for the sample output, &y is &x + 4.
// NOTE(review): presumably 64 is meant to match the cache-line size of the test machine — confirm.
#pragma pack(1)
struct
{
__declspec (align(64)) volatile long x;
#if SEPARATE_CACHE_LINES
__declspec (align(64))
#endif
volatile long y;
} data;
// Short aliases for the two shared counters.
volatile long &g_x = data.x;
volatile long &g_y = data.y;
// Number of Observer iterations that saw y > x; incremented only by Observer
// and printed by main after all threads have been joined.
int g_observed;
// Start flag: every worker spins until main sets it to true.
std::atomic<bool> g_start;
// Samples g_y and then g_x, N times, and counts in g_observed how often the
// sampled y is greater than the sampled x. Waits for g_start before starting.
// The two values are read by separate loads with a fence between them, so
// other threads can modify g_x and g_y between the two reads.
void Observer()
{
// Busy-wait until main releases all workers.
while (!g_start);
for (int i = 0;i < N;++i)
{
_mm_mfence();
// y is deliberately read before x.
long y = g_y;
_mm_mfence();
long x = g_x;
_mm_mfence();
if (y > x)
{
++g_observed;
}
}
}
// Adds 1 to g_x, N times, after waiting for g_start.
// With USE_ATOMIC the increment is a single InterlockedAdd call. Otherwise it
// is a separate load, add and store with fences in between.
// NOTE(review): in the non-atomic path another XIncreaser thread can update g_x
// between this thread's load and store, so the store can replace a larger value
// with a smaller one. The sample output ends with x well below 4 * N, which is
// consistent with such lost updates.
void XIncreaser()
{
// Busy-wait until main releases all workers.
while (!g_start);
for (int i = 0;i < N;++i)
{
#if USE_ATOMIC
InterlockedAdd(&g_x,1);
#else
_mm_mfence();
// Load and add happen here; the store is a separate step below.
int x = g_x+1;
_mm_mfence();
g_x = x;
_mm_mfence();
#endif
}
}
// Copies the current value of g_x into g_y, N times, after waiting for g_start.
// In both variants the read of g_x and the write of g_y are separate steps, so
// the value written to g_y is whatever g_x held at the time of the read.
void YAssigner()
{
// Busy-wait until main releases all workers.
while (!g_start);
for (int i = 0;i < N;++i)
{
#if USE_ATOMIC
// Plain read of g_x followed by an interlocked store into g_y.
long x = g_x;
InterlockedExchange(&g_y, x);
#else
_mm_mfence();
int x = g_x;
_mm_mfence();
g_y = x;
_mm_mfence();
#endif
}
}
// Runs the experiment: 4 YAssigner threads, 4 XIncreaser threads and one
// Observer are created, released together through g_start, and joined.
// Afterwards the final counters, their addresses, the elapsed time and the
// build settings are printed.
int main()
{
    using namespace std::chrono;

    // Reset all shared state before any worker exists.
    g_x = 0;
    g_y = 0;
    g_observed = 0;
    g_start = false;

    const int NAssigners = 4;
    const int NIncreasers = 4;

    // Workers are created first and only start working once g_start flips.
    std::list<std::thread> workers;
    for (int remaining = NAssigners; remaining > 0; --remaining)
    {
        workers.emplace_back(YAssigner);
    }
    for (int remaining = NIncreasers; remaining > 0; --remaining)
    {
        workers.emplace_back(XIncreaser);
    }
    workers.emplace_back(Observer);

    const auto started = high_resolution_clock::now();
    g_start = true;
    for (auto& worker : workers)
    {
        worker.join();
    }
    const auto finished = high_resolution_clock::now();
    const duration<double> elapsed = finished - started;

    std::cout << "x = " << g_x << " y = " << g_y << " number of times y > x = " << g_observed << std::endl;
    std::cout << "&x = " << (int*)&g_x << " &y = " << (int*)&g_y << std::endl;
    std::cout << "time elapsed = " << elapsed.count() << std::endl;
    std::cout << "USE_ATOMIC = " << USE_ATOMIC << " SEPARATE_CACHE_LINES = " << SEPARATE_CACHE_LINES << std::endl;
    return 0;
}
Example output:
x = 1583672 y = 1583672 number of times y > x = 254
&x = 00007FF62BE95800 &y = 00007FF62BE95804
time elapsed = 0.187785
USE_ATOMIC = 0 SEPARATE_CACHE_LINES = 0
False sharing is mainly related to performance, not coherence or program order. The CPU cache works at a granularity which is typically 16, 32, 64, ... bytes. That means if two independent data items are close together in memory, they will experience each other's cache operations. Specifically, if &a / CACHE_LINE_SIZE == &b / CACHE_LINE_SIZE (integer division of the addresses), then they will share a cache line.
For example, if cpu0 & 1 are fighting over a, and cpu 2 & 3 are fighting over b, the cache line containing a & b will thrash between each of the 4 caches. This is the effect of false sharing, and it causes a large performance drop.
False sharing happens because the coherence algorithm in the caches demand that there is a consistent view of memory. A good way to examine it is to put two atomic counters in a structure spaced out by one or two k:
// Two counters kept apart by a padding array, for the timing comparison
// described in the surrounding text.
struct a {
// First counter.
long a;
// Spacer between the counters; this is the member to comment out for the second run.
long pad[1024];
// Second counter.
long b;
};
and find a nice little machine language function to do an atomic increment. Then cut loose NCPU/2 threads incrementing a and NCPU/2 threads incrementing b until they reach a big number.
Then repeat, commenting out the pad array. Compare the times.
When you are trying to get at machine details, clarity and precision are your friends; C++ and weird attribute declarations aren’t.
I created a cumsum function in an R package with rcpp which will cumulatively sum a vector until it hits the user defined ceiling or floor. However, if one wants the cumsum to be bounded above, the user must still specify a floor.
Example:
a = c(1, 1, 1, 1, 1, 1, 1)
If i wanted to cumsum a and have an upper bound of 3, I could cumsum_bounded(a, lower = 1, upper = 3). I would rather not have to specify the lower bound.
My code:
#include <Rcpp.h>
#include <float.h>
#include <cmath>
using namespace Rcpp;
// Cumulative sum of x in which the running total is pulled back to `lower`
// whenever it drops below it, and otherwise to `upper` whenever it exceeds it.
// Returns a vector of the same length as x holding the clamped running totals.
// [[Rcpp::export]]
NumericVector cumsum_bounded(NumericVector x, int upper, int lower) {
    NumericVector out(x.size());
    double running = 0;
    for (int k = 0; k < x.size(); ++k) {
        running += x[k];
        // The floor is tested first; the ceiling is only considered when the
        // floor did not apply.
        running = (running < lower) ? lower
                : (running > upper) ? upper
                : running;
        out[k] = running;
    }
    return out;
}
What I would like:
#include <Rcpp.h>
#include <float.h>
#include <cmath>
#include <climits> //for LLONG_MIN and LLONG_MAX
using namespace Rcpp;
// Desired signature from the question: the same bounded cumulative sum, but
// with `upper` and `lower` defaulting to LLONG_MAX / LLONG_MIN so that either
// bound can be omitted.
// NOTE(review): the answer that follows explains that Rcpp attributes accept
// only a limited set of default-value forms, which is why these macro defaults
// are replaced by Rcpp::Nullable parameters there.
// [[Rcpp::export]]
NumericVector cumsum_bounded(NumericVector x, long long int upper = LLONG_MAX, long long int lower = LLONG_MIN) {
NumericVector res(x.size());
// Running total, clamped after every addition.
double acc = 0;
for (int i=0; i < x.size(); ++i) {
acc += x[i];
// The lower bound is checked first; the upper bound only if the lower one did not apply.
if (acc < lower) acc = lower;
else if (acc > upper) acc = upper;
res[i] = acc;
}
return res;
}
In short, yes, it's possible, but it requires some finesse: either creating an intermediary function or embedding the selection logic within the main function.
In long, Rcpp attributes only support a limited set of default values. These values are listed in the Rcpp FAQ entry 3.12:
String literals delimited by quotes (e.g. "foo")
Integer and Decimal numeric values (e.g. 10 or 4.5)
Pre-defined constants including:
Booleans: true and false
Null Values: R_NilValue, NA_STRING, NA_INTEGER, NA_REAL, and NA_LOGICAL.
Selected vector types can be instantiated using the
empty form of the ::create static member function.
CharacterVector, IntegerVector, and NumericVector
Matrix types instantiated using the rows, cols constructor Rcpp::Matrix n(rows,cols)
CharacterMatrix, IntegerMatrix, and NumericMatrix)
If you were to specify numerical values for LLONG_MAX and LLONG_MIN, this would meet the criteria to directly use Rcpp attributes on the function. However, these values are implementation specific, so it would not be ideal to hardcode them. Thus, we have to seek an outside solution: the Rcpp::Nullable<T> class, which enables a default value of NULL. The reason why we have to wrap the parameter type with Rcpp::Nullable<T> is that NULL is very special and can cause heartache if one is not careful.
The NULL value, unlike others on the real number line, will not be used to bound your values in this case. As a result, it is the perfect candidate to use on the function call. There are two choices you then have to make: use Rcpp::Nullable<T> as the parameters on the main function, or create a "logic" helper function that has the correct parameters and can be used elsewhere within your application without worry. I've opted for the latter below.
#include <Rcpp.h>
#include <float.h>
#include <cmath>
#include <climits> //for LLONG_MIN and LLONG_MAX
using namespace Rcpp;
// Cumulative sum of x with the running total clamped after every addition:
// it is raised to `lower` when it falls below it, and otherwise lowered to
// `upper` when it exceeds it. With the default bounds (LLONG_MIN / LLONG_MAX)
// the corresponding side is effectively unbounded.
// Returns a vector of the same length as x holding the clamped running totals.
NumericVector cumsum_bounded_logic(NumericVector x,
                                   long long int upper = LLONG_MAX,
                                   long long int lower = LLONG_MIN) {
    // Use R's vector length type for the index so that vectors longer than
    // INT_MAX do not overflow an int counter.
    const R_xlen_t n = x.size();
    NumericVector res(n);
    double acc = 0;
    for (R_xlen_t i = 0; i < n; ++i) {
        acc += x[i];
        // The lower bound is checked first; the upper bound only if the lower one did not apply.
        if (acc < lower) acc = lower;
        else if (acc > upper) acc = upper;
        res[i] = acc;
    }
    return res;
}
// R-facing wrapper around cumsum_bounded_logic. Either bound may be left as
// NULL (the default); a NULL upper bound is replaced by LLONG_MAX and a NULL
// lower bound by LLONG_MIN before the work is delegated.
// [[Rcpp::export]]
NumericVector cumsum_bounded(NumericVector x,
                             Rcpp::Nullable<long long int> upper = R_NilValue,
                             Rcpp::Nullable<long long int> lower = R_NilValue) {
    // Resolve each bound independently instead of enumerating all four
    // NULL / non-NULL combinations; every path ends in the same call.
    const long long int upper_bound =
        upper.isNotNull() ? Rcpp::as< long long int >(upper) : LLONG_MAX;
    const long long int lower_bound =
        lower.isNotNull() ? Rcpp::as< long long int >(lower) : LLONG_MIN;
    return cumsum_bounded_logic(x, upper_bound, lower_bound);
}
Test Output
cumsum_bounded(a, 5)
## [1] 1 2 3 4 5 5 5
cumsum_bounded(a, 5, 2)
## [1] 2 3 4 5 5 5 5
I wrote a program to calculate the eigen-values of a 2-by-2 random matrix. I generated 50,000 2x2 random matrices and computed their eigen-values.
With boost, I used multi-thread in the member function getEigVal() of myClass, but I found that the CPU utilization is only 35%.
How can I speed up the process of getEigVal() with multi-threading?
#define _USE_MATH_DEFINES
#define _USE_MATH_DEFINES
#define BOOST_THREAD_PROVIDES_FUTURE
#include <boost/thread.hpp>
#include <boost/thread/future.hpp>
#include <algorithm>
#include <chrono>
#include <cmath>
#include <complex>
#include <cstddef>
#include <iostream>
#include <random>
#include <system_error>
#include <thread>
#include <utility>
#include <vector>
using namespace std;
using namespace std::chrono;
/**
 * Draws random real 2x2 matrices and collects their eigenvalues.
 *
 * Every matrix entry is drawn from std::normal_distribution<>(0.0, var). The
 * second argument of that distribution is a standard deviation, so `var` is
 * used as a standard deviation here.
 *
 * getEigVal() draws all matrices on the calling thread, then splits the
 * eigenvalue computation into a few contiguous index ranges that run on
 * worker threads. The previous version started one thread per matrix and
 * waited for it immediately, which serialised the work and paid the cost of
 * a thread start for every 2x2 matrix.
 */
class myClass {
private:
    int numOfRun;   // number of matrices per getEigVal() call
    double var;     // second argument of the normal distribution
    std::vector<std::vector<std::complex<double>>> eigVal;   // numOfRun entries of two roots each

    // Both roots of a*z^2 + b*z + c = 0 via the quadratic formula; the
    // discriminant is evaluated as a complex number so that negative values
    // yield a conjugate pair.
    static std::pair<std::complex<double>, std::complex<double>>
    quad_root(double a, double b, double c) {
        const std::complex<double> delta =
            std::sqrt(std::complex<double>(b * b - 4 * a * c, 0));
        return std::make_pair((-b + delta) / 2.0 / a, (-b - delta) / 2.0 / a);
    }

    // Eigenvalues of the matrix {{m[0], m[1]}, {m[2], m[3]}}: the roots of
    // z^2 - trace*z + determinant.
    static std::pair<std::complex<double>, std::complex<double>>
    eig(const double* m) {
        const double b = -m[0] - m[3];
        const double c = m[0] * m[3] - m[1] * m[2];
        return quad_root(1.0, b, c);
    }

public:
    /// @param run number of random matrices to process per getEigVal() call
    /// @param v   passed as the second argument of the normal distribution
    myClass(int run = 50000, double v = 1) :
        numOfRun(run), var(v), eigVal(numOfRun, std::vector<std::complex<double>>(2)) {
    }

    /// Draws numOfRun random matrices, stores the two eigenvalues of each in
    /// eigVal and returns a copy of eigVal.
    std::vector<std::vector<std::complex<double>>> getEigVal() {
        const std::size_t total = eigVal.size();

        // The generator is not shared between threads: all entries are drawn
        // here, four per matrix, in row-major order.
        std::random_device rd;
        std::mt19937 e2(rd());
        std::normal_distribution<> a(0.0, var);
        std::vector<double> entries(4 * total);
        for (double& entry : entries) {
            entry = a(e2);
        }

        // Each call writes only eigVal[first .. last-1], so disjoint ranges
        // can be processed concurrently without locking.
        auto solve_range = [this, &entries](std::size_t first, std::size_t last) {
            for (std::size_t i = first; i < last; ++i) {
                const auto roots = eig(&entries[4 * i]);
                eigVal[i][0] = roots.first;
                eigVal[i][1] = roots.second;
            }
        };

        // A worker is only worth starting when it gets a sizeable slice.
        const std::size_t min_per_worker = 4096;
        const std::size_t hw = std::max(1u, std::thread::hardware_concurrency());
        const std::size_t n_workers =
            std::max<std::size_t>(1, std::min(hw, total / min_per_worker));
        const std::size_t chunk = (total + n_workers - 1) / n_workers;

        std::vector<std::thread> pool;
        pool.reserve(n_workers - 1);
        std::size_t first = 0;
        for (std::size_t w = 0; w + 1 < n_workers; ++w) {
            const std::size_t last = std::min(total, first + chunk);
            try {
                pool.emplace_back(solve_range, first, last);
            } catch (const std::system_error&) {
                // No thread available: process this slice on the calling thread.
                solve_range(first, last);
            }
            first = last;
        }
        // The calling thread handles the final slice instead of idling.
        solve_range(first, total);
        for (std::thread& worker : pool) {
            worker.join();
        }
        return eigVal;
    }
};
int main() {
myClass Test;
auto start = steady_clock::now();
vector <vector<complex<double>>> result = Test.getEigVal();
auto end = steady_clock::now();
cout << "Time elapsed: " << (duration_cast<milliseconds>(end - start).count())/1e3 << " seconds\n";//13.826 s
return 0;
}