Eigen3 Matrix-Matrix Multiplication 30 times faster than own openmp parallelized code - visual-c++

I compiled the code below on VS C++ 2017 with /openmp /O2 /arch::AVX.
When running with 8 threads the output is:
dt_loops = 1562ms
dt_eigen = 26 ms
I expected the A * B to be faster than my own handmade loops but I did not expect such a large difference. Is there anything wrong with my code? And if not how can Eigen3 do it so much faster.
I'm not very experienced in using OpenMP or any other parallelization method. I tried diferent loop orders but the one below is the fastest.
#include <iostream>
#include <chrono>
#include <Eigen/Dense>
int main() {
std::chrono::time_point<std::chrono::system_clock> start1, end1, start2, end2;
int n = 1000;
Eigen::MatrixXd A = Eigen::MatrixXd::Random(n, n);
Eigen::MatrixXd B = Eigen::MatrixXd::Random(n, n);
Eigen::MatrixXd C = Eigen::MatrixXd::Zero(n, n);
start1 = std::chrono::system_clock::now();
int i, j, k;
#pragma omp parallel for private(i, j, k)
for (i = 0; i < n; ++i) {
for (j = 0; j < n; ++j) {
for (k = 0; k < n; ++k) {
C(i, j) += A(i, k) * B(k, j);
}
}
}
end1 = std::chrono::system_clock::now();
std::cout << "dt_loops = " << std::chrono::duration_cast<std::chrono::milliseconds>(end1-start1).count() << " ms" << std::endl;
Eigen::MatrixXd D = Eigen::MatrixXd::Zero(n, n);
start2 = std::chrono::system_clock::now();
D = A * B;
end2 = std::chrono::system_clock::now();
std::cout << "dt_eigen = " << std::chrono::duration_cast<std::chrono::milliseconds>(end2-start2).count() << " ms" << std::endl;
}

Related

How to correctly calculate ACF in C++?

I would like to manually reproduce the method that authors of an article used in their research (DOI: 10.1038/s41598-017-02750-9 (Page 8. top)). It is mentioned as "ACF", so I wrote different functions:
1, a version based on a youtube video (https://youtu.be/ZjaBn93YPWo?t=417) using Alglibs Pearson correlation coefficient function
2, then another version based on the formula that is described in the article mentioned above
3, then another version based on the simplified formula described at an online ACF calculator page (https://planetcalc.com/7908/)
4, then a version based on the longer formula described there (https://planetcalc.com/7908/)
=> Yet, all of these give different output. However, method 3. is consistent with the output coming from the online calculator ran in my browser: https://planetcalc.com/7884/?d=.bTkjs.ymyQ8blXMoYiMgIOOmzhhI4fnckel.J5yEDWtV89Gz32Ch0kse2s
My code is here:
#include <iostream>
#define _WIN32_WINNT 0x0500
#include<windows.h>
//#include <cmath>
#include "alglib/alglibinternal.h"
#include "alglib/alglibmisc.h"
#include "alglib/ap.h"
#include "alglib/dataanalysis.h"
#include "alglib/diffequations.h"
#include "alglib/fasttransforms.h"
#include "alglib/integration.h"
#include "alglib/interpolation.h"
#include "alglib/linalg.h"
#include "alglib/optimization.h"
#include "alglib/solvers.h"
#include "alglib/specialfunctions.h"
#include "alglib/statistics.h"
#include "alglib/stdafx.h"
using namespace std;
double* normalize(double* _arr, int _s) {
double* output = new double[_s];
double mod = 0.0;
for (size_t i = 0; i < _s; ++i)
mod += _arr[i] * _arr[i];
double mag = sqrt(mod); //TODO: if 0, throw exc
double mag_inv = 1.0 / mag;
for (size_t i = 0; i < _s; ++i)
output[i] = _arr[i] * mag_inv;
return output;
}
void doACFyoutube(double* _ina, int _s)
// https://youtu.be/ZjaBn93YPWo?t=417 => the most unefficient, but understandable method
{
double* temp_x;
double* temp_y;
double* ACFoutput = new double[_s];
for(int shift = 0; shift < _s; shift++)
{
temp_x = new double[_s-shift];
temp_y = new double[_s-shift];
for(int cpy = 0; cpy < _s-shift; cpy++)
{
temp_x[cpy] = _ina[cpy];
temp_y[cpy] = _ina[cpy+shift];
}
temp_y = normalize(temp_y, _s-shift); //not sure if needed //TODO: leak
alglib::real_1d_array temp_x_alglib;
alglib::real_1d_array temp_y_alglib;
temp_x_alglib.setcontent(_s-shift, temp_x);
temp_y_alglib.setcontent(_s-shift, temp_y);
ACFoutput[shift] = alglib::pearsoncorr2(temp_x_alglib, temp_y_alglib); //Pearson product-moment correlation coefficient
delete temp_x;
delete temp_y;
}
for(int i=0; i<_s; i++)
cout << " lag = " << i << "\tACF(lag) = " << ACFoutput[i] << endl;
}
void doACFgoal(double* _ina, int _s)
// DOI: 10.1038/s41598-017-02750-9 => page 8, first equation (my goal is to reproduce this)
{
double mean = 0; //mean
for(int a = 0; a < _s; a++ )
mean += _ina[a];
mean /= _s;
double var = 0; //variance
for(int b = 0; b < _s; b++ )
var += (_ina[b]-mean)*(_ina[b]-mean);
var /= _s-1; //needed? (-1) a.k.a. Bessell's correction ?
double* ACFoutput = new double[_s];
for(int i = 0; i < _s; i++)
{
double temp_sum = 0;
for(int j = 1; j <= _s-i; j++)
temp_sum += (_ina[j]-mean)*(_ina[j+i]-mean);
ACFoutput[i] = (double)1/(((double)_s-(double)i)*var*var) * temp_sum;
}
for(int i=0; i<_s; i++)
cout << " lag = " << i << "\tACF(lag) = " << ACFoutput[i] << endl;
}
void doACFplanetcalcCoarse(double* _ina, int _s)
// https://planetcalc.com/7908/
{
double mean = 0; //mean
for(int a = 0; a < _s; a++ )
mean += _ina[a];
mean /= _s;
double* ACFoutput = new double[_s];
for(int i = 0; i < _s; i++)
{
double temp_sum1 = 0;
double temp_sum2 = 0;
for(int j = 0; j < _s-i; j++)
temp_sum1 += (_ina[j]-mean)*(_ina[j+i]-mean);
for(int k = 0; k < _s; k++)
temp_sum2 += (_ina[k]-mean)*(_ina[k]-mean);
ACFoutput[i] = temp_sum1 / temp_sum2;
}
for(int i=0; i<_s; i++)
cout << " lag = " << i << "\tACF(lag) = " << ACFoutput[i] << endl;
}
void doACFplanetcalcFine(double* _ina, int _s)
// https://planetcalc.com/7908/ => gives different output than the online calculator script, even though uses the longer formula described there
{
double* ACFoutput = new double[_s];
for(int k = 0; k < _s; k++)
{
double mean1 = 0; //mean of first N-k values
for(int a = 0; a < _s-k; a++ )
mean1 += _ina[a];
mean1 /= _s-k;
// cout << "\t mean of first N-" << k << " values = " << mean1 << endl;
double mean2 = 0; //mean of last N-k values
for(int a = k; a < _s; a++ )
mean2 += _ina[a];
mean2 /= _s-k;
// cout << "\t mean of last N-" << k << " values = " << mean2 << endl;
double temp_sum1 = 0;
double temp_sum2 = 0;
double temp_sum3 = 0;
for(int i = 0; i < _s-k; i++)
{
temp_sum1 += (_ina[i]-mean1)*(_ina[i+k]-mean2);
// cout << "\t\t temp_sum1 (" << i << ") = " << temp_sum1 << endl;
}
// cout << "\t temp_sum1 = " << temp_sum1 << endl;
for(int i = 0; i < _s-k; i++)
{
temp_sum2 += (_ina[i]-mean2)*(_ina[i]-mean2); //pow2
// cout << "\t\t temp_sum2 (" << i << ") = " << temp_sum2 << endl;
}
// cout << "\t temp_sum2 = " << temp_sum2 << endl;
for(int i = 0; i < _s-k; i++)
{
temp_sum3 += (_ina[i+k]-mean2)*(_ina[i+k]-mean2); //pow2
// cout << "\t\t temp_sum3 (" << i << ") = " << temp_sum3 << endl;
}
// cout << "\t temp_sum3 = " << temp_sum3 << endl;
ACFoutput[k] = temp_sum1 / (sqrt(temp_sum2)*sqrt(temp_sum3));
}
for(int i=0; i<_s; i++)
cout << " lag = " << i << "\tACF(lag) = " << ACFoutput[i] << endl;
}
int main()
{
//fullscreenhez
HWND hWnd = GetConsoleWindow();
ShowWindow(hWnd,SW_SHOWMAXIMIZED);
double ina[15] = {2,3,4,5,4,3,4,5,6,7,6,5,4,3,4}; //15 elem
for(int x=0; x<15; x++)
cout << ina[x] << ",";
cout << endl;
cout << endl;
// https://youtu.be/ZjaBn93YPWo?t=417 => the most unefficient, but understandable method
doACFyoutube(ina, 15); // ??? result doesn't match any other
cout << endl;
// DOI: 10.1038/s41598-017-02750-9 => page 8, first equation (my goal is to reproduce this)
doACFgoal(ina, 15); // ??? result doesn't match any other
cout << endl;
// https://planetcalc.com/7908/ (simplified formula)
doACFplanetcalcCoarse(ina, 15); //result equals to the online calculator result: https://planetcalc.com/7884/?_d=.bTkjs.ymyQ8blXMoYiMgIOOmzhhI4fnckel.J5yEDWtV89Gz32Ch0kse2s_
cout << endl;
// https://planetcalc.com/7908/ (longer formula)
doACFplanetcalcFine(ina, 15); // ??? result doesn't match any other
return 0;
}
The output looks like this:
As I do not have the original data they used in the publication, I can only rely on how consistent the output of my program is related to other codes output. But these outputs are different, and I do not know why. Could you please have a look at the code and help me end up in four equal outputs?
(Codeblocks project zipped here:
https://drive.google.com/file/d/1s3SeJSiDgk-hiMazp94HfFerL582VG2K/view?usp=sharing)

A OS homework problem about executing threads using semaphores

I'm doing my os homework, and the requirement is to implement parallel merge sort using Pthread and using semaphore to lock and unlock them.
You can only look at the function names Multi____ and ignore Single_____, because I've already finished the single thread part.
I encountered a problem in the multithreaded part. I signal the master thread (sem[1]) in line 227, and it should go into the function 'void *MultiPartition'.
In this function, it gives value to arg[id * 2] and arg[id * 2 + 1].
For example, arg[1] will gives value to arg[2] and arg[3], and then it signals thread[2] and thread[3] to start by sem_post.
And doesn't seem to work.
So I use cout << "partition id = " << id << ", head = " << head << ", mid = " << mid << ", tail = " << tail << "\n"; in line 111 to check out what happens.
It looks really weird. It sometimes outputs
partition id = 1, head = 0, mid = 7, tail = 15
partition id = 2, head = 0, mid = 3, tail = 7
and it was stuck, but the program didn't exit. Means I need to press Ctrl^C to exit program.
Sometimes it outputs
partition id = 1, head = 0, mid = 7, tail = 15
partition id = 2, head = 0, mid = 3, tail = 7
partition id = 3, head = 8, mid = 11, tail = 15
partition id = 4, head = 0, mid = 1, tail = 3
and was stuck, too.
I'm curious where other threads going?
And if id = 4 is displayed, it will runs bubble id = 8 usually.
#include <iostream>
#include <pthread.h>
#include <semaphore.h>
#include <fstream>
#include <sys/time.h>
#include <unistd.h>
using namespace std;
//Pthread_create, pthread_exit *don't use pthread_join
//sem_init, sem_wait, sem_post, sem_getvalue, sem_destroy
//Enter input file name: test.txt
//MT sorting used x secs
//ST sorting used x secs
// g++ -o os_hw3.out os_hw3.cpp -pthread
typedef struct{
int head;
int mid;
int tail;
int id;
}arguments;
//declare global variables
sem_t sem[16]; // use id = 1 ~ 15
sem_t final; // the final semaphore signal that indicate all threads finished.
int* s1, * s2; // two array for single and multiple
arguments arg[16];
void swap(int* x, int* y) {
int temp;
temp = *x;
*x = *y;
*y = temp;
}
void *MultiMerge(void* argid) {
int id = *(int*)argid;
sem_wait(&sem[id]);
sem_wait(&sem[id]);
int head = arg[id].head, mid = arg[id].mid, tail = arg[id].tail;
//cout << "merge id = " << id << ", head = " << head << ", mid = " << mid << ", tail = " << tail << "\n";
int lenA = mid - head + 1;
int lenB = tail - (mid + 1) + 1;
int A[lenA];
int B[lenB];
for (int i = 0; i < lenA; i++) {
A[i] = *(s1 + head + i);
}
for (int j = 0; j < lenB; j++) {
B[j] = *(s1 + mid + 1 + j);
}
int i = 0, j = 0, k = 0;
while (i < lenA && j < lenB) {
if (A[i] < B[j]) {
*(s1 + head + k) = A[i];
i++;
}
else {
*(s1 + head + k) = B[j];
j++;
}
k++;
}
while (i < lenA) {
*(s1 + head + k) = A[i];
i++;
k++;
}
while (j < lenA) {
*(s1 + head + k) = B[j];
j++;
k++;
}
sem_post(&sem[id / 2]); // signal the upper level
if (id == 1) {
fstream fout;
fout.open("output1.txt", ios::out);
for (i = 0; i < arg[1].tail + 1; i++)
fout << *(s2 + i);
fout.close();
sem_post(&final);
}
}
void *MultiBubble(void *argid) {
int id = *(int*)argid;
sem_wait(&sem[id]);
//cout << "bubble id = " << id << ", head = " << arg[id].head << ", tail = " << arg[id].tail << "\n";
for (int i = arg[id].tail; i > 0; --i) {
for (int j = arg[id].head; j < i; ++j) {
if (*(s2 + j) > * (s2 + j + 1)) {
swap((s2 + j), (s2 + j + 1));
}
}
}
for (int i = arg[id].head; i <= arg[id].tail; i++) {
cout << *(s2 + i) << " ";
}
cout << "\n";
sem_post(&sem[id / 2]);
}
void *MultiPartition(void* argid) {
int id = *(int*)argid;
sem_wait(&sem[id]);
int head = arg[id].head, mid = arg[id].mid, tail = arg[id].tail;
cout << "partition id = " << id << ", head = " << head << ", mid = " << mid << ", tail = " << tail << "\n";
arg[id * 2].head = arg[id].head;
arg[id * 2].tail = arg[id].mid;
arg[id * 2].mid = (arg[id * 2].head + arg[id * 2].tail) / 2;
arg[id * 2 + 1].head = arg[id].mid + 1;
arg[id * 2 + 1].tail = arg[id].tail;
arg[id * 2 + 1].mid = (arg[id * 2 + 1].head + arg[id * 2 + 1].tail) / 2;
sem_post(&sem[id * 2]);
sem_post(&sem[id * 2 + 1]);
}
void SingleMerge(int* s1, int head, int mid, int tail) {
int lenA = mid - head + 1;
int lenB = tail - (mid + 1) + 1;
int A[lenA];
int B[lenB];
for (int i = 0; i < lenA; i++) {
A[i] = *(s1 + head + i);
}
for (int j = 0; j < lenB; j++) {
B[j] = *(s1 + mid + 1 + j);
}
int i = 0, j = 0, k = 0;
while (i < lenA && j < lenB) {
if (A[i] < B[j]) {
*(s1 + head + k) = A[i];
i++;
}
else {
*(s1 + head + k) = B[j];
j++;
}
k++;
}
while (i < lenA) {
*(s1 + head + k) = A[i];
i++;
k++;
}
while (j < lenA) {
*(s1 + head + k) = B[j];
j++;
k++;
}
}
int SingleBubble(int* s1, int head, int tail) {
for (int i = tail; i > 0; --i) {
for (int j = head; j < i; ++j) {
if (*(s1 + j) > *(s1 + j + 1)) {
swap((s1 + j), (s1 + j + 1));
}
}
}
}
void SinglePartition(int* s1, int head, int tail, int times) {
if (head <= tail) {
int mid = (head + tail) / 2;
if (times < 3) {
SinglePartition(s1, head, mid, ++times);
SinglePartition(s1, mid + 1, tail, ++times);
}
else {
SingleBubble(s1, head, tail);
}
SingleMerge(s1, head, mid, tail);
}
}
int main() {
char filename[100];
int num;
struct timeval Tstart, Tend;
cout << "Enter the input file name: ";
cin >> filename;
fstream file, fout;
file.open(filename, ios::in);
if (!file) {
cout << "Read File Error.\n";
return -1;
}
else {
file >> num;
s1 = new int[num];
s2 = new int[num];
for (int i = 0; i < num; i++) {
file >> *(s1 + i);
*(s2 + i) = *(s1 + i);
}
file.close();
}
//SINGLE THREAD
gettimeofday(&Tstart, 0);
SinglePartition(s1, 0, num - 1, 0);
gettimeofday(&Tend, 0);
fout.open("output2.txt", ios::out);
for (int i = 0; i < num; i++)
fout << *(s1 + i) << " ";
fout.close();
double Tdifference = (Tend.tv_sec - Tstart.tv_sec) + (Tend.tv_usec - Tstart.tv_usec) / 1000000.0;
cout << "Single thread cost " << Tdifference << " s\n";
//MULTI THREAD
gettimeofday(&Tstart, 0);
arg[1].head = 0;
arg[1].tail = num - 1;
arg[1].mid = (arg[1].head + arg[1].tail) / 2;
pthread_t thread[16];
sem_init(&final, 0, 0);
sem_post(&sem[1]);
for (int i = 1; i < 16; i++){
arg[i].id = i;
sem_init(&sem[i], 0, 0);
if (i < 8) {
if(i == 1)
sem_post(&sem[1]); // call the master thread
pthread_create(&thread[i], NULL, MultiPartition, &arg[i].id);
}
else
pthread_create(&thread[i], NULL, MultiBubble, &arg[i].id);
}
for (int i = 7; i > 0; i--) {
pthread_create(&thread[i], NULL, MultiMerge, &arg[i].id);
}
sem_wait(&final);
gettimeofday(&Tend, 0);
Tdifference = (Tend.tv_sec - Tstart.tv_sec) + (Tend.tv_usec - Tstart.tv_usec) / 1000000.0;
cout << "Multi thread cost " << Tdifference << " s\n";
delete s1, s2;
for (int i = 0; i < 16; i++)
sem_destroy(&sem[i]);
sem_destroy(&final);
return 0;
}
I fixed it myself.
for (int i = 7; i > 0; i--) {
pthread_create(&thread[i], NULL, MultiMerge, &arg[i].id);
}
The above part will confront the following part.
for (int i = 1; i < 16; i++){
arg[i].id = i;
sem_init(&sem[i], 0, 0);
if (i < 8) {
if(i == 1)
sem_post(&sem[1]); // call the master thread
pthread_create(&thread[i], NULL, MultiPartition, &arg[i].id);
}
MultiMerge and MultiPartition both have
sem_wait(&sem[id]);
So if sem[id]'s value != 0, it doesn't know which function to do, in my opinion.
I delete
for (int i = 7; i > 0; i--) {
pthread_create(&thread[i], NULL, MultiMerge, &arg[i].id);
}
and add
MultiMerge(&arg[id].id);
in the bottom of MultiPartition to call MultiMerge, this can fixed my problem.

Getting segmentaition fault with subset dp problem

Given a set of numbers, check whether it can be partitioned into two subsets such that the sum of elements in both subsets is same or not
I am getting segmentation fault in C++(g++ 5.4) with a this problem.
This is where i submitted my solution in C++
https://practice.geeksforgeeks.org/problems/subset-sum-problem/0
I am checking if the array can be divided into two parts with equal sum. So I am just checking if there exists a subset with sum equal to half the sum of the array
I have implemented the below logic with dynamic programming
Let dp[i][j] denote yes or no whether a subset with sum j is possible to form with elements in the range [0, i](both inclusive) where i is 0-based index. I have done nothing new with this traditional problem. But I am getting segmentation fault. The program is giving correct output for small test cases. What mistake have I made
I haven't used any comments because I have done nothing new. Hope it is understandable.
#include <iostream>
#include <bits/stdc++.h>
#include<cstdio>
#define ll long long int
using namespace std;
bool isVowel(char c){
return c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u';
}
bool isLower(char c){
return 97 <= c && c <= 122;
}
int main() {
ios_base::sync_with_stdio(false);
cin.tie(NULL);
cout.tie(NULL);
cout << setprecision(10);
ll t, n;
cin >> t;
while (t--) {
cin >> n;
ll a[n];
ll sum = 0;
for (ll i = 0; i < n; i++) {
cin >> a[i];
sum += a[i];
}
if (sum % 2) {
cout << "NO" << '\n';
continue;
}
sum /= 2;
ll dp[n][sum + 1];
for (ll i = 0; i < n; i++) {
for(ll j = 0; j < sum + 1; j++) {
dp[i][j] = 0;
}
}
for (ll i = 0; i < n; i++) {
dp[i][a[i]] = 1;
dp[i][0] = 1;
}
for (ll i = 1; i < n; i++) {
for (ll j = 1; j < sum + 1; j++){
if (j - a[i] > 0) {
dp[i][j] = dp[i - 1][j - a[i]];
}
dp[i][j] |= dp[i - 1][j];
}
}
cout << (dp[n - 1][sum] ? "YES" : "NO") << '\n';
}
}
The segmentation fault is due to
ll dp[n][sum + 1];
Even though the constraints say 1 <= N<= 100, 0 <= arr[i]<= 1000, the test cases used are probably much larger, so ll dp[n][sum + 1] will end up taking some serious stack memory, use
bool dp[n][sum + 1];
It should work fine.
On a side note, avoid using ll randomly, use them according to the constraints.

Having trouble with pointers in C++

I am trying to access an array from inside of a function, but I get the
"Error C2065 'i': undeclared identifier." I know that I am making a mistake with the pointer. I was able to pull information from the array in the function below the one I'm having issues with, so I'm not sure why I am unable to do the same thing here. Thank you for your time.
#include <iostream>
#include <cmath>
using namespace std;
double mean(int size, int* numbers);
double sDeviation(int numOfScores, int average, int* scores);
int histogram(int numOfScores, int* scores); //<<<This is what I'm having trouble with
int main()
{
int count = 0;
int scores[100];
while (true)
{
int scoreToBeEntered;
cout << "Please enter a score: ";
cin >> scoreToBeEntered;
if(scoreToBeEntered == NULL)
cout << "No value entered" << endl;
else if(scoreToBeEntered != -1)
scores[count++] = scoreToBeEntered;
else
break;
}
for(int i = 9; i >= 0; i--)
cout << i << "|" << endl;
cout << "SD: " << sDeviation(count, mean(count, scores), scores) << endl;
system("pause");
return 0;
}
int histogram(int numOfScores, int* scores)//this is where the issue starts
{
int* bins = new int[10];
for(int i = 0; i < numOfScores; i++);
if(scores[i] >= 90) //<<<<This is the undeclared "i"
{
bins[9]++;
}
}
double sDeviation(int numOfScores, int average, int* scores)
{
double deviation = 0;
for (int i = 0; i < numOfScores; i++)
deviation += pow(scores[i] - average, 2);
return sqrt(deviation / numOfScores);
}
double mean(int size, int* numbers)
{
double sum = 0;
for (int i = 0; i < size; i++)
sum += numbers[i];
return sum / size;
}

Run-Time Check Failure #2 - Stack around the variable 'TP' was corrupted

I'm trying to do a pascal triangle but for some reason i have the error "Run-Time Check Failure #2 - Stack around the variable 'TP' was corrupted." Can someone help me, please?
#include <iostream>
using namespace std;
void main()
{
int TP[100][100] = { 0 }, n;
do
{
cout << "Digite a ordem do triangulo de pascal: ";
cin >> n;
} while (n < 0 || n > 100);
for (int j = 0; j < n; j++)
{
TP[j][0] = 1;
TP[j][j] = 1;
}
for (int i = 2; i < n + 1; i++)
for (int j = 1; TP[i][j] != 1; j++)
TP[i][j] = TP[i - 1][j - 1] + TP[i - 1][j];
for (int i = 0; i < n; i++)
{
cout << endl;
for (int j = 0; j != i + 1; j++)
cout << TP[i][j] << " ";
}
cout << endl << endl;
system("pause");
}
You go beyond the array boundaries in for (int i = 2; i < n + 1; i++) for i == n when n=100.

Resources