How to increase speed of large for loops

How to increase speed of large for loops - multithreading

Right now I'm trying to run very large for loops for some task, roughly 8e+12 iterations. I tried using C++11 threading, but it does not seem to be as fast as required. I am using a system with 8 GB RAM, an i5 CPU and an Intel Graphics 4000 card. If I use OpenMP, would it be better, or do I have to use an NVIDIA GPU and CUDA for this task? My code is below:
#include <ros/ros.h>
// PCL specific includes
#include <sensor_msgs/PointCloud2.h>
#include <pcl_conversions/pcl_conversions.h>
#include <pcl/point_types.h>
#include <pcl/filters/voxel_grid.h>
#include <visualization_msgs/Marker.h>
#include <rosbag/bag.h>
#include <std_msgs/Int32.h>
#include <rosbag/view.h>
#include <boost/foreach.hpp>
#define foreach BOOST_FOREACH
#include <fstream>
#include <pcl/point_cloud.h>
#include <pcl/octree/octree_pointcloud_changedetector.h>
#include <pcl/io/pcd_io.h>
#include <iostream>
#include <vector>
#include <ctime>
#include <thread>
// Publisher for rviz markers; assigned in main() and used by cloud_cb().
ros::Publisher marker_publisher;
// Number of point-cloud messages received so far (incremented at the top of cloud_cb()).
int frame_index = 0;
using namespace std;
// Per-point result slots: thread_function() writes x[p0], cloud_cb() prints it after joining.
int x[200000];
// Worker executed on its own std::thread for one pivot index p0.
//
// cloudB : point cloud whose points are addressed through the index list v.
// v      : indices into cloudB->points (taken by value, so each worker owns a copy).
// p0     : position inside v of the pivot point handled by this worker.
//
// For every p1 whose scaled distance to the pivot exceeds 10, all p2 are visited and the
// scaled distances p0-p2 and p1-p2 are computed. The innermost branch is still a
// placeholder, so the only observable effects are x[p0] = 3 and the trace line on stdout.
// Both loops stop early once ros::ok() becomes false.
void thread_function(pcl::PointCloud<pcl::PointXYZRGB>::ConstPtr cloudB,vector<int> v,int p0) {
    // Euclidean distance between two points, multiplied by 1000 and truncated to int.
    // Coordinates are widened to double and squared by multiplication instead of going
    // through the general-purpose pow().
    const auto scaled_distance = [](const pcl::PointXYZRGB& a, const pcl::PointXYZRGB& b) -> int {
        const double dx = a.x - b.x;
        const double dy = a.y - b.y;
        const double dz = a.z - b.z;
        return static_cast<int>(sqrt(dx * dx + dy * dy + dz * dz) * 1000);
    };

    const size_t count = v.size();
    // The pivot is looked up once; guard the index so an empty or short list is never dereferenced.
    if (p0 >= 0 && static_cast<size_t>(p0) < count) {
        const pcl::PointXYZRGB& pivot = cloudB->points[v[p0]];
        for (size_t p1 = 0; p1 < count && ros::ok(); ++p1) {
            const pcl::PointXYZRGB& second = cloudB->points[v[p1]];
            if (scaled_distance(second, pivot) <= 10) {
                continue;  // too close to the pivot: skip the inner scan
            }
            for (size_t p2 = 0; p2 < count && ros::ok(); ++p2) {
                const pcl::PointXYZRGB& third = cloudB->points[v[p2]];
                const int p0p2 = scaled_distance(third, pivot);
                const int p1p2 = scaled_distance(third, second);
                if (p0p2 > 10 && p1p2 > 10) {
                    // Placeholder: nothing is done yet with a triple that passes the test.
                }
            }
        }
    }

    // x has a fixed size; only store the result when the slot exists.
    if (p0 >= 0 && static_cast<size_t>(p0) < sizeof(x) / sizeof(x[0])) {
        x[p0] = 3;
    }
    cout<<"ended thread="<<p0<<endl;
}
// Subscriber callback for incoming point clouds.
//
// Frame 10 is written to "test_pcd.pcd" as the reference cloud. Every later frame reloads
// that file, uses an octree change detector to find the points of the new cloud that fall
// into voxels absent from the reference, publishes them as marker[3], and then runs
// thread_function() once per changed point.
//
// Worker threads are started in batches of at most hardware_concurrency() and kept in a
// std::vector, replacing the former variable-length array of std::thread (a compiler
// extension) that created one OS thread per point all at once.
// NOTE(review): the reference file is re-read on every frame after the tenth; caching the
// loaded cloud would avoid that, but is left unchanged here.
void cloud_cb (const sensor_msgs::PointCloud2ConstPtr& input)
{
    frame_index++;
    pcl::PointCloud<pcl::PointXYZRGB>::Ptr cloudB (new pcl::PointCloud<pcl::PointXYZRGB> );
    pcl::fromROSMsg(*input,*cloudB);

    // Initializing Marker parameters which will be used in rviz
    vector<visualization_msgs::Marker> line_list, marker, text_view_facing;
    line_list.resize(4); marker.resize(4); text_view_facing.resize(4);
    for(size_t i=0;i<line_list.size();i++) {
        const int id = static_cast<int>(i);
        marker[i].header.frame_id = line_list[i].header.frame_id = text_view_facing[i].header.frame_id = "/X3/base_link";
        marker[i].header.stamp = line_list[i].header.stamp = text_view_facing[i].header.stamp =ros::Time();
        marker[i].ns = line_list[i].ns = text_view_facing[i].ns ="lines";
        marker[i].action = line_list[i].action = text_view_facing[i].action = visualization_msgs::Marker::ADD;
        marker[i].pose.orientation.w = line_list[i].pose.orientation.w = text_view_facing[i].pose.orientation.w = 1;
        // Ids are kept distinct across the three marker families: 0-3, 4-7, 8-11.
        marker[i].id = id+4;
        line_list[i].id = id;
        marker[i].type = visualization_msgs::Marker::POINTS;
        line_list[i].type = visualization_msgs::Marker::LINE_LIST;
        line_list[i].color.r = 1; line_list[i].color.g = 1; line_list[i].color. b = 1; line_list[i].color.a = 1;
        marker[i].scale.x = 0.003;
        marker[i].scale.y = 0.003;
        marker[i].scale.z = 0.003;
        text_view_facing[i].id = id+8;
        text_view_facing[i].type = visualization_msgs::Marker::TEXT_VIEW_FACING;
        text_view_facing[i].color.b = 1; text_view_facing[i].color.a = 1.0; text_view_facing[i].color.g = 1.0; text_view_facing[i].color.r = 1.0;
        text_view_facing[i].scale.z = 0.015;
    }
    // marker[3] carries the changed points and is drawn larger than the others.
    marker[3].scale.x = 0.05;
    marker[3].scale.y = 0.05;
    marker[3].scale.z = 0.05;

    if(frame_index==10) // Saving the point cloud for only one time to find moved object in it
    {
        pcl::io::savePCDFileASCII ("test_pcd.pcd", *cloudB);
    }
    if(frame_index>10) // Reading above point cloud file after saving for once to compare it with newly arriving point clouds
    {
        pcl::PointCloud<pcl::PointXYZRGB>::Ptr cloud (new pcl::PointCloud<pcl::PointXYZRGB>);
        if (pcl::io::loadPCDFile<pcl::PointXYZRGB> ("test_pcd.pcd", *cloud) == -1) //* load the file
        {
            PCL_ERROR ("Couldn't read file test_pcd.pcd \n");
        }
        else {
            srand ((unsigned int) time (NULL));
            // Octree resolution - side length of octree voxels
            double resolution = 0.1;
            // Instantiate octree-based point cloud change detection class
            pcl::octree::OctreePointCloudChangeDetector<pcl::PointXYZRGB> octree (resolution);
            // Add points from the reference cloud to octree
            octree.setInputCloud (cloud);
            octree.addPointsFromInputCloud ();
            // Switch octree buffers: This resets octree but keeps previous tree structure in memory.
            octree.switchBuffers ();
            // Add points from cloudB to octree
            octree.setInputCloud (cloudB);
            octree.addPointsFromInputCloud ();
            std::vector<int> newPointIdxVector;
            // Get vector of point indices from octree voxels which did not exist in previous buffer
            octree.getPointIndicesFromNewVoxels (newPointIdxVector);

            // Copy every changed point with finite coordinates into marker[3].
            geometry_msgs::Point p; std_msgs::ColorRGBA c;
            for (size_t i = 0; i < newPointIdxVector.size (); ++i)
            {
                const pcl::PointXYZRGB& pt = cloudB->points[newPointIdxVector[i]];
                p.x = pt.x;
                p.y = pt.y;
                p.z = pt.z;
                c.r = pt.r/255.0;
                c.g = pt.g/255.0;
                c.b = pt.b/255.0;
                c.a = 1;
                if(!isnan(p.x) && !isnan(p.y) && !isnan(p.z)) {
                    marker[3].points.push_back(p);
                    marker[3].colors.push_back(c);
                }
            }
            marker_publisher.publish(marker[3]);

            // Run thread_function for every changed point, a bounded batch at a time.
            const size_t point_count = newPointIdxVector.size();
            const size_t result_slots = sizeof(x) / sizeof(x[0]);
            size_t batch_size = std::thread::hardware_concurrency();
            if (batch_size == 0) {
                batch_size = 1;  // hardware_concurrency() may report 0 when the count is unknown
            }
            std::vector<std::thread> workers;
            workers.reserve(batch_size);
            for (size_t batch_begin = 0; batch_begin < point_count; batch_begin += batch_size) {
                size_t batch_end = batch_begin + batch_size;
                if (batch_end > point_count) {
                    batch_end = point_count;
                }
                for (size_t p0 = batch_begin; p0 < batch_end; ++p0) { // For each voxel in moved object
                    workers.push_back(std::thread(thread_function, cloudB, newPointIdxVector, static_cast<int>(p0)));
                }
                for (size_t k = 0; k < workers.size(); ++k) {
                    workers[k].join();
                    const size_t p0 = batch_begin + k;
                    // x is a fixed-size array; report -1 when this point has no slot in it.
                    const int result = p0 < result_slots ? x[p0] : -1;
                    cout<<"joined"<<"\t"<<p0<<"\t"<<result<<endl;
                }
                workers.clear();
            }
        }
    }
}
int main (int argc, char** argv)
{
ros::init (argc, argv, "training");
ros::NodeHandle nh;
ros::Subscriber sub = nh.subscribe<sensor_msgs::PointCloud2> ("input", 1, cloud_cb);
marker_publisher = nh.advertise<visualization_msgs::Marker> ("visualization_marker",1);
// Spin
ros::spin ();
}
This task is really important for my algorithm to complete. I need a suggestion on how to make these loops run very fast.
In the above code, thread_function is the main function where I'm currently putting the for loops. Is there any way to increase its performance in the above code?

OpenMP is the easiest to implement and try. Just add a couple of lines at your CMakeLists.txt, an include and the famous #pragma omp parallel for line just before your for loop.

Threading itself is not necessarily a guarantee for speed. If your process is mostly linear, there is nothing to be done in parallel. In your case, it looks like you have a loop and each iteration might be able to be done independently in parallel, but because each loop is so small and mostly simple mathematical operations, the overhead for making each item its own thread might not save you much (if any) time. The algorithm itself might need an overhaul (i.e. doing this an entirely different way), but threading could potentially solve your issue if your loop is huge and you can break it into, say, 4 chunks and parallel process the 4 chunks (i.e. one thread does items 0-100, another 101-200, etc). Just be aware that one process might finish before another and if some other process is relying on the completion of the whole set of data, then you'll need to ensure that you're done with all 4 threads before continuing. And if you do any kind of manipulation of the data (i.e. shifting elements, adding, removing) in the parallel processes, then you could end up screwing up a parallel thread. Hope that helps!

Related

Missing closing brace in SFML program?

I made a program using C++ and SFML. The program is supposed to generate 20 circles that are either red or blue, and it did work. Yet, I made a few changes, saved, and came back to it on VS a few hours later to find that I keep getting an error:
'{': No matching token found (Line 9)
I keep scanning through the code and I can't seem to find the issue at all.
Code:
#include <SFML/Graphics.hpp>
#include <iostream>
#include <chrono>
#include <random>
using namespace std;
// Generates 20 small circles at distinct random positions, each randomly blue or red,
// and shows them until the window is closed.
//
// Fixes relative to the posted version:
//  * the closing brace of the position-search `while` was missing, which left main()
//    unterminated ("'{': No matching token found");
//  * the uniqueness counter was reset inside the inner loop, so it could never reach 20
//    and the search would not terminate; it is replaced by a flag checked after the scan;
//  * the scene is redrawn every frame instead of being drawn once before the event loop.
int main()
{ //Line 9
    const unsigned seed = static_cast<unsigned>(chrono::system_clock::now().time_since_epoch().count());
    default_random_engine generator(seed);
    uniform_int_distribution<int> distribution1(0, 1024);  // coordinate range
    uniform_int_distribution<int> distribution2(1, 2);     // 1 = blue, 2 = red
    sf::RenderWindow window(sf::VideoMode(1024, 1024), "Spooky Circle Box");
    std::vector<sf::CircleShape> circles(20);

    // 1. Populate the circles: pick positions until one differs from every other circle.
    for (unsigned int i = 0; i < circles.size(); i++) {
        bool unique = false;
        while (!unique) {
            const float px = static_cast<float>(distribution1(generator));
            const float py = static_cast<float>(distribution1(generator));
            circles[i].setPosition(px, py);
            unique = true;
            for (unsigned int j = 0; j < circles.size(); j++) {
                if (i != j
                    && circles[i].getPosition().x == circles[j].getPosition().x
                    && circles[i].getPosition().y == circles[j].getPosition().y) {
                    unique = false;  // collides with circle j: draw a new position
                    break;
                }
            }
        }
        circles[i].setRadius(5.f);
        if (distribution2(generator) == 1) {
            circles[i].setFillColor(sf::Color::Blue);
        } else {
            circles[i].setFillColor(sf::Color::Red);
        }
    }

    // 2. Event/render loop: handle events, then clear, draw and display each frame.
    while (window.isOpen()) {
        sf::Event event;
        while (window.pollEvent(event))
        {
            if (event.type == sf::Event::Closed)
                window.close();
        }
        window.clear();
        for (unsigned int i = 0; i < circles.size(); i++) {
            window.draw(circles[i]);
        }
        window.display();
        sf::sleep((sf::milliseconds(100)));
    }
    return 0;
}

I'll expand on my comments, but I'm not going to post any repaired code. I only suggest a way of doing things.
From the structure of your code, it seems you're trying to generate some blue or red circles randomly distributed over the window, but, at the same time, you're trying to draw them.
You should separate your actual data from your drawing code. My suggested pseudo-code would be:
int main(){
// 1 . Declare your circle vector
// 2 . Populate that vector with random circles (random position, random color)
// Now draw those circles
// 3 . while(window.isOpen()) loop
// 3.1 Clear the window
// 3.2 Draw your circles
// 3.3 Display the stuff
}
Point 3 is basically the way to draw things according to the SFML tutorials.

Qt C++ Displaying images outside the GUI thread (Boost thread)

I am developing a C++ library and implementing its interface with Qt, using VS2015. On the library side, 3 boost threads continuously load images from 3 folders. I am trying to display these images in 3 different QLabels (or equivalent QWidgets), so the thread body consists of this functionality,
in particular by using the setPixmap method. Although the call to the function is protected by a boost mutex, I get exceptions, probably due to thread synchronization. Looking for a solution, I learned that QPixmap is not "thread-safe" (non-reentrant). I also tried to use QGraphicsView, but it in turn relies on QPixmap, so I ran into the same problem.
So my question is: does an alternative to QPixmap exist to display images in Qt in a thread-safe
manner?

I would recommend not doing multi-threading in GUI programming. Although Qt provides multi-threading support in general, IMHO, the widgets are not well-prepared for this.
Thus, to achieve image loaders which run concurrently in separate threads I would suggest the following concept:
Each threaded image loader feeds a private buffer. The GUI inspects from time to time (using QTimer) these buffers and updates its QPixmap. As access to buffers should be possible from the resp. image loader thread as well as the GUI thread they have to be mutex guarded, of course.
My sample code testLoadImageMT.cc:
#include <atomic>
#include <chrono>
#include <mutex>
#include <thread>
#include <QtWidgets>
// manually added types (normally provided by glib)
typedef unsigned guint;
typedef unsigned char guint8;
// the fluffy-cat image sample
// Header of an in-memory image: dimensions and bytes per pixel, followed by the pixel
// bytes. pixel_data is declared with a single element here; the code indexes past it to
// reach the remaining bytes of the object defined in the other translation unit.
struct Image {
guint width;
guint height;
guint bytes_per_pixel; /* 3:RGB, 4:RGBA */
guint8 pixel_data[1];
};
// Defined in a separate source file (fluffyCat.cc) and linked in.
extern "C" const Image fluffyCat;
// Background "loader" that repeatedly copies an in-memory Image into a QImage on its own
// thread, while the GUI thread periodically snapshots that QImage into a QLabel.
//
// _qImg is the only state touched by both threads and every access to it is guarded by
// _lock. _exit is the cooperative stop flag set by the destructor.
class ImageLoader {
  private:
    const Image &_img;        // source pixels (not owned)
    std::atomic<bool> _exit;  // set to true to ask the worker to stop
    std::mutex _lock;         // guards _qImg
    QImage _qImg;             // destination image shared with the GUI thread
    std::thread _thread;      // worker running loadImage(); declared last so it starts last

  public: // main thread API
    // Allocates the destination image and starts the worker thread.
    // Initializers are listed in member declaration order, which is the order in which
    // they actually run.
    ImageLoader(const Image &img = fluffyCat):
      _img(img),
      _exit(false),
      _qImg(img.width, img.height, QImage::Format_RGB888),
      _thread(&ImageLoader::loadImage, this)
    { }

    // Requests the worker to stop and waits for it.
    ~ImageLoader()
    {
      _exit = true;
      if (_thread.joinable()) _thread.join();
    }

    ImageLoader(const ImageLoader&) = delete;
    ImageLoader& operator=(const ImageLoader&) = delete;

    // Copies the current state of the image into the given label (call from the GUI thread).
    void applyImage(QLabel &qLblImg)
    {
      std::lock_guard<std::mutex> lock(_lock);
      qLblImg.setPixmap(QPixmap::fromImage(_qImg));
    }

  private: // thread private
    // Worker body: clears the image, then fills it pixel by pixel, pausing after each row,
    // and starts over once the last row is done. Returns as soon as _exit is observed.
    void loadImage()
    {
      for (;;) {
        // Checked here as well so the worker also stops when the image has no pixels.
        if (_exit) return;
        { std::lock_guard<std::mutex> lock(_lock);
          _qImg.fill(0);
        }
        size_t i = 0;
        const int rows = static_cast<int>(_img.height);
        const int cols = static_cast<int>(_img.width);
        for (int y = 0; y < rows; ++y) {
          for (int x = 0; x < cols; ++x) {
            // Pack R,G,B bytes into 0xAARRGGBB; the alpha constant is unsigned so the
            // shift into the top byte does not overflow a signed int.
            const quint32 value
              = _img.pixel_data[i + 2]
              | (_img.pixel_data[i + 1] << 8)
              | (_img.pixel_data[i + 0] << 16)
              | (0xffu << 24);
            i += _img.bytes_per_pixel;
            { std::lock_guard<std::mutex> lock(_lock);
              _qImg.setPixel(x, y, value);
            }
            if (_exit) return; // important: make thread co-operative
          }
          std::this_thread::sleep_for(std::chrono::milliseconds(100)); // slow down CPU cooler
        }
      }
    }
};
int main(int argc, char **argv)
{
// settings:
enum { N = 3 }; // number of images loaded/displayed
enum { Interval = 50 }; // update rate for GUI 50 ms -> 20 Hz (round about)
// build appl.
qDebug() << "Qt Version: " << QT_VERSION_STR;
QApplication app(argc, argv);
// build GUI
QWidget qMainWin;
QVBoxLayout qVBox;
QLabel *pQLblImgs[N];
for (int i = 0; i < N; ++i) {
qVBox.addWidget(
new QLabel(QString::fromUtf8("Image %1").arg(i + 1)));
qVBox.addWidget(
pQLblImgs[i] = new QLabel());
}
qMainWin.setLayout(&qVBox);
qMainWin.show();
// build image loaders
ImageLoader imgLoader[N];
// install timer
QTimer qTimer;
qTimer.setInterval(Interval); // ms
QObject::connect(&qTimer, &QTimer::timeout,
[&imgLoader, &pQLblImgs]() {
for (int i = 0; i < N; ++i) {
imgLoader[i].applyImage(*pQLblImgs[i]);
}
});
qTimer.start();
// exec. application
return app.exec();
}
Sorry, I used std::thread instead of boost::thread as I've no experience with the latter, nor a working installation. I believe (hope) the differences will be marginal. QThread would have been the "Qt native" alternative but again – no experiences.
To keep things simple, I just copied data out of a linked binary image (instead of loading one from file or from anywhere else). Hence, a second file has to be compiled and linked to make this an MCVE – fluffyCat.cc:
/* GIMP RGB C-Source image dump (fluffyCat.cc) */
// Defines the image object that the loader program declares as `extern "C" const Image
// fluffyCat`. The field order (width, height, bytes_per_pixel, pixel_data) matches that
// declaration; here pixel_data is sized for a 16x16 image with 3 bytes per pixel plus one
// extra byte for the string literal's terminating NUL.
// manually added types (normally provided by glib)
typedef unsigned guint;
typedef unsigned char guint8;
extern "C" const struct {
guint width;
guint height;
guint bytes_per_pixel; /* 3:RGB, 4:RGBA */
guint8 pixel_data[16 * 16 * 3 + 1];
} fluffyCat = {
16, 16, 3,
"x\211s\215\232\200gw`fx`at[cx^cw^fu\\itZerWn|ap~cv\204jnzedq^fr^kzfhv^Ra"
"GRbMWdR\\jXer^qw_\311\256\226\271\253\235\275\264\252\315\277\260\304\255"
"\231u~i\213\225\207l{fly`jx\\^nRlz_z\206nlx`t~i\221\211s\372\276\243\375"
"\336\275\376\352\340\356\312\301\235\216\212judgwcl~f\212\226u}\206h\212"
"\224q\231\237z\232\236{\216\225v\225\230\200\306\274\244\376\360\327\376"
"\361\331\376\360\341\326\275\272\253\240\244{\203p\202\220xp~e{\204^\222"
"\230n\212\217g\240\242{\234\236z\214\222r\270\271\247\360\353\340\376\370"
"\336\376\363\334\375\357\336\310\254\262\232\223\234\\gRfrX\204\220z\212"
"\225g\225\232j\254\255\177\252\250{\225\226u\304\302\265\374\365\351\376"
"\375\366\376\367\341\376\361\320\374\346\324\306\241\242\237\232\235n{fj"
"xckyfu~fUX#VZCfnT\231\231\207\374\374\371\377\372\354\376\376\374\376\376"
"\372\376\362\332\375\340\301\341\300\264\260\253\262jvdbq\\XkVJTDNTCCG8O"
"TE\322\321\313\377\377\375\376\376\373\376\377\376\376\376\375\376\374\362"
"\376\360\342\344\311\306\250\244\254R_PL^HXkT<#2OP#`dP\217\220\177\374\374"
"\370\377\377\374\376\375\371\377\377\376\376\374\360\377\367\336\376\350"
"\316\342\303\274\246\236\245jtbXdQTdNQYGU\\KchV\317\315\302\377\376\372\377"
"\376\367\376\373\360\377\376\367\376\366\337\376\355\312\374\331\271\323"
"\263\251\216\214\214\\hTP^HL\\FR[LMXI^dW\355\352\342\376\375\366\377\374"
"\360\376\374\361\376\374\361\376\356\321\374\331\264\374\330\266\330\270"
"\260\200||Y`SLVE>K9BJ<CN?VYP\347\330\322\376\366\345\376\363\330\376\367"
"\337\377\372\350\374\342\314\326\243\210\375\350\314\352\317\304shc^`TV`"
"RVbT>B4IS?PTD\244\232\216\374\355\320\376\354\311\376\351\306\376\362\332"
"\374\344\321\267\206u\375\362\337\326\274\272\\POMNBT]LNZH:<*<A*TV>OI;\242"
"\222\207\340\304\243\375\335\262\372\336\272\376\361\334\320\241\212\374"
"\352\322\266\233\237c\\WFH;MR>\\`F~xP\220\214[pqE\211\202\\g]=\230\214`\313"
"\266\207\344\303\240\362\336\274\323\257\201\333\304\240\305\252\204\254"
"\232p\216\206\\\206\203U\232\224b\234\244b\246\257m\220\232`\224\227h~\202"
"W\206\213]\204\210W\227\227i|\177RvzNlsGrtJwtLz}N{\204RlxF",
};
I compiled and tested in VS2013, with Qt 5.9.2 on Windows 10 (64 bit). This is how it looks:

I solved using signal/slot: the "non-GUI" thread emits a signal instead of displaying the images and the called slot paints the QLabel inside the GUI thread!

sndio sio_onmove not calling back.

I'm trying to write a fullduplex test that copies audio in to audio out. sio_onmove does not get called. I have no idea why. Here's my code so far:
#include <stdio.h>
#include <stdlib.h>
#include <sndio.h>
/* Scratch buffer through which recorded bytes are passed on to playback. */
unsigned char buf[0xffff];
/* Device handle; assigned in main() and also used by the cb() callback. */
struct sio_hdl *hdl;
/*
 * sio_onmove() callback: logs the reported delta, then copies whatever can currently be
 * read from the device straight back to it.
 *
 * arg   - user pointer passed to sio_onmove(); unused.
 * delta - value reported by the library; it is used only as an upper bound for one read.
 *
 * The read size is clamped to the size of buf, so a large delta can no longer overrun the
 * buffer. The loop ends when sio_read() returns 0, which it does on a non-blocking handle
 * once no more data is available.
 */
void cb(void *arg, int delta) {
    size_t want, got;

    (void)arg;
    printf("call %d\n", delta);
    if (hdl == NULL || delta <= 0)
        return;
    want = (size_t)delta;
    if (want > sizeof(buf))
        want = sizeof(buf);
    for (;;) {
        got = sio_read(hdl, buf, want);
        if (got == 0)
            break;
        sio_write(hdl, buf, got);
    }
}
int main(void) {
int m, i;
struct sio_par par;
struct sio_cap cap;
hdl = sio_open("rsnd/0", SIO_PLAY | SIO_REC , 1);
sio_getcap(hdl, &cap);
sio_initpar( &par);
par.bits = cap.enc[0].bits;
par.bps = cap.enc[0].bps;
par.sig = cap.enc[0].sig;
par.le = cap.enc[0].le;
par.msb = cap.enc[0].msb;
par.rchan=cap.rchan[0];
par.pchan=cap.pchan[0];
par.rate =cap.rate[0];
par.appbufsz = 1024;
sio_setpar(hdl, &par);
sio_onmove(hdl, cb, NULL);
sio_start(hdl);
for(;;)
sleep(1);
}
I'm initializing rsnd/0 for recording and play back. The parameters I'm initializing from a getcap call. I'm then setting cb as the callback for onmove. I then start audio. From there I loop forever doing nothing

The sio_onmove() call-back is called either from sio_revents() if non-blocking i/o is used or from blocking sio_read() or sio_write().
As above program calls sleep(1) instead, the call-back is never called.
AFAIU, to do the full-duplex test, you could use blocking i/o (set to 0 last argument of the sio_open() function) and do the following steps:
call sio_initpar() to initialize a sio_par structure, as you do
set your preferred parameters in the sio_par structure
call sio_setpar() to submit them to the device. devices exposed through the server (ex. "snd/0") will accept any parameters, while raw devices (ex. "rsnd/0") pick something close to whatever the hardware supports.
call sio_getpar() to get the parameters the device accepted, this is needed to get the device buffer size
possibly check if they are usable by your program
call sio_start()
prime the play buffer by writing par.bufsz samples with sio_write(). This corresponds to: par.bufsz * par.pchan * par.bps bytes.
At this stage, device starts and you could do the main-loop as with the following pseudo-code:
/* Fragment: relies on hdl, par and buf being set up by the steps listed before it. */
unsigned char *data;
size_t n, todo, blksz;
/* Size in bytes of one block of recorded audio. */
blksz = par.round * par.rchan * par.bps;
for (;;) {
/* read one block */
data = buf;
todo = blksz;
/* sio_read() may return fewer bytes than requested, so keep reading until the block is full. */
while (todo > 0) {
n = sio_read(hdl, data, todo);
if (n == 0)
errx(1, "failed");
todo -= n;
data += n;
}
/* write one block */
n = sio_write(hdl, buf, blksz);
if (n != blksz)
errx(1, "failed");
}
The sio_onmove() call-back is not needed for pure audio programs. It's only useful to synchronize non-audio events (ex video, midi messages) to the audio stream.

The time consume is not normal in multi-thread in Windows

The time consumed by multi-threaded code on Windows is not consistent. Our device has 5 nozzles; the process is:
The nozzles pick chips up at the same time, so I use the 5 threads do it
Move the nozzles to another place
Put the chips
It normally runs smoothly, but sometimes there is a short, clearly visible pause before moving to another place. Picking chips normally takes about 80 milliseconds, but sometimes it takes 130 milliseconds. I wrote some simple code to test it:
#include "stdafx.h"
#include <WINDOWS.H>
#include <PROCESS.H>
#include <iostream>
#include <Mmsystem.h>
#pragma comment(lib, "winmm.lib")
using namespace std;
// Timer capabilities filled in by timeGetDevCaps() in _tmain(); wPeriodMin is used for
// timeBeginPeriod()/timeEndPeriod().
static TIMECAPS l_timecaps;
// Thread entry point that calls MainProcess() repeatedly.
UINT WINAPI MainThread(LPVOID lParam /* = NULL */);
// Thread entry point of the test workers; returns immediately.
UINT WINAPI TestThread(LPVOID lParam /* = NULL */);
// Starts the test workers, waits for them and prints the elapsed time.
void MainProcess();
int _tmain(int argc, _TCHAR* argv[])
{
//set current process priority as real time
SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS);
//use more accurate time
timeGetDevCaps(&l_timecaps, sizeof(l_timecaps));
timeBeginPeriod(l_timecaps.wPeriodMin);
UINT uiThreadId = 0;
HANDLE hEvents = (HANDLE) _beginthreadex(NULL, 0, MainThread, NULL, 0, &uiThreadId);
SetThreadPriority(hEvents, THREAD_PRIORITY_TIME_CRITICAL);
WaitForSingleObject(hEvents, INFINITE);
cerr << endl << "Press Enter to exit." << endl;
while (cin.get() != '\n');
timeEndPeriod(l_timecaps.wPeriodMin);
return 0;
}
// Thread entry point: runs the measurement routine MainProcess() one hundred times in a
// row and then finishes with exit code 0. The thread argument is not used.
UINT WINAPI MainThread(LPVOID lParam /* = NULL */)
{
    const int kRuns = 100;
    for (int run = 0; run < kRuns; ++run)
    {
        MainProcess();
    }
    return 0;
}
void MainProcess()
{
const int THREAD_NUMBER = 5;
static HANDLE hEvents[THREAD_NUMBER];
for (int i = 0; i < THREAD_NUMBER; ++i)
hEvents[i] = NULL;
//log time with more accurate time
LARGE_INTEGER liPerfFreq={0};
LARGE_INTEGER liBeginRunTime = {0};
long lBeginRunTime = 0;
QueryPerformanceFrequency(&liPerfFreq);
QueryPerformanceCounter(&liBeginRunTime);
lBeginRunTime = liBeginRunTime.QuadPart * 1000 / liPerfFreq.QuadPart;
for (int i = 0; i < THREAD_NUMBER; ++i)
{
UINT uiThreadId = 0;
hEvents[i] = (HANDLE) _beginthreadex(NULL, 0, TestThread, NULL, 0, &uiThreadId);
SetThreadPriority(hEvents[i], THREAD_PRIORITY_TIME_CRITICAL);
//assign to cpu
SetThreadAffinityMask(hEvents[i], 0x00000001 + i);
}
//wait all threads finished
WaitForMultipleObjects(THREAD_NUMBER, hEvents, TRUE, INFINITE);
LARGE_INTEGER liEndRunTime = {0};
long lEndRunTime = 0;
QueryPerformanceCounter(&liEndRunTime);
lEndRunTime = liEndRunTime.QuadPart * 1000 / liPerfFreq.QuadPart;
cout << "time: " << lEndRunTime - lBeginRunTime << endl;
}
// Worker entry point used for the timing test: performs no work at all, so the measured
// time covers only thread creation, scheduling and shutdown. Always returns 0.
UINT WINAPI TestThread(LPVOID lParam /* = NULL */)
{
    (void)lParam;  // intentionally unused
    return 0;
}
The reported time is usually 2, 3 or 4 milliseconds, but sometimes it jumps to 57 or 62 milliseconds. This is bad for our device when it is running, because it makes the device slow.

Your test threads do nothing. All the time is spent creating and shutting down the thread. Overheads in the kernel object manager and scheduler will dominate. Perhaps some of the threads are having to wait on other threads holding (via API calls) kernel locks and thus seeing delays.
And of course those inner threads could be completing before the call to set their priority completes: to set this reliably you need to create the thread suspended, set its priority, and then resume it.
Because you are measuring nothing, all you have are overheads which will depend on what else is going on.
Also remember, while you have names like THREAD_PRIORITY_TIME_CRITICAL Windows is not a real-time OS.

Does OpenAL-Soft have an upper limit on the number of sources?

I'm using OpenAL-Soft for a project, and right now I'm trying to decide whether I need to implement OpenAL source pooling.
Source pooling is somewhat cumbersome (I need to write code to "allocate" sources, as well as somehow decide when they can be "freed"), but necessary if the number of sources that can be generated by OpenAL is limited.
Since OpenAL-Soft is a software implementation of the OpenAL API, I wonder if the number of sources it can generate is actually limited by the underlying hardware. Theoretically, since all mixing is done in software, there might be no need to actually use one hardware channel per source.
However, I'm not sure about it. How should I proceed?

It appears that OpenAL-Soft indeed does have an upper limit on the number of sources, which can be defined in a config file. The default seems to be 256. It makes sense to limit the number of sources because of the associated CPU and memory costs. Looks like I'll end up implementing a source pool after all.

I just took a peek at its header ... did not see anything pop out.
Here is working code which synthesizes then renders audio buffer data ... you could play with seeing if it accommodates your necessary number of sources
// gcc -o openal_play_wed openal_play_wed.c -lopenal -lm
#include <stdio.h>
#include <stdlib.h> // gives malloc
#include <math.h>
#ifdef __APPLE__
#include <OpenAL/al.h>
#include <OpenAL/alc.h>
#elif __linux
#include <AL/al.h>
#include <AL/alc.h>
#endif
/* Output device opened in MM_init_al() and closed in MM_exit_al(). */
ALCdevice * openal_output_device;
/* Context created on that device in MM_init_al() and destroyed in MM_exit_al(). */
ALCcontext * openal_output_context;
/* Buffer name generated in MM_init_al(); filled in MM_render_one_buffer(). */
ALuint internal_buffer;
/* Single source name generated and played in MM_render_one_buffer(). */
ALuint streaming_source[1];
int al_check_error(const char * given_label) {
ALenum al_error;
al_error = alGetError();
if(AL_NO_ERROR != al_error) {
printf("ERROR - %s (%s)\n", alGetString(al_error), given_label);
return al_error;
}
return 0;
}
/*
 * Opens the default output device, creates a context on it, makes that context current
 * and generates the buffer name stored in internal_buffer.
 *
 * Device and context creation are now checked: on failure a message is printed, anything
 * already opened is released and the program exits, instead of continuing with NULL
 * handles.
 */
void MM_init_al() {
    const char * defname = alcGetString(NULL, ALC_DEFAULT_DEVICE_SPECIFIER);
    openal_output_device = alcOpenDevice(defname);
    if (openal_output_device == NULL) {
        fprintf(stderr, "ERROR - could not open OpenAL output device\n");
        exit(EXIT_FAILURE);
    }
    openal_output_context = alcCreateContext(openal_output_device, NULL);
    if (openal_output_context == NULL) {
        fprintf(stderr, "ERROR - could not create OpenAL context\n");
        alcCloseDevice(openal_output_device);
        exit(EXIT_FAILURE);
    }
    if (!alcMakeContextCurrent(openal_output_context)) {
        fprintf(stderr, "ERROR - could not make OpenAL context current\n");
        alcDestroyContext(openal_output_context);
        alcCloseDevice(openal_output_device);
        exit(EXIT_FAILURE);
    }
    // setup buffer and source
    alGenBuffers(1, & internal_buffer);
    al_check_error("failed call to alGenBuffers");
}
void MM_exit_al() {
ALenum errorCode = 0;
// Stop the sources
alSourceStopv(1, & streaming_source[0]); // streaming_source
int ii;
for (ii = 0; ii < 1; ++ii) {
alSourcei(streaming_source[ii], AL_BUFFER, 0);
}
// Clean-up
alDeleteSources(1, &streaming_source[0]);
alDeleteBuffers(16, &streaming_source[0]);
errorCode = alGetError();
alcMakeContextCurrent(NULL);
errorCode = alGetError();
alcDestroyContext(openal_output_context);
alcCloseDevice(openal_output_device);
}
void MM_render_one_buffer() {
/* Fill buffer with Sine-Wave */
// float freq = 440.f;
float freq = 100.f;
float incr_freq = 0.1f;
int seconds = 4;
// unsigned sample_rate = 22050;
unsigned sample_rate = 44100;
double my_pi = 3.14159;
size_t buf_size = seconds * sample_rate;
short * samples = malloc(sizeof(short) * buf_size);
printf("\nhere is freq %f\n", freq);
int i=0;
for(; i<buf_size; ++i) {
samples[i] = 32760 * sin( (2.f * my_pi * freq)/sample_rate * i );
freq += incr_freq;
// incr_freq += incr_freq;
// freq *= factor_freq;
if (100.0 > freq || freq > 5000.0) {
incr_freq *= -1.0f;
}
}
/* upload buffer to OpenAL */
alBufferData( internal_buffer, AL_FORMAT_MONO16, samples, buf_size, sample_rate);
al_check_error("populating alBufferData");
free(samples);
/* Set-up sound source and play buffer */
// ALuint src = 0;
// alGenSources(1, &src);
// alSourcei(src, AL_BUFFER, internal_buffer);
alGenSources(1, & streaming_source[0]);
alSourcei(streaming_source[0], AL_BUFFER, internal_buffer);
// alSourcePlay(src);
alSourcePlay(streaming_source[0]);
// ---------------------
ALenum current_playing_state;
alGetSourcei(streaming_source[0], AL_SOURCE_STATE, & current_playing_state);
al_check_error("alGetSourcei AL_SOURCE_STATE");
while (AL_PLAYING == current_playing_state) {
printf("still playing ... so sleep\n");
sleep(1); // should use a thread sleep NOT sleep() for a more responsive finish
alGetSourcei(streaming_source[0], AL_SOURCE_STATE, & current_playing_state);
al_check_error("alGetSourcei AL_SOURCE_STATE");
}
printf("end of playing\n");
/* Dealloc OpenAL */
MM_exit_al();
} // MM_render_one_buffer
/* Entry point: set up OpenAL, then synthesize and play one buffer (which also tears OpenAL down). */
int main() {
    MM_init_al();            /* device, context and buffer name */
    MM_render_one_buffer();  /* fill, play, wait, clean up */
    return 0;
}

Develop Reference

node.js excel linux python-3.x azure haskell apache-spark rust .htaccess string

How to increase speed of large for loops - multithreading

OpenMP is the easiest to implement and try. Just add a couple of lines at your CMakeLists.txt, an include and the famous #pragma omp parallel for line just before your for loop.

Related

Missing closing brace in SFML program?

Qt C++ Displaying images outside the GUI thread (Boost thread)

sndio sio_onmove not calling back.

The time consume is not normal in multi-thread in Windows

Does OpenAL-Soft have an upper limit on the number of sources?

Categories

Resources