The code below measures the dispatch rate of empty kernels (i.e., kernel launch overhead) from multiple threads using std::async.
#include <stdio.h>
#include <stddef.h>
#include <chrono>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <algorithm>
#include <atomic>
#include <thread>
#include <future>
#include <functional>
#include <array>   // std::array
#include <string>  // std::string
#include <vector>  // std::vector
#include <cmath>   // sqrt
#define WARMUP_RUN_COUNT 10
#define TIMING_RUN_COUNT 100
#define TOTAL_RUN_COUNT (WARMUP_RUN_COUNT + TIMING_RUN_COUNT)
__global__ void EmptyKernel() {}
void print_timing(std::string test, std::array<float, TOTAL_RUN_COUNT> &results, int batch = 1)
{
float total_us = 0.0f, mean_us = 0.0f, stddev_us = 0.0f;
// remove top outliers due to nature of variability across large number of multi-threaded runs
std::sort(results.begin(), results.end(), std::greater<float>());
auto start_iter = std::next(results.begin(), WARMUP_RUN_COUNT);
auto end_iter = results.end();
// mean
std::for_each(start_iter, end_iter, [&](const float &run_ms) {
total_us += (run_ms * 1000) / batch;
});
mean_us = total_us / TIMING_RUN_COUNT;
// stddev
total_us = 0;
std::for_each(start_iter, end_iter, [&](const float &run_ms) {
float dev_us = ((run_ms * 1000) / batch) - mean_us;
total_us += dev_us * dev_us;
});
stddev_us = sqrt(total_us / TIMING_RUN_COUNT);
printf("\n %s: %.1f us, std: %.1f us\n", test.c_str(), mean_us, stddev_us);
}
void kernel_enqueue_rate(std::atomic_int* shared, int max_threads)
{
//resources necessary for this thread
cudaStream_t stream;
cudaStreamCreate(&stream);
std::array<float, TOTAL_RUN_COUNT> results;
//synchronize all threads, before running
int tid = shared->fetch_add(1, std::memory_order_release);
while (max_threads != shared->load(std::memory_order_acquire)) {}
for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
auto start = std::chrono::high_resolution_clock::now();
EmptyKernel<<<1, 1, 0, stream>>>();
auto stop = std::chrono::high_resolution_clock::now();
results[i] = std::chrono::duration<double, std::milli>(stop - start).count();
}
print_timing("Thread ID : " + std::to_string(tid) + " , " + "Kernel enqueue rate", results);
}
// Simple thread pool
struct thread_pool {
thread_pool(int total_threads) : max_threads(total_threads) {}
void start(std::function<void(std::atomic_int*, int)> f) {
for (int i = 0; i < max_threads; ++i) {
threads.push_back(std::async(std::launch::async, f, &shared, max_threads));
}
}
void finish() {
for (auto&&thread : threads) {
thread.get();
}
threads.clear();
shared = {0};
}
~thread_pool() {
finish();
}
private:
std::atomic_int shared {0};
std::vector<std::future<void>> threads;
int max_threads = 1;
};
int main(int argc, char* argv[])
{
int max_threads = 4;
thread_pool task(max_threads);
task.start(kernel_enqueue_rate);
task.finish();
}
The observation is that some threads take much more time than the others; for example, in the run below, two threads take approximately 6 us each but the other two take close to or more than 10 us.
Thread ID : 0 , Kernel enqueue rate: 9.5 us, std: 9.3 us
Thread ID : 2 , Kernel enqueue rate: 5.7 us, std: 2.9 us
Thread ID : 1 , Kernel enqueue rate: 11.7 us, std: 7.3 us
Thread ID : 3 , Kernel enqueue rate: 6.0 us, std: 2.1 us
What is the reason for this significant difference, and is there a way to avoid it and get similar results on all the threads?
What is the reason for this significant difference?
The threads are migrating across cores and contending for the cores alongside other processes. The interference effects are not uniform across threads.
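One way to observe the migration (a minimal sketch, assuming Linux's sched_getcpu, which needs _GNU_SOURCE) is to log the core each timing iteration runs on, for example by calling a helper like this from inside kernel_enqueue_rate's loop:
#include <sched.h>
#include <cstdio>
// Log which core the calling thread is currently on (Linux-specific).
void log_core(int tid, int iteration)
{
    printf("thread %d iteration %d on core %d\n", tid, iteration, sched_getcpu());
}
If the reported core changes between iterations, the thread is being migrated.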
is there a way to avoid this and get similar results on all the threads
By pinning the threads to cores. This can be done externally with taskset (for example, taskset -c 0-3 ./a.out) or programmatically from within each thread, as sketched below.
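A minimal sketch of programmatic pinning on Linux (assuming glibc's non-portable pthread_setaffinity_np; compile with -D_GNU_SOURCE if needed). Calling pin_to_core(tid) at the top of kernel_enqueue_rate would keep each worker on its own core:
#include <pthread.h>
#include <sched.h>
// Pin the calling thread to a single core (Linux-specific).
void pin_to_core(int core)
{
    cpu_set_t set;
    CPU_ZERO(&set);
    CPU_SET(core, &set);
    // pthread_self() names the current thread; returns 0 on success
    pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &set);
}
With each thread fixed to its own core, the migration interference disappears and the per-thread enqueue times should converge.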
Related
I have some very simple code which is supposed to test a multi-threaded logger by starting 10 threads at the same time, all of which write to the logger at once.
I expect to see all 10 messages in no particular order; however, I randomly get 5, 6, 7, 8, 9, and sometimes 10 output messages.
Here is the code:
//*.cxx
#include <iostream>
#include <mutex>
#include <shared_mutex> // requires c++14
#include <string>
#include <thread>
#include <vector>
namespace {
std::mutex g_msgLock;
std::shared_timed_mutex g_testingLock;
}
void info(const char * msg) {
std::unique_lock<std::mutex> lock(g_msgLock);
std::cout << msg << '\n'; // don't flush
}
int main(int argc, char** argv) {
info("Start message..");
std::vector<std::thread> threads;
unsigned int threadCount = 10;
threads.reserve(threadCount);
{ // Scope for locking all threads
std::lock_guard<std::shared_timed_mutex> lockAllThreads(g_testingLock); // RAII (scoped) lock
for (unsigned int i = 0; i < threadCount; i++) {
// Here we start the threads using lambdas
threads.push_back(std::thread([&, i](){
// Here we block and wait on lockAllThreads
std::shared_lock<std::shared_timed_mutex> threadLock(g_testingLock);
std::string msg = std::string("THREADED_TEST_INFO_MESSAGE: ") + std::to_string(i);
info(msg.c_str());
}));
}
} // End of scope, lock is released, all threads continue now
for(auto& thread : threads){
thread.join();
}
}
The output is generally something of the form:
Start message..
THREADED_TEST_INFO_MESSAGE: 9
THREADED_TEST_INFO_MESSAGE: 5
THREADED_TEST_INFO_MESSAGE: 3
THREADED_TEST_INFO_MESSAGE: 1
THREADED_TEST_INFO_MESSAGE: 4
THREADED_TEST_INFO_MESSAGE: 0
THREADED_TEST_INFO_MESSAGE: 8
THREADED_TEST_INFO_MESSAGE: 7
Notice that there are only 8 outputs for this run.
Interestingly enough, this problem turned out to be caused by my build system, which was dropping messages. The executable itself always produces the output as expected.
I'm trying to run the following AVR program on SimAVR:
#include <avr/io.h>
#include <util/delay.h>
int main ()
{
DDRB |= _BV(DDB5);
for (;;)
{
PORTB ^= _BV(PB5);
_delay_ms(2000);
}
}
I've compiled it with F_CPU=16000000. The SimAVR runner is as follows:
#include <stdlib.h>
#include <stdio.h>
#include <pthread.h>
#include "sim_avr.h"
#include "avr_ioport.h"
#include "sim_elf.h"
avr_t * avr = NULL;
static void* avr_run_thread(void * ignore)
{
for (;;) {
avr_run(avr);
}
return NULL;
}
void led_changed_hook(struct avr_irq_t* irq, uint32_t value, void* param)
{
printf("led_changed_hook %d %d\n", irq->irq, value);
}
int main(int argc, char *argv[])
{
elf_firmware_t f;
elf_read_firmware("image.elf", &f);
f.frequency = 16e6;
const char *mmcu = "atmega328p";
avr = avr_make_mcu_by_name(mmcu);
if (!avr) {
fprintf(stderr, "%s: AVR '%s' not known\n", argv[0], mmcu);
exit(1);
}
avr_init(avr);
avr_load_firmware(avr, &f);
avr_irq_register_notify(
avr_io_getirq(avr, AVR_IOCTL_IOPORT_GETIRQ('B'), 5),
led_changed_hook,
NULL);
pthread_t run;
pthread_create(&run, NULL, avr_run_thread, NULL);
for (;;) {}
}
The problem is that I see from the output of led_changed_hook that it runs at ~4x speed. Moreover, changing f.frequency doesn't seem to have any effect on the simulation speed whatsoever.
How do I ensure that SimAVR runs the simulation at the correct real-time speed?
It turns out SimAVR doesn't support timing-accurate simulation of opcodes, so the simulated time of running the _delay_ms busy-wait to completion is completely unrelated both to how long it would take on the real MCU and to the clock frequency of the simulated MCU.
The correct solution is to use a timer interrupt, and then go to sleep on the MCU. The simulator will correctly simulate the timer counters and the sleep will suspend the simulation until the timer fires.
#include <avr/io.h>
#include <avr/interrupt.h>
#include <avr/power.h>
#include <avr/sleep.h>
int main ()
{
DDRB |= _BV(DDB5);
TCCR1A = 0;
TCCR1B = 0;
TCNT1 = 0;
TIMSK1 |= (1 << OCIE1A);
sei();
/* Set TIMER1 to 0.5 Hz */
TCCR1B |= (1 << WGM12);
OCR1A = 31248;
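/* 16 MHz / 1024 prescaler = 15625 timer ticks per second, so (31248 + 1) ticks is roughly 2 s between compare matches */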
TCCR1B |= ((1 << CS12) | (1 << CS10));
set_sleep_mode(SLEEP_MODE_IDLE);
sleep_enable();
for (;;)
{
sleep_mode();
}
}
ISR(TIMER1_COMPA_vect){
PORTB ^= _BV(PB5);
}
(TL;DR) On NVMe SSDs (Intel P3600 as well as Avant), I am seeing a decrease in IOPS when I issue random reads over a small subset of the disk instead of the entire disk.
While reading the same offset over and over, the IOPS are about 36-40K for a 4k block size. The IOPS gradually increase as I grow the region over which the random reads are issued. The program (shown below) uses asynchronous IO on Linux to submit the read requests.
Disk range (in 4k blocks), IOPS
0, 38833
1, 68596
10, 76100
30, 80381
40, 113647
50, 148205
100, 170374
200, 239798
400, 270197
800, 334767
OS : Linux 4.2.0-35-generic
SSD : Intel P3600 NVME Flash
What could be causing this problem?
The program can be run as follows
$ for i in 0 1 10 30 40 50 100 200 400 800
do
<program_name> /dev/nvme0n1 10 $i
done
and validate whether you also see the increasing pattern of IOPS shown above.
/**
* $ g++ <progname.cpp> -o progname -std=c++11 -lpthread -laio -O3
* $ progname /dev/nvme0n1 10 100
*/
#include <random>
#include <libaio.h>
#include <stdlib.h>//malloc, exit
#include <future> //async
#include <unistd.h> //usleep
#include <iostream>
#include <sys/time.h> // gettimeofday
#include <vector>
#include <fcntl.h> // open
#include <errno.h>
#include <sys/types.h> // open
#include <sys/stat.h> // open
#include <cassert>
#include <semaphore.h>
#include <strings.h> // bzero
#include <cstdint> // uint64_t
io_context_t ioctx;
std::vector<char*> buffers;
int fd = -1;
sem_t sem;
constexpr int numPerRound = 20;
constexpr int numRounds = 100000;
constexpr int MAXEVENT = 10;
constexpr size_t BLKSIZE = 4096;
constexpr int QDEPTH = 200;
off_t startBlock = 0;
off_t numBlocks = 100;
const int numSubmitted = numRounds * numPerRound;
void DoGet()
{
io_event eventsArray[MAXEVENT];
int numCompleted = 0;
while (numCompleted != numSubmitted)
{
bzero(eventsArray, MAXEVENT * sizeof(io_event));
int numEvents;
do {
numEvents = io_getevents(ioctx, 1, MAXEVENT, eventsArray, nullptr);
} while (numEvents == -EINTR);
for (int i = 0; i < numEvents; i++)
{
io_event* ev = &eventsArray[i];
char* iobuf = (char*)(ev->data); // ev->data returns what was stored in cb->data: the buffer pointer
assert(ev->res2 == 0);
assert(ev->res == BLKSIZE);
sem_post(&sem); // free ioctx
}
numCompleted += numEvents;
}
std::cout << "completed=" << numCompleted << std::endl;
}
int main(int argc, char* argv[])
{
if (argc < 4) {
std::cout << "usage: <program_name> <nvme_device_name> <start_4k_block> <num_4k_blocks>" << std::endl;
exit(1);
}
char* deviceName = argv[1];
startBlock = atoll(argv[2]);
numBlocks = atoll(argv[3]);
int ret = 0;
ret = io_queue_init(QDEPTH, &ioctx);
assert(ret == 0);
ret = sem_init(&sem, 0, QDEPTH);
assert(ret == 0);
auto DoGetFut = std::async(std::launch::async, DoGet);
// preallocate buffers
for (int i = 0; i < QDEPTH; i++)
{
char* buf ;
ret = posix_memalign((void**)&buf, 4096, BLKSIZE);
assert(ret == 0);
buffers.push_back(buf);
}
fd = open("/dev/nvme0n1", O_DIRECT | O_RDONLY);
assert(fd >= 0);
off_t offset = 0;
struct timeval start;
gettimeofday(&start, 0);
std::mt19937 generator (getpid());
// generate random offsets within [startBlock, startBlock + numBlocks]
std::uniform_int_distribution<off_t> offsetgen(startBlock, startBlock + numBlocks);
for (int j = 0; j < numRounds; j++)
{
iocb mycb[numPerRound];
iocb* posted[numPerRound];
bzero(mycb, sizeof(iocb) * numPerRound);
for (int i = 0; i < numPerRound; i++)
{
// same buffer may get used in 2 different async read
// thats ok - not validating content in this program
char* iobuf = buffers[i];
iocb* cb = &mycb[i];
offset = offsetgen(generator) * BLKSIZE;
io_prep_pread(cb, fd, iobuf, BLKSIZE, offset);
cb->data = iobuf;
posted[i] = cb;
sem_wait(&sem); // wait for ioctx to be free
}
int ret = 0;
do {
ret = io_submit(ioctx, numPerRound, posted);
} while (ret == -EINTR);
assert(ret == numPerRound);
}
DoGetFut.wait();
struct timeval end;
gettimeofday(&end, 0);
uint64_t diff = ((end.tv_sec - start.tv_sec) * 1000000) + (end.tv_usec - start.tv_usec);
io_queue_release(ioctx);
std::cout
<< "ops=" << numRounds * numPerRound
<< " iops=" << (numRounds * numPerRound *(uint64_t)1000000)/diff
<< " region-size=" << (numBlocks * BLKSIZE)
<< std::endl;
}
This is almost certainly due to the internal structure of the drive's memory. The drive is built from many flash chips and may have multiple memory buses internally. If you issue requests across a small range, they all resolve to a single chip (or a few chips) and have to be queued behind one another. If you access the whole device, the requests are spread across many internal chips and buses, can run in parallel, and therefore provide more throughput.
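As a back-of-the-envelope illustration (the geometry here is an assumption; vendors do not publish it for these drives): suppose the drive stripes 4k pages across 16 channels with 4 dies each, i.e. 64 independently addressable units. A 1-block region serializes every outstanding request on one die, while an 800-block region spreads requests across essentially all 64 units, so effective parallelism, and with it IOPS, grows with region size until every unit is kept busy.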
We are seeing abnormal time consumption in a multi-threaded Windows application. Our device has 5 nozzles, and the process is:
The nozzles pick chips up at the same time, so I use 5 threads to do it
Move the nozzles to another place
Put the chips
It runs smoothly most of the time, but sometimes there is a short stop before moving to the other place (visibly noticeable). Picking chips normally takes about 80 milliseconds, but sometimes it takes 130 milliseconds. I wrote a simple program to test it:
#include "stdafx.h"
#include <windows.h>
#include <process.h>
#include <iostream>
#include <mmsystem.h>
#pragma comment(lib, "winmm.lib")
using namespace std;
static TIMECAPS l_timecaps;
UINT WINAPI MainThread(LPVOID lParam /* = NULL */);
UINT WINAPI TestThread(LPVOID lParam /* = NULL */);
void MainProcess();
int _tmain(int argc, _TCHAR* argv[])
{
//set current process priority as real time
SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS);
//use more accurate time
timeGetDevCaps(&l_timecaps, sizeof(l_timecaps));
timeBeginPeriod(l_timecaps.wPeriodMin);
UINT uiThreadId = 0;
HANDLE hEvents = (HANDLE) _beginthreadex(NULL, 0, MainThread, NULL, 0, &uiThreadId);
SetThreadPriority(hEvents, THREAD_PRIORITY_TIME_CRITICAL);
WaitForSingleObject(hEvents, INFINITE);
cerr << endl << "Press Enter to exit." << endl;
while (cin.get() != '\n');
timeEndPeriod(l_timecaps.wPeriodMin);
return 0;
}
UINT WINAPI MainThread(LPVOID lParam /* = NULL */)
{
int i = 0;
while (i < 100)
{
MainProcess();
i++;
}
return 0;
}
void MainProcess()
{
const int THREAD_NUMBER = 5;
static HANDLE hEvents[THREAD_NUMBER];
for (int i = 0; i < THREAD_NUMBER; ++i)
hEvents[i] = NULL;
//log time with more accurate time
LARGE_INTEGER liPerfFreq={0};
LARGE_INTEGER liBeginRunTime = {0};
long lBeginRunTime = 0;
QueryPerformanceFrequency(&liPerfFreq);
QueryPerformanceCounter(&liBeginRunTime);
lBeginRunTime = liBeginRunTime.QuadPart * 1000 / liPerfFreq.QuadPart;
for (int i = 0; i < THREAD_NUMBER; ++i)
{
UINT uiThreadId = 0;
hEvents[i] = (HANDLE) _beginthreadex(NULL, 0, TestThread, NULL, 0, &uiThreadId);
SetThreadPriority(hEvents[i], THREAD_PRIORITY_TIME_CRITICAL);
//assign each thread to its own cpu (a bit mask, so shift rather than add)
SetThreadAffinityMask(hEvents[i], (DWORD_PTR)1 << i);
}
//wait all threads finished
WaitForMultipleObjects(THREAD_NUMBER, hEvents, TRUE, INFINITE);
LARGE_INTEGER liEndRunTime = {0};
long lEndRunTime = 0;
QueryPerformanceCounter(&liEndRunTime);
lEndRunTime = liEndRunTime.QuadPart * 1000 / liPerfFreq.QuadPart;
cout << "time: " << lEndRunTime - lBeginRunTime << endl;
}
UINT WINAPI TestThread(LPVOID lParam /* = NULL */)
{
//do nothing
return 0;
}
The output time is usually 2, 3, or 4 milliseconds, but sometimes it becomes 57 or 62 milliseconds. This is bad for our device; it makes the device slow when running.
Your test threads do nothing, so all the time is spent creating and shutting down the threads. Overheads in the kernel object manager and scheduler will dominate. Some of the threads may also have to wait on other threads that hold kernel locks (via API calls) and thus see delays.
And of course those inner threads could be completing before the call to set their priority completes: to control this you really need to create the thread suspended and then resume it, as sketched below.
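A minimal sketch of that suspended-start pattern (reusing the question's TestThread; the per-thread index i is assumed to come from the surrounding loop):
UINT uiThreadId = 0;
// create the thread suspended so it cannot run yet
HANDLE hThread = (HANDLE)_beginthreadex(NULL, 0, TestThread, NULL, CREATE_SUSPENDED, &uiThreadId);
// apply priority and affinity while the thread is guaranteed not to be running
SetThreadPriority(hThread, THREAD_PRIORITY_TIME_CRITICAL);
SetThreadAffinityMask(hThread, (DWORD_PTR)1 << i);
// only now let it run, with the attributes already in place
ResumeThread(hThread);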
Because you are measuring nothing but overheads, the results depend entirely on whatever else is going on in the system.
Also remember that although Windows has names like THREAD_PRIORITY_TIME_CRITICAL, it is not a real-time OS.
I am trying to keep the number of working threads constant under Linux in C using pthreads, but I seem unable to understand what's wrong with the following code:
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <unistd.h> // sleep
#define MAX_JOBS 50
#define MAX_THREADS 5
pthread_mutex_t mutex1 = PTHREAD_MUTEX_INITIALIZER;
int jobs = MAX_JOBS;
int worker = 0;
int counter = 0;
void *functionC(void *arg) {
pthread_mutex_lock(&mutex1);
worker++;
counter++;
printf("Counter value: %d\n",counter);
pthread_mutex_unlock(&mutex1);
// Do something...
sleep(4);
pthread_mutex_lock(&mutex1);
jobs--;
worker--;
printf(" >>> Job done: %d\n",jobs);
pthread_mutex_unlock(&mutex1);
return NULL;
}
int main(int argc, char *argv[]) {
int i=0, j=0;
pthread_t thread[MAX_JOBS];
// Create threads if the number of working threads doesn't exceed MAX_THREADS
while (1) {
if (worker > MAX_THREADS) {
printf(" +++ In queue: %d\n", worker);
sleep(1);
} else {
//printf(" +++ Creating new thread: %d\n", worker);
pthread_create(&thread[i], NULL, &functionC, NULL);
//printf("%d",worker);
i++;
}
if (i == MAX_JOBS) break;
}
// Wait all threads to finish
for (j=0;j<MAX_JOBS;j++) {
pthread_join(thread[j], NULL);
}
return(0);
}
A while (1) loop keeps creating threads as long as the number of working threads is under a certain threshold. A mutex is supposed to lock the critical sections every time the global counter of working threads is incremented (thread creation) or decremented (job done). I thought it would work fine, and for the most part it does, but weird things happen...
For instance, if I comment out (as in this snippet) the printf //printf(" +++ Creating new thread: %d\n", worker);, the while (1) loop creates a seemingly random number of threads (18-25 in my experience) at a time (functionC prints "Counter value:" from 1 up to 18-25) instead of respecting the IF condition inside the loop. If I include the printf, the loop behaves "almost" correctly. This seems to hint at a missing "mutex" condition that I should add to the loop in main() to effectively block thread creation when MAX_THREADS is reached, but after changing this code a LOT of times over the past few days, I'm a bit lost now. What am I missing?
Please let me know what I should change in order to keep the number of threads constant; it doesn't seem that I'm too far from the solution... Hopefully... :-)
Thanks in advance!
Your problem is that worker is not incremented until the new thread actually starts and gets to run; in the meantime, the main thread loops around, checks worker, finds that it hasn't changed, and starts another thread. It can repeat this many times, creating far too many threads.
So, you need to increment worker in the main thread, when you've decided to create a new thread.
You have another problem - you should be using condition variables to let the main thread sleep until it should start another thread, not using a busy-wait loop with a sleep(1); in it. The complete fixed code would look like:
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <unistd.h>
#define MAX_JOBS 50
#define MAX_THREADS 5
pthread_mutex_t mutex1 = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t cond1 = PTHREAD_COND_INITIALIZER;
int jobs = MAX_JOBS;
int workers = 0;
int counter = 0;
void *functionC(void *arg) {
pthread_mutex_lock(&mutex1);
counter++;
printf("Counter value: %d\n",counter);
pthread_mutex_unlock(&mutex1);
// Do something...
sleep(4);
pthread_mutex_lock(&mutex1);
jobs--;
printf(" >>> Job done: %d\n",jobs);
/* Worker is about to exit, so decrement count and wakeup main thread */
workers--;
pthread_cond_signal(&cond1);
pthread_mutex_unlock(&mutex1);
return NULL;
}
int main(int argc, char *argv[]) {
int i=0, j=0;
pthread_t thread[MAX_JOBS];
// Create threads if the number of working threads doesn't exceed MAX_THREADS
while (i < MAX_JOBS) {
/* Block on condition variable until there are insufficient workers running */
pthread_mutex_lock(&mutex1);
while (workers >= MAX_THREADS)
pthread_cond_wait(&cond1, &mutex1);
/* Another worker will be running shortly */
workers++;
pthread_mutex_unlock(&mutex1);
pthread_create(&thread[i], NULL, &functionC, NULL);
i++;
}
// Wait all threads to finish
for (j=0;j<MAX_JOBS;j++) {
pthread_join(thread[j], NULL);
}
return(0);
}
Note that even though this works, it isn't ideal - it's best to create the number of threads you want up-front, and have them loop around, waiting for work. This is because creating and destroying threads has significant overhead, and because it often simplifies resource management. A version of your code rewritten to work like this would look like:
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <unistd.h>
#define MAX_JOBS 50
#define MAX_THREADS 5
pthread_mutex_t mutex1 = PTHREAD_MUTEX_INITIALIZER;
int jobs = MAX_JOBS;
int counter = 0;
void *functionC(void *arg)
{
int running_job;
pthread_mutex_lock(&mutex1);
counter++;
printf("Counter value: %d\n",counter);
while (jobs > 0) {
running_job = jobs--;
pthread_mutex_unlock(&mutex1);
printf(" >>> Job starting: %d\n", running_job);
// Do something...
sleep(4);
printf(" >>> Job done: %d\n", running_job);
pthread_mutex_lock(&mutex1);
}
pthread_mutex_unlock(&mutex1);
return NULL;
}
int main(int argc, char *argv[]) {
int i;
pthread_t thread[MAX_THREADS];
for (i = 0; i < MAX_THREADS; i++)
pthread_create(&thread[i], NULL, &functionC, NULL);
// Wait all threads to finish
for (i = 0; i < MAX_THREADS; i++)
pthread_join(thread[i], NULL);
return 0;
}