Reduction in running time of multiple modules due to linking - linux

This question may seem very vague, hence I have included the code snippets for the modules mentioned. I have written a program that collects data from various sensors on an I2C bus and stores the formatted values in a file. This shall run on an ARM cortex A9 processor (single core) in an SoC configuration called Zedboard by Xilinx, and uses the petalinux operating system with the vanilla linux kernel. The time is being measured using clock_gettime(). I have noticed significant reduction in a single sensor access time when all of the sensors are being accessed sequentially within a single process. The comparison of this time was done with that of individual processes that access a single sensor only and do not write the data to a file, but print it to stdout instead.
Sensors used along with modules:
GY521 Module:
#include <linux/i2c-dev-user.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <fcntl.h>
#include <stdint.h>
#include <inttypes.h>
#include "GY521.h"
#include <time.h>
/* Slave address handed to ioctl(I2C_SLAVE) in init_gy521(). */
#define ADDR 0x68
/* Descriptor of the I2C adapter node; opened by init_gy521(), closed by clear_gy521(). */
static int file;
/* Result of the most recent SMBus call in this module; a negative value is treated as failure. */
static __s32 res;
/* Register address last written by set_sleep_gy521(). */
static __u8 reg;
static __u8 values[14]; //array to hold all the register values; filled by get_values() with 14 bytes starting at register 0x3B
/*
 * Wake the GY521 up or put it to sleep by writing register 0x6B (107).
 *
 * flag == 0 writes 0x01 (awake); any other value writes 0x41 (asleep).
 * Per the module's own notes, writing 0x00 / 0x40 instead would select the
 * internal 8MHz oscillator as the clock.
 *
 * Updates the module-level `reg` and `res`. A failed write is only reported
 * through perror(); the function does not return a status.
 */
void set_sleep_gy521(int flag)
{
    const int wake = (flag == 0);
    const uint8_t power_bits = wake ? 0x01 : 0x41;

    reg = 0x6B;
    res = i2c_smbus_write_byte_data(file, reg, power_bits);
    if (res < 0)
        perror(wake ? "Failed to wake it up" : "Failed to go to sleep");
}
/*
 * Open /dev/i2c-0, bind it to the GY521 slave address and configure the part.
 *
 * Terminates the process if the adapter cannot be opened or the slave address
 * cannot be selected. The two range registers (0x1B gyro, 0x1C accelerometer)
 * are both written with 0x00; a failure there is only reported via perror().
 * Finally the device is woken up with set_sleep_gy521(0).
 */
void init_gy521()
{
    static const struct {
        int reg_addr;
        const char *failure;
    } range_setup[] = {
        { 0x1B, "Failed to set gyro range" },
        { 0x1C, "Failed to set the accelerometer range" },
    };
    const int adapter_no = 0;
    char filename[20];

    snprintf(filename, 19, "/dev/i2c-%d", adapter_no);
    file = open(filename, O_RDWR);
    if (file < 0) {
        perror("File not opened");
        exit(1);
    }

    if (ioctl(file, I2C_SLAVE, ADDR) < 0) {
        perror("Not able to access the device");
        exit(EXIT_FAILURE);
    }

    /* setting the sensitivity of the gyroscope and accelerometer */
    for (size_t k = 0; k < sizeof range_setup / sizeof range_setup[0]; k++) {
        res = i2c_smbus_write_byte_data(file, range_setup[k].reg_addr, 0x00);
        if (res < 0)
            perror(range_setup[k].failure);
    }

    /* waking the device also selects its clock source (see set_sleep_gy521) */
    set_sleep_gy521(0);
}
/*
 * Refresh `values` with one 14-byte block read starting at register 0x3B.
 *
 * Returns whatever the SMBus block read returned (also kept in `res`);
 * a negative result is reported via perror() before being returned.
 */
int get_values()
{
    res = i2c_smbus_read_i2c_block_data(file, 0x3B, 14, (__u8*)values);
    if (res >= 0)
        return res;

    perror("Failed to read using Block");
    return res;
}
float get_Ax()
{
int c = get_values(); //calls get_values() to get all values at a time instant
int16_t xout;
if(c>0)
xout = (((int16_t)values[0])<<8) | values[1];
else
{
perror("Can't get the values");
exit(EXIT_FAILURE);
}
return xout/16384.0*9.8;
}
/*
 * Decode the Y acceleration from bytes 2..3 of the current snapshot.
 * Does not touch the bus; the snapshot must have been refreshed beforehand.
 */
float get_Ay()
{
    const int16_t high_byte = values[2];
    const int16_t low_byte = values[3];
    /* concatenate the higher byte and the lower byte */
    const int16_t raw_y = (high_byte << 8) | low_byte;

    return raw_y / 16384.0 * 9.8;
}
/*
 * Decode the Z acceleration from bytes 4..5 of the current snapshot.
 * Does not touch the bus; the snapshot must have been refreshed beforehand.
 */
float get_Az()
{
    const int16_t high_byte = values[4];
    const int16_t low_byte = values[5];
    const int16_t raw_z = (high_byte << 8) | low_byte;

    return raw_z / 16384.0 * 9.8;
}
/*
 * Decode the temperature from bytes 6..7 of the current snapshot:
 * raw / 340.0 + 36.53. Does not touch the bus.
 */
float get_temp_gy521()
{
    const int16_t high_byte = values[6];
    const int16_t low_byte = values[7];
    const int16_t raw_temp = (high_byte << 8) | low_byte;

    return (raw_temp / 340.0 + 36.53);
}
/*
 * Decode the X angular rate from bytes 8..9 of the current snapshot
 * (raw / 131.0). Does not touch the bus.
 */
float get_Wx()
{
    const int16_t high_byte = values[8];
    const int16_t low_byte = values[9];
    const int16_t raw_wx = (high_byte << 8) | low_byte;

    return raw_wx / 131.0;
}
/*
 * Decode the Y angular rate from bytes 10..11 of the current snapshot
 * (raw / 131.0). Does not touch the bus.
 */
float get_Wy()
{
    const int16_t high_byte = values[10];
    const int16_t low_byte = values[11];
    const int16_t raw_wy = (high_byte << 8) | low_byte;

    return raw_wy / 131.0;
}
/*
 * Decode the Z angular rate from bytes 12..13 of the current snapshot
 * (raw / 131.0). Does not touch the bus.
 */
float get_Wz()
{
    const int16_t high_byte = values[12];
    const int16_t low_byte = values[13];
    const int16_t raw_wz = (high_byte << 8) | low_byte;

    return raw_wz / 131.0;
}
/* Release the I2C adapter descriptor opened by init_gy521(); the close result is ignored. */
void clear_gy521()
{
    (void)close(file);
}
/*
 * Stand-alone timing harness for the GY521: initialise the sensor, print the
 * gyro and accelerometer readings and report the CPU time consumed.
 */
int main()
{
    struct timespec start, end;

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
    init_gy521();

    /*
     * get_Ax() is the only accessor that performs the bus read, so it has to
     * run before any other get_W*()/get_A*() call; otherwise the gyro values
     * would be decoded from a buffer that was never filled.
     */
    const float ax = get_Ax();

    printf("Wx: %f\n", get_Wx());
    printf("Wy: %f\n", get_Wy());
    printf("Wz: %f\n", get_Wz());
    printf("Ax: %f\n", ax);
    printf("Ay: %f\n", get_Ay());
    printf("Az: %f\n", get_Az());
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);

    /* the elapsed value is a long, so it must be printed with %ld */
    const long elapsed_us = (long)(end.tv_sec - start.tv_sec) * 1000000L
                          + (end.tv_nsec - start.tv_nsec) / 1000;
    printf("Time taken by GY521 is %ld MuS\n", elapsed_us);

    /* closed outside the timed region so the measurement is unchanged */
    clear_gy521();
    return 0;
}
LM75 Module:
#include <linux/i2c-dev-user.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <time.h>
/* Slave address handed to ioctl(I2C_SLAVE) in init_LM75(). */
#define ADDRESS 0x48
static int file; //use static keyword to ensure that the scope of this variable is limited to this file.
/* Two bytes fetched by get_temp() from register 0x00; only buffer[0] is decoded. */
static __u8 buffer[2];
/*
 * Read the LM75 temperature register (0x00) and return whole degrees.
 *
 * The first byte of the register is the integer part of the temperature in
 * two's complement, so it is reinterpreted as a signed byte. Masking with 127
 * (as the previous version did) discarded the sign bit and turned sub-zero
 * readings into large positive values.
 *
 * A failed bus read is reported via perror(); the value decoded from the
 * previous buffer contents is then returned, as before.
 */
int get_temp()
{
    if (i2c_smbus_read_i2c_block_data(file, 0x00, 2, buffer) < 0)
        perror("Failed to read the block");

    return (signed char)buffer[0];
}
/*
 * Open /dev/i2c-0 and bind the descriptor to the LM75 slave address.
 *
 * [IMPORTANT] Must run once per process before any other LM75 function.
 * Terminates the process if either the open or the address selection fails.
 */
void init_LM75()
{
    const int adapter_number = 0; /* check this. */
    char filename[20];

    snprintf(filename, 19, "/dev/i2c-%d", adapter_number);

    file = open(filename, O_RDWR);
    if (file < 0) {
        perror("File not opened");
        exit(1);
    }

    const int bound = ioctl(file, I2C_SLAVE, ADDRESS);
    if (bound < 0) {
        perror("ioctl could not open file");
        exit(1);
    }
}
/*
 * Stand-alone timing harness for the LM75: initialise, print one temperature
 * reading and report the CPU time consumed in microseconds.
 */
int main()
{
    struct timespec start, end;

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
    init_LM75();
    printf("Temperature is %d\n", get_temp());
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);

    /* the elapsed value is a long, so it must be printed with %ld */
    const long elapsed_us = (long)(end.tv_sec - start.tv_sec) * 1000000L
                          + (end.tv_nsec - start.tv_nsec) / 1000;
    printf("Time taken %ld\n", elapsed_us);
    return 0;
}
HMC5883L Module:
#include <linux/i2c-dev-user.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include "HMC5883L.h"
#include <time.h>
/* Slave address handed to ioctl(I2C_SLAVE) in init_magnetometer(). */
#define ADDRESS 0x1e
static int file; //use static keyword to ensure that the scope of this variable is limited to this file.
/* Multiplier applied to raw samples by get_Bx()/get_By()/get_Bz(); set in init_magnetometer() and set_magnetometer_gain(). */
static float factor;
/* Six bytes fetched by get_B(), starting at register data_X_H. */
static __u8 buffer[6];
//register addresses
//NOTE(review): these have external linkage; in this listing only config_reg_A, mode_reg, gain_reg and data_X_H are referenced.
__u8 config_reg_A = 0x00;
__u8 mode_reg = 0x02;
__u8 gain_reg = 0x01;
__u8 data_X_H = 0x03;
__u8 data_X_L = 0x04;
__u8 data_Y_H = 0x07;
__u8 data_Y_L = 0x08;
__u8 data_Z_H = 0x05;
__u8 data_Z_L = 0x06;
/**
* The value of mode must be according to the following table:
* Value Mode
* 0 Continuous
* 1 Single (Default)
* 2 Idle
* 3 Idle
*
* After any mode change care must be taken to set it back to continuous mode before reading any values.
**/
void set_magnetometer_mode(int mode)
{
__u8 value = 0x00;
value |= mode;
if(i2c_smbus_write_byte_data(file, mode_reg, value)<0)
perror("Failed to change magnetometer mode");
}
/*
 * Fetch the six data bytes starting at data_X_H into `buffer`.
 * A failed read is only reported via perror(); `buffer` keeps its old contents.
 */
void get_B()
{
    const int rc = i2c_smbus_read_i2c_block_data(file, data_X_H, 6, buffer);

    if (rc < 0)
        perror("Failed to read the block");
}
/*
 * Refresh `buffer` from the device and return the X field component,
 * i.e. the signed 16-bit sample in bytes 0..1 multiplied by `factor`
 * (milli gauss according to the module notes).
 *
 * [IMPORTANT] get_By() and get_Bz() only decode the buffer that this call
 * fills, so call get_Bx() first.
 */
float get_Bx()
{
    get_B();

    /* concatenate the upper and lower bits */
    const int16_t raw_x = (int16_t)((buffer[0] << 8) | buffer[1]);
    return (float)raw_x * factor;
}
/*
 * Return the Y field component decoded from bytes 4..5 of `buffer`,
 * multiplied by `factor`. Does not touch the bus.
 */
float get_By()
{
    /* concatenate the upper and lower bits */
    const int16_t raw_y = (int16_t)((buffer[4] << 8) | buffer[5]);
    return (float)raw_y * factor;
}
/*
 * Return the Z field component decoded from bytes 2..3 of `buffer`,
 * multiplied by `factor`. Does not touch the bus.
 */
float get_Bz()
{
    /* concatenate the upper and lower bits */
    const int16_t raw_z = (int16_t)((buffer[2] << 8) | buffer[3]);
    return (float)raw_z * factor;
}
/*
 * Open /dev/i2c-0, bind it to the magnetometer slave address, reset the
 * scaling factor to 0.92 and switch the device to continuous mode.
 *
 * [IMPORTANT] Must run once per process before any other magnetometer
 * function. Terminates the process if the open or the address selection fails.
 */
void init_magnetometer()
{
    const int adapter_number = 0; /* check this. */
    char filename[20];

    snprintf(filename, 19, "/dev/i2c-%d", adapter_number);

    file = open(filename, O_RDWR);
    if (file < 0) {
        perror("File not opened");
        exit(1);
    }

    const int bound = ioctl(file, I2C_SLAVE, ADDRESS);
    if (bound < 0) {
        perror("ioctl could not open file");
        exit(1);
    }

    factor = 0.92;
    set_magnetometer_mode(0);
}
/* Release the I2C adapter descriptor opened by init_magnetometer(); the close result is ignored. */
void clear_magnetometer()
{
    (void)close(file);
}
/**
* The value of freq must be according to the following table:
* Value Rate (Hz)
* 0 0.75
* 1 1.5
* 2 3
* 3 7.5
* 4 15 (Default)
* 5 30
* 6 75
**/
void set_magnetometer_frequency(int freq)
{
__u8 value = 0x00;
value |= freq<<2;
if(i2c_smbus_write_byte_data(file, config_reg_A, value)<0)
perror("Failed to change data rate");
}
/**
* The value of gain must be according to the following table:
* Value Field Range (+/- Gauss)
* 0 0.88
* 1 1.3 (Default)
* 2 1.9
* 3 2.5
* 4 4.0
* 5 4.7
* 6 5.6
* 7 8.1
*
* This function will also set the value of the factor to be multiplied to the raw data.
**/
void set_magnetometer_gain(int gain)
{
__u8 value = 0x00;
value |= gain<<5;
if(i2c_smbus_write_byte_data(file, gain_reg, value)<0)
perror("Failed to change magnetometer gain");
else
{
switch(gain)
{
case 0: factor = 0.73; break;
case 1: factor = 0.92; break;
case 2: factor = 1.22; break;
case 3: factor = 1.52; break;
case 4: factor = 2.27; break;
case 5: factor = 2.56; break;
case 6: factor = 3.03; break;
case 7: factor = 4.35; break;
}
}
}
/*
 * Stand-alone timing harness for the HMC5883L: initialise, print one field
 * reading and report the CPU time consumed in microseconds.
 */
int main()
{
    struct timespec start, end;

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
    init_magnetometer();

    /*
     * Only get_Bx() reads from the device; get_By()/get_Bz() decode the buffer
     * it filled. The evaluation order of function arguments is unspecified,
     * so the three calls must not be placed directly in the printf() argument
     * list.
     */
    const float bx = get_Bx();
    const float by = get_By();
    const float bz = get_Bz();
    printf("%f\t%f\t%f\n", bx, by, bz);

    clear_magnetometer();
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);

    /* the elapsed value is a long, so it must be printed with %ld */
    const long elapsed_us = (long)(end.tv_sec - start.tv_sec) * 1000000L
                          + (end.tv_nsec - start.tv_nsec) / 1000;
    printf("Time taken by HMC is %ld MuS\n", elapsed_us);
    return 0;
}
Single module that clubs all the three together and also writes data in a file:
#include <stdio.h>
#include <stdlib.h>
#include "hwfunctions.h"
#include <time.h>
/*
 * Housekeeping run: append one timestamped block of sensor readings to
 * fullhk.txt and print the CPU time spent per sensor and in total.
 *
 * Changes compared with the earlier version:
 *  - values are written with fprintf() instead of sprintf() into a 50-byte
 *    heap buffer that was neither size-checked nor NULL-checked;
 *  - get_Ax() (the only GY521 accessor that reads the bus) is called before
 *    get_Wy()/get_Wz(), so the gyro values come from a filled buffer;
 *  - failing to open the log file exits with a failure status.
 * The order and text of the lines written to the file are unchanged.
 */
int main()
{
    struct timespec start_hk, end_hk, start_hmc, end_hmc, start_gy, end_gy, start_lm, end_lm;

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start_hk);

    FILE *f = fopen("fullhk.txt", "a");
    if (f == NULL)
    {
        perror("Couldn't open file");
        exit(EXIT_FAILURE);
    }

    time_t curt;
    time(&curt);
    fprintf(f, "Time: %s\n", ctime(&curt));

    /* magnetometer */
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start_hmc);
    init_magnetometer();
    fprintf(f, "Bx: %f\n", get_Bx());
    fprintf(f, "By: %f\n", get_By());
    fprintf(f, "Bz: %f\n", get_Bz());
    clear_magnetometer();
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end_hmc);

    fprintf(f, "S1: %f\n", get_S1());
    fprintf(f, "S2: %f\n", get_S2());
    fprintf(f, "S3: %f\n", get_S3());
    fprintf(f, "S4: %f\n", get_S4());
    fprintf(f, "S5: %f\n", get_S5());

    /* temperature sensor; NOTE(review): no close counterpart for init_LM75() is visible here */
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start_lm);
    init_LM75();
    fprintf(f, "Temperature: %d\n", get_temp());
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end_lm);

    /* gyro / accelerometer */
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start_gy);
    init_gy521();
    /* performs the bus read that every other GY521 accessor depends on */
    const float ax = get_Ax();
    fprintf(f, "Wy: %f\n", get_Wy());
    fprintf(f, "Wz: %f\n", get_Wz());
    fprintf(f, "Ax: %f\n", ax);
    fprintf(f, "Ay: %f\n", get_Ay());
    const float az = get_Az();
    clear_gy521();
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end_gy);

    /* as before, the last line is written after the GY521 timing window closes */
    fprintf(f, "Az: %f *end of block*\n\n", az);
    if (fclose(f) != 0)
        perror("fclose");

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end_hk);
    printf("Time taken by single hmc instance: %ld microseconds\n", (long)(end_hmc.tv_sec-start_hmc.tv_sec)*1000000L + (end_hmc.tv_nsec-start_hmc.tv_nsec)/1000);
    printf("Time taken by single gy instance: %ld microseconds\n", (long)(end_gy.tv_sec-start_gy.tv_sec)*1000000L + (end_gy.tv_nsec-start_gy.tv_nsec)/1000);
    printf("Time taken by single lm instance: %ld microseconds\n", (long)(end_lm.tv_sec-start_lm.tv_sec)*1000000L + (end_lm.tv_nsec-start_lm.tv_nsec)/1000);
    printf("Time taken by single housekeeping instance: %ld microseconds\n", (long)(end_hk.tv_sec-start_hk.tv_sec)*1000000L + (end_hk.tv_nsec-start_hk.tv_nsec)/1000);
    return 0;
}
Housekeeping is the name of the single module and the outputs above the housekeeping output are for the individual sensor modules. The housekeeping module has been compiled and linked with the sensor modules without the main function, and the O2 optimization flag has been used during cross compilation. This difference in the times is same even if the time is measured by CLOCK_BOOTTIME to include kernel pre-emption.
Please comment if any more information is needed to debunk this mystery!

I would suspect something happening in the background, when you use library functions for the first time.
Try disabling lazy binding, for example by setting the environment variable LD_BIND_NOW=1 (see also: Is there a linker flag to force it to load all shared libraries at start time?)

Related

Which Linux kernel version support uffdio_writeprotect structure and how to compile and install that linux kernel?

I am trying to compile a program that uses userfaultfd to track dirty pages in memory. In this program I use the uffdio_writeprotect structure to monitor a memory region that the program writes to. I am using Ubuntu 18.04 with Linux kernel version 5.14.0. But when I compile the program with the command gcc -o with_userfault -I. with_userfault.c
i still have the same errors such as: with_userfault.c:108:34: error: storage size of ‘wp’ isn’t known
struct uffdio_writeprotect wp;
with_userfault.c:114:21: error: ‘UFFDIO_WRITEPROTECT’ undeclared (first use in this function); did you mean ‘UFFDIO_REGISTER’?
if (ioctl(fd, UFFDIO_WRITEPROTECT, &wp) == -1)
I don't know how to solve this problem and would appreciate help, because I don't know what I am supposed to do now. Thanks!
this is a source code:
The header file:
/*
 * rdtsc(): read a free-running tick counter as an unsigned 64-bit value.
 * One implementation is selected per architecture; compilation fails with an
 * #error on any architecture that is not listed.
 */
#ifndef __RDTSC_H_DEFINED__
#define __RDTSC_H_DEFINED__
#if defined(__i386__)
/* 32-bit x86: emits the two opcode bytes 0x0f 0x31 directly and receives the
 * 64-bit result through the "=A" output constraint. */
static __inline__ unsigned long long rdtsc(void)
{
unsigned long long int x;
__asm__ volatile (".byte 0x0f, 0x31" : "=A" (x));
return x;
}
#elif defined(__x86_64__)
/* x86-64: the rdtsc instruction delivers the low half in eax ("=a") and the
 * high half in edx ("=d"); the two halves are combined into one 64-bit value. */
static __inline__ unsigned long long rdtsc(void)
{
unsigned hi, lo;
__asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
}
#elif defined(__powerpc__)
/* PowerPC: reads the upper half, the lower half, then the upper half again
 * and retries while the two upper reads differ, so that a carry between the
 * halves cannot produce a torn value. */
static __inline__ unsigned long long rdtsc(void)
{
unsigned long long int result=0;
unsigned long int upper, lower,tmp;
__asm__ volatile(
"0: \n"
"\tmftbu %0 \n"
"\tmftb %1 \n"
"\tmftbu %2 \n"
"\tcmpw %2,%0 \n"
"\tbne 0b \n"
: "=r"(upper),"=r"(lower),"=r"(tmp)
);
/* assemble the 64-bit value: upper half in the high 32 bits */
result = upper;
result = result<<32;
result = result|lower;
return(result);
}
#else
#error "No tick counter is available!"
#endif
/* $RCSfile: $ $Author: kazutomo $
* $Revision: 1.6 $ $Date: 2005/04/13 18:49:58 $
*/
#endif
The source file
/*
* Example program about using userfaultfd(2) for garbage collection.
*
* This establishes a couple pages, all of which are filled from
* compressed files on disk when first accessed. For simplicity
* these are
* one file per page. Files are written at the beginning of the
* program.
*
* Later, this program demonstrates the use of write protection to
* get
* a notification on write access, analogous to using
* mprotect(!PROT_WRITE)
* and doing the bookkeeping in a SIGSEGV handler.
*
*/
#include <linux/userfaultfd.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <time.h>
#include <math.h>
#include <unistd.h>
#include <strings.h>
#include <string.h>
#include <unistd.h>
#include <asm/unistd.h>
#include <poll.h>
#include <pthread.h>
#include "rdtsc.h"
//#define size 102400
// NOTE(review): FILE_BUF is not referenced anywhere in this listing.
#define FILE_BUF 25
// Expands to a sysconf() call, so every use of PAGE_SIZE is a run-time query.
#define PAGE_SIZE sysconf(_SC_PAGE_SIZE)
// Number of pages in the tracked region.
#define n_pages 102400
//102400
// Number of write-protect / touch rounds performed by main().
#define iterations 1
// Handle of the thread that runs tracker_task().
pthread_t tracker;
// Forward declaration (no parameter list); the definition takes a void *.
void *tracker_task();
// Counter of the outer iteration loop in main().
int i;
// Tick-counter samples taken at the start and end of main().
unsigned long main_tsc_start, main_tsc_end;
// Body of the uffd handler thread.
//
// data points to the userfaultfd descriptor (an int). The thread waits for
// events on that descriptor and, for every write-protect page fault, removes
// the write protection from the faulting page so the faulting thread can
// continue. It never returns; unrecoverable errors terminate the process.
void *tracker_task(void *data)
{
    int fd = *(int *)(data);

    for (;;)
    {
        struct uffd_msg msg;
        struct pollfd pollfd;
        ssize_t readret;

        pollfd.fd = fd;
        pollfd.events = POLLIN;
        pollfd.revents = 0;
        if (poll(&pollfd, 1, -1) == -1)
        {
            // an interrupted wait is harmless; anything else would leave
            // revents meaningless, so do not carry on with it
            if (errno == EINTR)
                continue;
            perror("poll");
            exit(1);
        }
        if (pollfd.revents & POLLERR)
        {
            fprintf(stderr, "POLLERR on userfaultfd\n");
            exit(1);
        }

        readret = read(fd, &msg, sizeof(msg));
        if (readret == -1)
        {
            // the descriptor is non-blocking: "nothing to read" is not an error
            if (errno == EAGAIN || errno == EINTR)
                continue;
            perror("read userfaultfd");
            exit(1);
        }
        if ((size_t)readret != sizeof(msg))
        {
            fprintf(stderr, "short read, not expected, exiting\n");
            exit(1);
        }

        // msg.arg.pagefault is only meaningful for page-fault events
        if (msg.event != UFFD_EVENT_PAGEFAULT)
            continue;

        /*
         * For the GC we expect that write-protected pages can only
         * be pages already backed by physical pages, so the
         * write-protect case is the only one handled here.
         */
        if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP)
        {
            // send write unlock for the page containing the faulting address
            struct uffdio_writeprotect wp;
            const unsigned long long page_size = (unsigned long long)PAGE_SIZE;

            wp.range.start = msg.arg.pagefault.address & ~(page_size - 1);
            wp.range.len = page_size;
            wp.mode = 0;
            if (ioctl(fd, UFFDIO_WRITEPROTECT, &wp) == -1)
                perror("ioctl(UFFDIO_WRITEPROTECT)");
        }
    }

    return NULL; // not reached
}
/*
 * Create a userfaultfd, register an anonymous mapping of n_pages pages for
 * write-protect tracking, start the handler thread and then, `iterations`
 * times, write-protect the whole region and touch every page so that each
 * write goes through the handler. Prints the elapsed tick count.
 *
 * Returns 0 on success; setup failures terminate with a non-zero status.
 */
int main(int argc, char *argv[])
{
    unsigned long *region;
    int uffd, t_create;
    struct uffdio_writeprotect wp;
    struct uffdio_api uffdio_api;
    struct uffdio_register uffdio_register;
    const char *clear = "3", *drop_caches_path = "/proc/sys/vm/drop_caches";
    FILE *drop_caches_file;
    const size_t page_size = (size_t)PAGE_SIZE;
    const size_t region_len = page_size * n_pages;
    const size_t words_per_page = page_size / sizeof(unsigned long);

    (void)argc;
    (void)argv;

    main_tsc_start = rdtsc();
    uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if (uffd == -1)
    {
        perror("syscall");
        exit(2);
    }

    uffdio_api.api = UFFD_API;
    /* same bit as the former literal 1, spelled by name */
    uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
    if (ioctl(uffd, UFFDIO_API, &uffdio_api))
    {
        fprintf(stderr, "UFFDIO_API\n");
        return 1;
    }
    if (uffdio_api.api != UFFD_API)
    {
        fprintf(stderr, "UFFDIO_API error %llu\n", (unsigned long long)uffdio_api.api);
        return 1;
    }

    /*
     * Allocate memory that will be tracked. mmap() reports failure with
     * MAP_FAILED, not NULL, and always returns page-aligned memory, so no
     * further alignment step is needed. (The former posix_memalign() call
     * stored a pointer to a second allocation into the first word of the
     * mapping and leaked it.)
     */
    region = (unsigned long *) mmap(NULL, region_len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
    if (region == MAP_FAILED)
    {
        perror("mmap");
        exit(2);
    }

    uffdio_register.range.start = (unsigned long)region;
    uffdio_register.range.len = region_len;
    uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
    if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1)
    {
        perror("ioctl(UFFDIO_REGISTER)");
        exit(1);
    }

    /* The only range ioctl this program relies on is UFFDIO_WRITEPROTECT. */
    const unsigned long long expected = 1ULL << _UFFDIO_WRITEPROTECT;
    if ((uffdio_register.ioctls & expected) != expected)
    {
        fprintf(stderr, "ioctl set is incorrect\n");
        exit(1);
    }

    /* Without the handler thread the protected writes below would never complete. */
    if ((t_create = pthread_create(&tracker, NULL, tracker_task, &uffd)) != 0)
    {
        errno = t_create;
        perror("pthread_create");
        exit(1);
    }

    /* Touch every page once so it is populated before being write-protected. */
    for (unsigned long page = 0; page < n_pages; page++)
    {
        unsigned long entry = page * words_per_page + ((rand() % 10000) % words_per_page);
        region[entry] = page;
    }

    /* Dropping caches is best effort: it needs privileges the caller may lack. */
    drop_caches_file = fopen(drop_caches_path, "w");
    if (drop_caches_file == NULL)
        perror("fopen(/proc/sys/vm/drop_caches)");

    for (i = 0; i < iterations; i++)
    {
        if (drop_caches_file != NULL)
        {
            /* write exactly the string and flush so it takes effect now */
            if (fputs(clear, drop_caches_file) == EOF || fflush(drop_caches_file) == EOF)
                perror("write drop_caches");
        }

        /* Indicate the range of pages to be write-protected */
        wp.range.start = (unsigned long long)region;
        wp.range.len = region_len;
        wp.mode = UFFDIO_WRITEPROTECT_MODE_WP;
        /* Write-protect pages */
        if (ioctl(uffd, UFFDIO_WRITEPROTECT, &wp) == -1)
        {
            perror("ioctl(UFFDIO_WRITEPROTECT)");
            exit(1);
        }

        /* Now "touch" the pages to trigger page faults and handling by the tracker thread */
        for (unsigned long page = 0; page < n_pages; page++)
        {
            unsigned long entry = page * words_per_page + ((rand() % 10000) % words_per_page);
            region[entry] = page;
        }
    }

    if (drop_caches_file != NULL)
        fclose(drop_caches_file);

    if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) {
        fprintf(stderr, "ioctl unregister failure\n");
        return 1;
    }
    main_tsc_end = rdtsc();
    printf("Apllication TSC : %lu\n", (unsigned long) (main_tsc_end-main_tsc_start));
    return 0;
}

Time to read data in Linux is different in formatted disk and Disk with deleted data

Trying read calls on disks using read() command on 2 different kind of disks:
ssize_t read(int fd, void *buf, size_t count);
http://man7.org/linux/man-pages/man2/read.2.html
Newly Formatted Disk
Disk where data was inserted and Deleted
Environment:
Disk Size: 500 GB
Disk Type: SSD
OS: Ubuntu 18.04.3
File System: ext4
Avg Read Time for Formatted Disk (in microseconds): 127.11
Avg Read Time for Disk (Data was inserted and deleted) (in microseconds): 514.76
Script for above:
#define _LARGEFILE64_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <time.h>
#include <signal.h>
#include <sys/time.h>
#include <sys/fcntl.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
/* Bytes requested per read() and the multiplier applied to the random offset. */
#define BLOCKSIZE 512
/* Benchmark duration in seconds. */
#define TIMEOUT 30
/* Number of completed seek+read rounds; incremented in main(), reported by done(). */
int count;
/* Wall-clock time at which the benchmark loop started. */
time_t start;
/* Accumulated durations in microseconds, as measured with getMicrotime(). */
long total_seek_time = 0, total_read_time = 0;
/* Number of reads that returned fewer than BLOCKSIZE bytes. */
int unread_complete_block_count = 0;
/*
 * SIGALRM handler. Until TIMEOUT seconds have passed since `start` it prints
 * a progress dot and re-arms a one-second alarm; afterwards it prints the
 * statistics (only if at least one round completed) and exits the process.
 */
void done()
{
    const time_t now = time(NULL);
    const int still_running = now < start + TIMEOUT;

    if (still_running)
    {
        printf(".");
        alarm(1);
        return;
    }

    if (count != 0)
    {
        printf(".\nResults: %d seeks/second, %.2f ms random access time\n",
               count / TIMEOUT, 1000.0 * TIMEOUT / count);
        printf("Total seek time: %ld, Avg seek time: %.2f\n",
               total_seek_time, 1.0 * total_seek_time / count);
        printf("Total read time: %ld, Avg read time: %.2f\n",
               total_read_time, 1.0 * total_read_time / count);
        printf("Unread Complete Block Count: %d\n", unread_complete_block_count);
    }

    exit(EXIT_SUCCESS);
}
/*
 * Abort helper: when `error` is non-zero, print `string` together with the
 * current errno text and exit with a failure status; otherwise do nothing.
 */
void handle(const char *string, int error)
{
    if (!error)
        return;

    perror(string);
    exit(EXIT_FAILURE);
}
/* Current wall-clock time from gettimeofday(), expressed in microseconds. */
long getMicrotime()
{
    struct timeval now;

    gettimeofday(&now, NULL);
    const time_t seconds_as_us = now.tv_sec * (int)1e6;
    return seconds_as_us + now.tv_usec;
}
/*
 * Random-read benchmark: for TIMEOUT seconds, seek to a random 512-byte
 * block of the device named in argv[1], read it and accumulate the time spent
 * in lseek64() and read(). The statistics are printed by done() when the
 * alarm expires; this function itself never returns from the loop.
 *
 * Fixes compared with the earlier version:
 *  - the "Device size" lines print the sizes reported by the device instead
 *    of the ioctl request numbers;
 *  - the generator that is seeded (srand) is the one that is used (rand);
 *  - the random block index is always below numblocks, so no read is issued
 *    at the very end of the device;
 *  - the lseek64() result is kept in an off64_t instead of an int.
 */
int main(int argc, char **argv)
{
    char buffer[BLOCKSIZE];
    int fd, retval;
    unsigned long numblocks = 0;
    unsigned long long numbytes = 0;
    off64_t offset, position;
    ssize_t nread;
    long seek_start, seek_end, read_start, read_end;

    int buf_ret_val = setvbuf(stdout, NULL, _IONBF, 0);
    if (buf_ret_val == 0)
    {
        printf("Buffer successfully allocated\n");
    }
    else
    {
        printf("Unable to allocate buffer. Error code, %d\n", buf_ret_val);
        exit(EXIT_FAILURE);
    }
    printf("Seeker v2.0, 2007-01-15, "
           "http://www.linuxinsight.com/how_fast_is_your_disk.html\n");
    if (argc != 2)
    {
        printf("Usage: seeker <raw disk device>\n");
        exit(EXIT_SUCCESS);
    }

    fd = open(argv[1], O_RDONLY);
    printf("File Descriptor: %d\n", fd);
    handle("open", fd < 0);

    retval = ioctl(fd, BLKGETSIZE, &numblocks);
    handle("ioctl", retval == -1);
    retval = ioctl(fd, BLKGETSIZE64, &numbytes);
    handle("ioctl", retval == -1);
    printf("Device size in sectors: %lu\n", numblocks);
    printf("Device size in bytes: %llu\n", numbytes);
    if (numblocks == 0)
    {
        printf("Device reports zero sectors, nothing to benchmark\n");
        exit(EXIT_FAILURE);
    }

    printf("Benchmarking %s [%luMB], wait %d seconds", argv[1], numblocks / 2048, TIMEOUT);
    time(&start);
    srand(start);
    signal(SIGALRM, &done);
    alarm(1);

    for (;;)
    {
        /* dividing by RAND_MAX + 1 keeps the index strictly below numblocks */
        offset = (off64_t)numblocks * rand() / ((off64_t)RAND_MAX + 1);

        seek_start = getMicrotime();
        position = lseek64(fd, BLOCKSIZE * offset, SEEK_SET);
        seek_end = getMicrotime();
        handle("lseek64", position == (off64_t)-1);

        read_start = getMicrotime();
        nread = read(fd, buffer, BLOCKSIZE);
        read_end = getMicrotime();
        handle("read", nread < 0);

        long current_seek_time = seek_end - seek_start;
        long current_read_time = read_end - read_start;
        if (nread != BLOCKSIZE)
        {
            printf("Didn't read complete block");
            unread_complete_block_count++;
        }
        total_seek_time += current_seek_time;
        total_read_time += current_read_time;
        count++;
    }
}
Compile: gcc {name}.c
Run: ./a.out /dev/{sdx}
I need to understand why this is happening.
An empty SSD can "know" it's empty, after TRIM, so it's like reading a sparse file. Data has to go over the SATA bus, but (I think) not necessarily come from flash cells at all.
So yes it matters what you did to the block device. The flash remapping layer inside the SSD is firmware running on a fairly powerful microcontroller with lots of RAM.

Time consumption of Linux RS485 serial communication

I'm trying to communicate with several Modbus RTU devices through one usb to RS232 to RS485 port at baudrate 38400, 1 start bit, 8databits, no parity and 1 stop bit.
Communication processes with one Modbus RTU device is as follows:
Send 8 bytes to the device;
wait for replies from the device;
Receive 23 bytes of replies.
According to my calculation and the digital oscilloscope, send 8 bytes costs 2.083ms, receive 23 bytes costs 5.99ms, response time of the Modbus RTU device is about 1.3ms. So time of the communication process costs 9.373ms in total.
But in my test program I found the average communication time is about 15ms (10000 times average). I wonder where does the additional 5 more milliseconds come from and how could I optimize my program to reduce this time.
Thanks in advance!
The test program is as follows:
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <termios.h>
#include <errno.h>
#include <string.h>
#include <signal.h>
/*
 * Print the first `size` bytes of `buffer` to stdout as two-digit lowercase
 * hex values, each followed by a space, and terminate the line.
 * Nothing but the newline is printed when size is zero or negative.
 */
void print_hex_buf(unsigned char *buffer, int size)
{
    int idx = 0;

    while (idx < size)
    {
        printf("%02x ", buffer[idx]);
        idx++;
    }
    printf("\n");
}
/*
 * Store t2 - t1 in *diff, normalised so that 0 <= diff->tv_usec < 1000000.
 *
 * tv_sec counts seconds since the epoch, not seconds since midnight, so there
 * is no day wrap-around to compensate for. The former special case added
 * 24*60*60 whenever t1 was later than t2 and thereby reported a bogus positive
 * interval; such an input now yields a negative tv_sec.
 */
void diff_time(struct timeval t1, struct timeval t2, struct timeval *diff)
{
    time_t sec = t2.tv_sec - t1.tv_sec;
    suseconds_t usec = t2.tv_usec - t1.tv_usec;

    /* borrow one second when the microsecond part went negative */
    if (usec < 0)
    {
        sec -= 1;
        usec += 1000000;
    }
    diff->tv_sec = sec;
    diff->tv_usec = usec;
}
/*
 * Issue a single write() of `size` bytes from `buffer` to `uart_fd`.
 * Returns the write() result unchanged: the number of bytes accepted, which
 * may be less than `size`, or -1 on error.
 */
int serial_write(int uart_fd, char *buffer, int size)
{
    return (int)write(uart_fd, buffer, size);
}
/*
 * Read up to `size` bytes from `uart_fd` into `buffer`.
 *
 * The function waits with select() (at most 500ms per wait) before every
 * read(), so it sleeps until data is available. The previous version waited
 * once and then spun through up to 500 back-to-back reads, which on a
 * non-blocking descriptor burns CPU and can give up before the rest of a
 * reply has arrived.
 *
 * Returns the number of bytes stored. That is `size` on a complete reply, or
 * fewer if the line went quiet, reached end of file, or failed part-way.
 * Returns -1 (after printing a message) if nothing at all arrived within the
 * first wait.
 */
int serial_read(int uart_fd, char *buffer, int size)
{
    int bytes_read = 0;

    while (bytes_read < size)
    {
        fd_set fds_read;
        struct timeval timeout;

        FD_ZERO(&fds_read);
        FD_SET(uart_fd, &fds_read);
        /* re-armed on every pass: select() may modify the timeout */
        timeout.tv_sec = 0;
        timeout.tv_usec = 500000; //500ms

        int ret = select(uart_fd + 1, &fds_read, NULL, NULL, &timeout);
        if (ret < 0 && errno == EINTR)
            continue;
        if (ret <= 0 || !FD_ISSET(uart_fd, &fds_read))
        {
            if (bytes_read > 0)
                break; /* partial reply: hand back what we have */
            printf("Failed to read from uart!\n");
            return -1;
        }

        ssize_t count = read(uart_fd, buffer + bytes_read, size - bytes_read);
        if (count > 0)
            bytes_read += (int)count;
        else if (count == 0)
            break; /* end of file / hang-up */
        else if (errno != EAGAIN && errno != EINTR)
            break; /* hard error: stop instead of retrying forever */
    }
    return bytes_read;
}
/*
 * Modbus RTU round-trip timer.
 *
 * Usage: testuart [uart device] [iteration]
 * Configures the port (38400 baud, 8 data bits, no parity, raw mode), then
 * sends the fixed 8-byte request `iteration` times, waits for the 23-byte
 * reply each time and prints the per-round and average round-trip time.
 *
 * The average is computed from the full elapsed time (seconds and
 * microseconds) over a running total. The previous version ignored the
 * seconds part and re-averaged an already truncated value every round.
 */
int main(int argc, char** argv)
{
    int fd;
    struct termios opt;
    int count;
    unsigned char send_buf[] = { 0x01, 0x04, 0x00, 0x00, 0x00, 0x09, 0x30, 0x0c};
    unsigned char buffer[256];
    int iteration = 0;
    int cycle = 0;
    long long total_usec = 0;
    long average_time = 0;

    setbuf(stdout, NULL);
    if (argc != 3)
    {
        printf("Usage: testuart [uart device] [iteration]\n");
        return 0;
    }
    iteration = atoi(argv[2]);

    fd = open(argv[1], O_RDWR | O_NOCTTY | O_NDELAY);
    if (fd == -1)
    {
        printf("Failed to open port: %s\n", argv[1]);
        return -1;
    }
    if (tcgetattr(fd, &opt) != 0)
    {
        printf("Failed to get uart attribute!\n");
        close(fd);
        return -1;
    }
    opt.c_cflag = B38400|CS8|CREAD|CLOCAL;
    opt.c_iflag = IGNPAR;
    opt.c_cflag &= ~PARENB;
    opt.c_cflag &= ~PARODD;
    opt.c_lflag &= ~(ICANON | ECHO | ECHOE | ISIG);
    opt.c_oflag &= ~OPOST;
    opt.c_iflag &= ~(IGNBRK|BRKINT|PARMRK|ISTRIP|INLCR|IGNCR|ICRNL|IXON);
    tcflush(fd, TCIFLUSH);
    if (tcsetattr(fd, TCSANOW, &opt) != 0)
    {
        printf("Failed to setup serial port!\n");
        close(fd);
        return -1;
    }

    while (cycle++ < iteration)
    {
        struct timeval tm_start;
        struct timeval tm_end;
        struct timeval tm_diff;

        printf("Send hex command:\n");
        print_hex_buf(send_buf, 8);

        gettimeofday(&tm_start, NULL);
        count = serial_write(fd, (char *)send_buf, 8);
        if (count != 8)
        {
            printf("Failed to write 8 bytes!\n");
            close(fd);
            return -1;
        }
        count = serial_read(fd, (char *)buffer, 23);
        if (count <= 0)
        {
            printf("serial read returns %d\n", count);
            close(fd);
            return -1;
        }
        gettimeofday(&tm_end, NULL);

        diff_time(tm_start, tm_end, &tm_diff);
        print_hex_buf(buffer, count);
        printf("serial communication costs %ld.%06ld seconds.\n",
               (long)tm_diff.tv_sec, (long)tm_diff.tv_usec);

        /* whole elapsed time of this round, not just its microsecond part */
        total_usec += (long long)tm_diff.tv_sec * 1000000LL + tm_diff.tv_usec;
        average_time = (long)(total_usec / cycle);
    }
    printf("%d times, average time in usec is %ld\n", cycle-1, average_time);
    close(fd);
    return 0;
}
Thanks to sawdust!
The following link helps! The average time has reduced from 15ms to 10ms.
High delay in RS232 communication on a PXA270

Clock frequency setting doesn't change simulation speed

I'm trying to run the following AVR program on SimAVR:
#include <avr/io.h>
#include <util/delay.h>
/*
 * Blink firmware: configure PB5 as an output, then toggle it forever with a
 * 2000 ms busy-wait delay between toggles.
 */
int main ()
{
    DDRB |= _BV(DDB5);

    while (1)
    {
        PORTB ^= _BV(PB5);
        _delay_ms(2000);
    }
}
I've compiled it with F_CPU=16000000. The SimAVR runner is as follows:
#include <stdlib.h>
#include <stdio.h>
#include <pthread.h>
#include "sim_avr.h"
#include "avr_ioport.h"
#include "sim_elf.h"
/* The simulated MCU; created in main() and stepped by avr_run_thread(). */
avr_t * avr = NULL;
/*
 * Thread body: call avr_run() on the global `avr` in an endless loop.
 * The argument is unused and the function never returns in practice.
 */
static void* avr_run_thread(void * ignore)
{
    (void)ignore;

    while (1)
        avr_run(avr);

    return NULL;
}
/*
 * IRQ notification callback registered for port B pin 5: prints the IRQ
 * number and the new value. `param` is unused.
 */
void led_changed_hook(struct avr_irq_t* irq, uint32_t value, void* param)
{
    (void)param;
    printf("led_changed_hook %d %d\n", irq->irq, value);
}
/*
 * SimAVR runner: load image.elf into a simulated atmega328p clocked at
 * 16 MHz, report every change of port B pin 5 through led_changed_hook() and
 * run the simulation in its own thread.
 *
 * Changes compared with the earlier version:
 *  - the firmware descriptor is zero-initialised and a failed ELF load is
 *    reported instead of continuing with indeterminate contents;
 *  - a failed thread creation is reported;
 *  - the main thread blocks in pthread_join() instead of spinning in an
 *    empty loop, which kept a CPU core busy for no purpose.
 */
int main(int argc, char *argv[])
{
    elf_firmware_t f = {0};
    const char *mmcu = "atmega328p";

    (void)argc;

    if (elf_read_firmware("image.elf", &f) != 0) {
        fprintf(stderr, "%s: cannot read firmware 'image.elf'\n", argv[0]);
        exit(1);
    }
    f.frequency = 16e6;

    avr = avr_make_mcu_by_name(mmcu);
    if (!avr) {
        fprintf(stderr, "%s: AVR '%s' not known\n", argv[0], mmcu);
        exit(1);
    }
    avr_init(avr);
    avr_load_firmware(avr, &f);
    avr_irq_register_notify(
        avr_io_getirq(avr, AVR_IOCTL_IOPORT_GETIRQ('B'), 5),
        led_changed_hook,
        NULL);

    pthread_t run;
    const int rc = pthread_create(&run, NULL, avr_run_thread, NULL);
    if (rc != 0) {
        fprintf(stderr, "%s: cannot start simulation thread (error %d)\n", argv[0], rc);
        exit(1);
    }

    /* the simulation thread loops forever, so this blocks without using CPU */
    pthread_join(run, NULL);
    return 0;
}
The problem is that I see from the output of led_changed_hook that it runs at ~4x speed. Moreover, changing f.frequency doesn't seem to have any effect on the simulation speed whatsoever.
How do I ensure that SimAVR runs the simulation at the correct real-time speed?
It turns out SimAVR doesn't support timing-accurate simulation of opcodes so the simulation time of running the busy-wait of _delay_ms to completion is completely unrelated to
how long it would take on the real MCU
the clock frequency of the simulated MCU
The correct solution is to use a timer interrupt, and then go to sleep on the MCU. The simulator will correctly simulate the timer counters and the sleep will suspend the simulation until the timer fires.
#include <avr/interrupt.h>
#include <avr/power.h>
#include <avr/sleep.h>
/*
 * Timer-driven blink firmware: configure PB5 as an output, set up TIMER1 with
 * compare value 31248 and the compare-match A interrupt enabled, then sleep
 * in idle mode forever. The toggling itself happens in the TIMER1_COMPA ISR.
 */
int main ()
{
    DDRB |= _BV(DDB5);

    /* start from a clean timer state */
    TCCR1A = 0;
    TCCR1B = 0;
    TCNT1 = 0;

    TIMSK1 |= _BV(OCIE1A);
    sei();

    /* Set TIMER1 to 0.5 Hz */
    TCCR1B |= _BV(WGM12);
    OCR1A = 31248;
    TCCR1B |= (_BV(CS12) | _BV(CS10));

    set_sleep_mode(SLEEP_MODE_IDLE);
    sleep_enable();

    while (1)
    {
        sleep_mode();
    }
}
/* TIMER1 compare-match A interrupt: toggle PB5. */
ISR(TIMER1_COMPA_vect)
{
    PORTB = PORTB ^ _BV(PB5);
}

Zero copy in using vmsplice/splice in Linux

I am trying to get zero copy semantics working in linux using
vmsplice()/splice() but I don't see any performance improvement. This
is on linux 3.10, tried on 3.0.0 and 2.6.32. The following code tries
to do file writes, I have tried network socket writes() also, couldn't
see any improvement.
Can somebody tell what am I doing wrong ?
Has anyone gotten improvement using vmsplice()/splice() in production ?
#include <assert.h>
#include <fcntl.h>
#include <iostream>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <unistd.h>
#include <vector>
// Path of the scratch file that is written, validated and then unlinked.
const char *filename = "Test-File";
// Bytes per write / splice call.
const int block_size = 4 * 1024;
// Total bytes written per run; file_size / block_size blocks are used.
const int file_size = 4 * 1024 * 1024;
using namespace std;
// Pipe used by SpliceWrite(): [1] is the vmsplice() target, [0] the splice() source.
int pipes[2];
// One buffer of block_size bytes per block, allocated by CreateData().
vector<char *> file_data;
// Returns a microsecond timestamp suitable for measuring intervals.
//
// The value is the wall-clock time elapsed since the first call of this
// function, not since the epoch: epoch microseconds do not fit in an int
// (they overflowed in the previous version), whereas the elapsed time of this
// program does. Differences between two return values are unaffected.
static int NowUsecs() {
  struct timeval tv;
  const int err = gettimeofday(&tv, NULL);
  assert(err >= 0);
  (void)err;

  const long long now = tv.tv_sec * 1000000LL + tv.tv_usec;
  // initialised exactly once, on the first call
  static const long long first_call = now;
  return static_cast<int>(now - first_call);
}
void CreateData() {
for (int xx = 0; xx < file_size / block_size; ++xx) {
// The data buffer to fill.
char *data = NULL;
assert(posix_memalign(reinterpret_cast<void **>(&data), 4096, block_size) == 0);
file_data.emplace_back(data);
}
}
// Writes buf[0, buf_len) to fd by first mapping the buffer into the pipe with
// vmsplice() and then moving the pipe contents to fd with splice().
//
// Only as many bytes as were actually queued into the pipe are drained from
// it. The previous version always tried to drain buf_len bytes, which blocks
// forever if vmsplice() stopped early.
//
// Returns 1 when all buf_len bytes reached fd, 0 otherwise.
int SpliceWrite(int fd, char *buf, int buf_len) {
  struct iovec iov;
  iov.iov_base = buf;
  iov.iov_len = buf_len;

  // Stage 1: user memory -> pipe.
  int queued = 0;
  while (queued < buf_len) {
    const ssize_t ret = vmsplice(pipes[1], &iov, 1, SPLICE_F_GIFT);
    assert(ret >= 0);
    if (ret <= 0)
      break;
    queued += static_cast<int>(ret);
    iov.iov_base = static_cast<char *>(iov.iov_base) + ret;
    iov.iov_len -= ret;
  }

  // Stage 2: pipe -> file descriptor.
  int drained = 0;
  while (drained < queued) {
    const ssize_t ret =
        splice(pipes[0], NULL, fd, NULL, queued - drained, SPLICE_F_MOVE);
    assert(ret >= 0);
    if (ret <= 0)
      break;
    drained += static_cast<int>(ret);
  }
  return drained == buf_len ? 1 : 0;
}
int WriteToFile(const char *filename, bool use_splice) {
// Open and write to the file.
mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;
int fd = open(filename, O_CREAT | O_RDWR, mode);
assert(fd >= 0);
const int start = NowUsecs();
for (int xx = 0; xx < file_size / block_size; ++xx) {
if (use_splice) {
SpliceWrite(fd, file_data[xx], block_size);
} else {
assert(write(fd, file_data[xx], block_size) == block_size);
}
}
const int time = NowUsecs() - start;
// Close file.
assert(close(fd) == 0);
return time;
}
void ValidateData() {
// Open and read from file.
const int fd = open(filename, O_RDWR);
assert(fd >= 0);
char *read_buf = (char *)malloc(block_size);
for (int xx = 0; xx < file_size / block_size; ++xx) {
assert(read(fd, read_buf, block_size) == block_size);
assert(memcmp(read_buf, file_data[xx], block_size) == 0);
}
// Close file.
assert(close(fd) == 0);
assert(unlink(filename) == 0);
}
// Benchmark driver: create the pipe and the data blocks, time one run with
// plain write() and one run with vmsplice()/splice(), validating the file
// after each, then print both timings.
int main(int argc, char **argv) {
  const int pipe_rc = pipe(pipes);
  assert(pipe_rc == 0);

  CreateData();

  const bool kPlainWrite = false;
  const bool kSplice = true;

  const int without_splice = WriteToFile(filename, kPlainWrite);
  ValidateData();

  const int with_splice = WriteToFile(filename, kSplice);
  ValidateData();

  cout << "TIME WITH SPLICE: " << with_splice << endl;
  cout << "TIME WITHOUT SPLICE: " << without_splice << endl;
  return 0;
}
I did a proof-of-concept some years ago where I got as 4x speedup using an optimized, specially tailored, vmsplice() code. This was measured against a generic socket/write() based solution. This blog post from natsys-lab echoes my findings. But I believe you need to have the exact right use case to get near this number.
So what are you doing wrong? Primarily I think you are measuring the wrong thing. When writing directly to a file you have 1 system call, which is write(). And you are not actually copying data (except to the kernel). When you have a buffer with data that you want to write to disk, it's not gonna get faster than that.
In your vmsplice/splice setup you are still copying your data into the kernel, but you have a total of 2 system calls, vmsplice()+splice(), to get it to disk. The speed being identical to write() is probably just a testament to Linux system call speed :-)
A more "fair" setup would be to write one program that read() from stdin and write() the same data to stdout. Write an identical program that simply splice() stdin into a file (or point stdout to a file when you run it). Although this setup might be too simple to really show anything.
Aside: an (undocumented?) feature of vmsplice() is that you can also use it to read data from a pipe. I used this in my old POC. It was basically just an IPC layer based on the idea of passing memory pages around using vmsplice().
Note: NowUsecs() probably overflows the int

Resources