Why didn't my Linux perform lazy memory allocation? - linux

I'm experimenting with Linux's lazy allocation and demand paging policies.
I want a buffer allocated with mmap() to occupy NO physical memory until I actually write something to it.
Furthermore, I want its physical footprint to grow gradually, in steps of the page size (e.g. 4K), as I write to it continuously from head to tail.
According to some docs and search results, it should NOT grow if the buffer is only read, but that is not what I observed in my experiment.
To test this, I wrote the following program and watched its memory usage with the top command while it was running.
#include <sys/mman.h>
#include <cerrno>
#include <chrono>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <thread>
using namespace std;
using namespace std::chrono_literals;

constexpr size_t BUF_SIZE = 1024 * 1024 * 1024;

int main( int argc, char** argv ) {
    auto shm_pt = mmap( NULL, BUF_SIZE, PROT_READ | PROT_WRITE,
                        MAP_SHARED | MAP_ANONYMOUS, -1, 0 );
    if( shm_pt == MAP_FAILED ) {
        std::cerr << "mmap error: " << strerror( errno ) << endl;
        exit( EXIT_FAILURE );
    }

    bool full_zero = true;
    uint8_t* pc = reinterpret_cast<uint8_t*>( shm_pt );
    constexpr size_t STEP_SIZE = 1024 * 1024;

    // Pass 1: only read the buffer, 1 MiB at a time.
    for( size_t j = 0; j < BUF_SIZE / STEP_SIZE; ++j ) {
        this_thread::sleep_for( 100ms );
        size_t base = j * STEP_SIZE;
        std::cerr << "Reading from " << base / 1024 / 1024 << "M..." << endl;
        for( size_t i = 0; i < STEP_SIZE; ++i )
            full_zero = full_zero && pc[ base + i ] == 0;
    }
    if( !full_zero )
        std::cerr << "The buffer has not been initialized with full zeros!";

    // Pass 2: write the buffer, 1 MiB at a time.
    for( size_t j = 0; j < BUF_SIZE / STEP_SIZE; ++j ) {
        this_thread::sleep_for( 100ms );
        size_t base = j * STEP_SIZE;
        std::cerr << "Writing to " << base / 1024 / 1024 << "M..." << endl;
        for( size_t i = 0; i < STEP_SIZE; ++i )
            pc[ base + i ] = 'c';
    }

    munmap( shm_pt, BUF_SIZE );
    return EXIT_SUCCESS;
}
What I observed is that the physical memory used by my app grows gradually during the reading pass, not during the writing pass!
Is my understanding wrong?

I got it!
In the search results I mentioned, the author passed the MAP_PRIVATE flag to mmap(), while I used MAP_SHARED.
It looks like when a buffer can be shared between processes, a READ access also triggers real physical page allocation!
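Here is a minimal sketch of the difference, under the assumption that a private anonymous mapping lets read faults be satisfied by the kernel's shared zero page; with MAP_PRIVATE the RSS shown in top should stay small during the read pass and grow only during the write pass:

#include <sys/mman.h>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Sketch only: the same experiment with MAP_PRIVATE instead of MAP_SHARED.
int main() {
    constexpr size_t BUF_SIZE = 1024UL * 1024 * 1024;
    void* p = mmap( NULL, BUF_SIZE, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0 );
    if( p == MAP_FAILED ) {
        perror( "mmap" );
        return EXIT_FAILURE;
    }
    uint8_t* pc = static_cast<uint8_t*>( p );

    volatile uint8_t sink = 0;
    for( size_t i = 0; i < BUF_SIZE; ++i )   // read pass: RSS should stay flat
        sink = pc[ i ];

    for( size_t i = 0; i < BUF_SIZE; ++i )   // write pass: RSS grows page by page
        pc[ i ] = 'c';

    munmap( p, BUF_SIZE );
    return EXIT_SUCCESS;
}

With MAP_SHARED | MAP_ANONYMOUS the region is backed by shared (tmpfs-style) pages instead, and a read fault apparently has to allocate a real page to map, which matches the growth observed during the read pass.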

Related

How do limits on shared memory work on Linux

I was looking into the Linux kernel limits on shared memory.
/proc/sys/kernel/shmall
specifies the maximum number of pages that can be allocated. If this number is x and the page size is p, I assume that x * p bytes is the limit on system-wide shared memory.
Now I wrote a small program that creates a shared memory segment and attaches to it twice, as below:
shm_id = shmget(IPC_PRIVATE, 4 * sizeof(int), IPC_CREAT | 0666);
if (shm_id < 0) {
    printf("shmget error\n");
    exit(1);
}
printf("\n The shared memory created is %d", shm_id);

ptr = shmat(shm_id, NULL, 0);
ptr_info = shmat(shm_id, NULL, 0);
In the above program, ptr and ptr_info were different, so the shared memory is mapped at two virtual addresses in my process address space.
When I do an ipcs it looks like this
...
0x00000000 1638416 sun 666 16000000 2
...
Now coming to the shmall limit of x * p noted above: does this limit apply to the sum of all the virtual memory mapped for every shared memory segment, or does it apply to the physical memory?
There is only one piece of physical memory here (the shared segment), yet from the program above the two shmat() calls map twice that amount into my process address space. So will this limit be hit soon if I keep calling shmat() on a single shared memory segment?
The limit only applies to physical memory, that is, the real shared memory allocated for all segments, because shmat() just maps an already-allocated segment into the process address space.
You can trace it in the kernel: there is only one place where this limit is checked, the newseg() function that allocates new segments (the ns->shm_ctlall comparison). The shmat() implementation is busy with a lot of stuff, but doesn't care at all about the shmall limit, so you can map one segment as many times as you want (well, address space is also limited, but in practice you rarely care about that limit).
You can also test this from userspace with a simple program like this one:
#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <unistd.h>

unsigned long int get_shmall() {
    FILE *f = NULL;
    char buf[512];
    unsigned long int value = 0;

    if ((f = fopen("/proc/sys/kernel/shmall", "r")) != NULL) {
        if (fgets(buf, sizeof(buf), f) != NULL)
            value = strtoul(buf, NULL, 10); // no proper checks
        fclose(f); // no return value check
    }
    return value;
}

int set_shmall(unsigned long int value) {
    FILE *f = NULL;
    char buf[512];
    int retval = 0;

    if ((f = fopen("/proc/sys/kernel/shmall", "w")) != NULL) {
        if (snprintf(buf, sizeof(buf), "%lu\n", value) >= sizeof(buf) ||
            fwrite(buf, 1, strlen(buf), f) != strlen(buf))
            retval = -1;
        fclose(f); // fingers crossed
    } else
        retval = -1;
    return retval;
}

int main()
{
    int shm_id1 = -1, shm_id2 = -1;
    unsigned long int shmall = 0, shmused, newshmall;
    void *ptr1, *ptr2;
    struct shm_info shminf;

    if ((shmall = get_shmall()) == 0) {
        printf("can't get shmall\n");
        goto out;
    }
    printf("original shmall: %lu pages\n", shmall);

    if (shmctl(0, SHM_INFO, (struct shmid_ds *)&shminf) < 0) {
        printf("can't get SHM_INFO\n");
        goto out;
    }
    shmused = shminf.shm_tot * getpagesize();
    printf("shmused: %lu pages (%lu bytes)\n", shminf.shm_tot, shmused);

    newshmall = shminf.shm_tot + 1;
    if (set_shmall(newshmall) != 0) {
        printf("can't set shmall\n");
        goto out;
    }
    if (get_shmall() != newshmall) {
        printf("something went wrong with shmall setting\n");
        goto out;
    }
    printf("new shmall: %lu pages (%lu bytes)\n", newshmall, newshmall * getpagesize());

    printf("shmget() for %u bytes: ", (unsigned int) getpagesize());
    shm_id1 = shmget(IPC_PRIVATE, (size_t)getpagesize(), IPC_CREAT | 0666);
    if (shm_id1 < 0) {
        printf("failed: %s\n", strerror(errno));
        goto out;
    }

    printf("ok\nshmat 1: ");
    ptr1 = shmat(shm_id1, NULL, 0);
    if (ptr1 == (void *) -1) { // shmat() returns (void *) -1 on failure, not 0
        printf("failed\n");
        goto out;
    }

    printf("ok\nshmat 2: ");
    ptr2 = shmat(shm_id1, NULL, 0);
    if (ptr2 == (void *) -1) {
        printf("failed\n");
        goto out;
    }
    printf("ok\n");

    if (ptr1 == ptr2) {
        printf("ptr1 and ptr2 are the same with shm_id1\n");
        goto out;
    }

    printf("shmget() for %u bytes: ", (unsigned int) getpagesize());
    shm_id2 = shmget(IPC_PRIVATE, (size_t)getpagesize(), IPC_CREAT | 0666);
    if (shm_id2 < 0)
        printf("failed: %s\n", strerror(errno));
    else
        printf("ok, although it's wrong\n");

out:
    if (shmall != 0 && set_shmall(shmall) != 0)
        printf("failed to restore shmall\n");
    if (shm_id1 >= 0 && shmctl(shm_id1, IPC_RMID, NULL) < 0)
        printf("failed to remove shm_id1\n");
    if (shm_id2 >= 0 && shmctl(shm_id2, IPC_RMID, NULL) < 0)
        printf("failed to remove shm_id2\n");
    return 0;
}
What it does is set the shmall limit just one page above what is currently used by the system, then get a new page-sized segment and map it twice (all successfully), then try to get one more page-sized segment and fail (run the program as superuser because it writes to /proc/sys/kernel/shmall):
$ sudo ./a.out
original shmall: 18446744073708503040 pages
shmused: 21053 pages (86233088 bytes)
new shmall: 21054 pages (86237184 bytes)
shmget() for 4096 bytes: ok
shmat 1: ok
shmat 2: ok
shmget() for 4096 bytes: failed: No space left on device
I did not find any physical memory allocation in the do_shmat() function (linux/ipc/shm.c):
https://github.com/torvalds/linux/blob/5469dc270cd44c451590d40c031e6a71c1f637e8/ipc/shm.c
So shmat() consumes only virtual memory (your process address space);
what shmat() mainly does is an mmap of the already-allocated segment.
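A quick way to see this from userspace (a sketch; the 1 MiB segment size and the attach count are arbitrary choices of mine): attach the same segment repeatedly and note that ipcs -m still reports a single segment while only the process's virtual size grows.

#include <stdio.h>
#include <stdlib.h>
#include <sys/ipc.h>
#include <sys/shm.h>

/* Sketch: one physical segment, many attaches. Each shmat() only adds a
 * mapping to the process address space; `ipcs -m` keeps reporting a single
 * segment, and only VmSize in /proc/self/status grows with each attach. */
int main(void)
{
    int id = shmget(IPC_PRIVATE, 1024 * 1024, IPC_CREAT | 0666);
    if (id < 0) {
        perror("shmget");
        return 1;
    }

    for (int i = 0; i < 16; ++i) {
        void *p = shmat(id, NULL, 0);
        if (p == (void *) -1) {
            perror("shmat");
            break;
        }
        printf("attach %d at %p\n", i, p);
    }

    shmctl(id, IPC_RMID, NULL);   /* destroyed once the last mapping goes away */
    return 0;
}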

zLib transparent write mode "wT" performance degradation

I would expect zlib's transparent mode (gzprintf()) to be as fast as regular fprintf(). I found that zlib's gzprintf() with "wT" is about 2.5x slower than fprintf(). Is there any workaround for this performance issue?
Details:
I'm using libz.so.1.2.8 on Linux (Fedora 22, kernel 4.0.5, Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz) to provide an output-file compression option in my event trace collector. To keep legacy compatibility I need a transparent file-writing mode.
As I understand it, the "T" option to gzopen() allows writing files with no compression and no gzip header.
The problem is performance: transparent mode is ~2.5x slower than plain standard fprintf().
Here is a quick test result (values are in TSC ticks):
zLib]$ ./zlib_transparent
Performance fprintf vs gzprintf (transparent):
fprintf 22883026324
zLib transp 62305122876
ratio 2.72277
The source for this test:
#include <stdio.h>
#include <zlib.h>
#include <iostream>
#include <sstream>
#include <iomanip>

#define NUMITERATIONS 10000000

static double buffer[NUMITERATIONS];

static __inline__ unsigned long long rdtsc(void)
{
    unsigned hi, lo;
    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
    return ((unsigned long long)lo) | (((unsigned long long)hi) << 32);
}

long long test_fprintf(double *buffer)
{
    long long t = rdtsc();
#ifdef USE_FPRINTF
    double tmp = 0;
    FILE *file = fopen("fprintf_file.txt", "w");
    for (int i = 0; i < NUMITERATIONS; ++i) {
        fprintf(file, "[%f:%f]\n", buffer[i], buffer[i] - tmp);
        tmp = buffer[i] + i;
    }
    fclose(file);
#endif
    return rdtsc() - t;
}

long long test_zlib_transparent(double *buffer)
{
    long long t = rdtsc();
#ifdef USE_ZLIB
    double tmp = 0;
    gzFile file = gzopen("zlib_file.txt.gz", "wT");
    for (int i = 0; i < NUMITERATIONS; ++i) {
        gzprintf(file, "[%f:%f]\n", buffer[i], buffer[i] - tmp);
        tmp = buffer[i] + i;
    }
    gzclose(file);
#endif
    return rdtsc() - t;
}

int main()
{
    std::cout << "Performance fprintf vs gzprintf (transparent):" << std::endl;
    long long dPrint = test_fprintf(buffer);
    std::cout << " fprintf " << dPrint << std::endl;
    long long dStream = test_zlib_transparent(buffer);
    std::cout << "zLib transp " << dStream << std::endl;
    std::cout << "ratio " << double(dStream)/double(dPrint) << std::endl;
    return 0;
}
Build:
g++ -g -O3 -DUSE_ZLIB=1 -DUSE_FPRINTF=1 zlib_transparent.cpp -o zlib_transparent -lz
Thank you
Sergey
My bad. (I wrote gzprintf().)
write() is being called too often. You will get approximately the same performance as zlib if you replace fprintf() with snprintf() and write().
I will improve this in the next version of zlib. If you would like to try it, apply this diff. I don't know how it will perform on Linux, but on Mac OS X, gzprintf() in transparent mode is now 10% faster than fprintf(). (Wasn't expecting that.)
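To illustrate what the answer means (my own sketch, not the diff it refers to): a third test function in the same style as test_fprintf() above, reusing the rdtsc() helper and NUMITERATIONS from the test program. The output file name and the 128-byte line buffer are arbitrary choices.

#include <fcntl.h>      // open()  -- extra includes needed for this variant
#include <unistd.h>     // write(), close()

// Same benchmark shape as test_fprintf(), but formatting with snprintf() and
// pushing each record out with an unbuffered write(). The one-syscall-per-record
// pattern is roughly what transparent-mode gzprintf() ends up doing, so the
// timing should land close to the zlib number above.
long long test_snprintf_write(double *buffer)
{
    long long t = rdtsc();
    int fd = open("snprintf_write_file.txt", O_WRONLY | O_CREAT | O_TRUNC, 0644);
    if (fd < 0)
        return rdtsc() - t;

    double tmp = 0;
    char line[128];
    for (int i = 0; i < NUMITERATIONS; ++i) {
        int len = snprintf(line, sizeof(line), "[%f:%f]\n", buffer[i], buffer[i] - tmp);
        if (len > 0 && write(fd, line, (size_t)len) != len)   // one syscall per record
            break;
        tmp = buffer[i] + i;
    }
    close(fd);
    return rdtsc() - t;
}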

Interpreting time command output on a multi threaded program

I have a multithreaded program, and I am measuring the time taken between a point just before all the pthread_create() calls and a point just after all the pthread_join() calls.
Now I find that this time, let's call it X (shown below as "Done in Xms"), is actually the user + sys time of the time output. In my app the numeric argument to a.out controls how many threads to spawn: ./a.out 1 spawns 1 thread and ./a.out 2 spawns 2 threads, where each thread does the same amount of work.
I was expecting X to be the real time instead of the user + sys time. Can someone please tell me why this is not so? If it is, that really means my app is indeed running in parallel without any locking between threads.
[jithin#whatsoeverclever tests]$ time ./a.out 1
Done in 320ms
real 0m0.347s
user 0m0.300s
sys 0m0.046s
[jithin#whatsoeverclever tests]$ time ./a.out 2
Done in 450ms
real 0m0.266s
user 0m0.383s
sys 0m0.087s
[jithin#whatsoeverclever tests]$ time ./a.out 3
Done in 630ms
real 0m0.310s
user 0m0.532s
sys 0m0.105s
Code
int main(int argc, char **argv) {
    //Read the words
    getWords();

    //Set number of words to use
    int maxWords = words.size();
    if(argc > 1) {
        int numWords = atoi(argv[1]);
        if(numWords > 0 && numWords < maxWords) maxWords = numWords;
    }

    //Init model
    model = new Model(MODEL_PATH);

    pthread_t *threads = new pthread_t[maxWords];
    pthread_attr_t attr;
    void *status;

    // Initialize and set thread joinable
    pthread_attr_init(&attr);
    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);

    int rc;
    clock_t startTime = clock();
    for(unsigned i=0; i<maxWords; i++) {
        //create thread
        rc = pthread_create(&threads[i], NULL, processWord, (void *)&words[i] );
        if (rc){
            cout << "Error:unable to create thread: " << i << "," << rc << endl;
            exit(-1);
        }
    }

    // free attribute and wait for the other threads
    pthread_attr_destroy(&attr);
    for(unsigned i=0; i<maxWords; i++) {
        rc = pthread_join(threads[i], &status);
        if (rc){
            cout << "Error:unable to join thread: " << i << "," << rc << endl;
            exit(-1);
        }
    }
    clock_t endTime = clock();

    float diff = (((float)endTime - (float)startTime) / 1000000.0F ) * 1000;
    cout<<"Done in "<< diff << "ms\n";

    delete[] threads;
    delete model;
}
The clock function is specifically documented to return the processor time used by a process. If you want to measure wall time elapsed, it's not the right function.
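As a sketch of the fix implied by the answer (my own example, using std::chrono::steady_clock as the wall-clock source; the sleep merely stands in for the thread create/join section being timed):

#include <chrono>
#include <iostream>
#include <thread>

// Sketch: measure elapsed wall time instead of CPU time. steady_clock reports
// real elapsed time, so with N threads doing the same work in parallel the
// result stays roughly flat, unlike clock(), which sums CPU time across threads.
int main() {
    auto startTime = std::chrono::steady_clock::now();

    // stand-in for the pthread_create()/pthread_join() section being timed
    std::this_thread::sleep_for(std::chrono::milliseconds(300));

    auto endTime = std::chrono::steady_clock::now();
    double diffMs =
        std::chrono::duration<double, std::milli>(endTime - startTime).count();
    std::cout << "Done in " << diffMs << "ms\n";
    return 0;
}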

Why is physical memory in Linux allocated incrementally rather than all at once?

I wrote the program below, which allocates about 1.2G of memory at once, and tested it on Linux. I found that:
If I define the macro WRITE_MEM, the physical memory usage (inspected with the top command) increases linearly.
If I don't define the macro, the physical memory usage stays very small (a few hundred kilobytes) and barely changes.
I don't understand this behavior.
#include <iostream>
#include <cmath>
#include <cstdlib>
using namespace std;

float sum = 0.;

int main (int argc, char** argv)
{
    float* pf = (float*) malloc(1024*1024*300*4);
    float* p = pf;

    for (int i = 0; i < 300; i++) {
        cout << i << "..." << endl;
        float* qf = (float *) malloc(1024*1024*4);
        float* q = qf;
        for (int j = 0; j < 1024*1024; j++) {
            *q++ = sin(j*j*j*j) ;
        }
        q = qf;

        for (int j = 0; j < 1024*1024; j++) {
#ifdef WRITE_MEM // The physical memory usage will increase linearly
            *p++ = *q++;
            sum += *q;
#else // The physical memory usage is small and will not change
            p++;
            // or
            // sum += *p++;
#endif
        }
        free(qf);
    }
    free(pf);
    return 0;
}
Linux allocates virtual memory immediately, but doesn't back it with physical memory until the pages are actually used. This causes processes to only use the physical memory they actually require, leaving the unused memory available for the rest of the system.
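If you actually want all the physical pages up front, one option (a sketch of my own, not something the answer requires) is to allocate the buffer with an anonymous mmap() and the MAP_POPULATE flag, which asks the kernel to pre-fault the pages so top shows the full resident size immediately:

#include <sys/mman.h>
#include <cstdio>
#include <cstdlib>

// Sketch: pre-fault ~1.2G so physical pages are allocated at mmap() time
// rather than on first touch. RSS in top should jump immediately.
int main()
{
    size_t bytes = 1024UL * 1024 * 300 * sizeof(float);   // ~1.2G
    void* p = mmap(NULL, bytes, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap");
        return EXIT_FAILURE;
    }
    float* pf = static_cast<float*>(p);   // use pf like the malloc'd buffer
    pf[0] = 0.f;                          // pages are already resident here

    munmap(p, bytes);
    return EXIT_SUCCESS;
}

Calling mlock() on the region (or simply memset()'ing it) after allocation would similarly force the pages to become resident right away.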

mmap slower than ioremap

I am developing for an ARM device running Linux 2.6.37. I am trying to toggle an IO pin as fast as possible. I made a little kernel module and a user space application. I tried two things :
Manipulate the GPIO control registers directly from the kernel space using ioremap.
mmap() the GPIO control registers without caching and using them from user space.
Both methods work, but the second is about 3 times slower than the first (observed on an oscilloscope). I think I disabled all caching mechanisms.
Of course I'd like to get the best of both worlds: the flexibility and ease of development of user space with the speed of kernel space.
Does anybody know why mmap() could be slower than ioremap()?
Here's my code :
Kernel module code
static int ti81xx_usmap_mmap(struct file* pFile, struct vm_area_struct* pVma)
{
    pVma->vm_flags |= VM_RESERVED;
    pVma->vm_page_prot = pgprot_noncached(pVma->vm_page_prot);

    if (io_remap_pfn_range(pVma, pVma->vm_start, pVma->vm_pgoff,
                           pVma->vm_end - pVma->vm_start, pVma->vm_page_prot))
        return -EAGAIN;

    pVma->vm_ops = &ti81xx_usmap_vm_ops;
    return 0;
}

static void ti81xx_usmap_test_gpio(void)
{
    u32* pGpIoRegisters = ioremap_nocache(TI81XX_GPIO0_BASE, 0x400);
    const u32 pin = 1 << 24;
    int i;

    /* I should use IO read/write functions instead of pointer dereferencing,
     * but portability isn't the issue here */
    pGpIoRegisters[OMAP4_GPIO_OE >> 2] &= ~pin;   /* Set pin as output */

    for (i = 0; i < 200000000; ++i)
    {
        pGpIoRegisters[OMAP4_GPIO_SETDATAOUT >> 2] = pin;
        pGpIoRegisters[OMAP4_GPIO_CLEARDATAOUT >> 2] = pin;
    }

    pGpIoRegisters[OMAP4_GPIO_OE >> 2] |= pin;    /* Set pin as input */
    iounmap(pGpIoRegisters);
}
User space application code
int main(int argc, char** argv)
{
    int file, i;
    ulong* pGpIoRegisters = NULL;
    ulong pin = 1 << 24;

    file = open("/dev/ti81xx-usmap", O_RDWR | O_SYNC);
    if (file < 0)
    {
        printf("open failed (%d)\n", errno);
        return 1;
    }

    printf("Toggle from kernel space...");
    fflush(stdout);
    ioctl(file, TI81XX_USMAP_IOCTL_TEST_GPIO);
    printf(" done\n");

    pGpIoRegisters = mmap(NULL, 0x400, PROT_READ | PROT_WRITE, MAP_SHARED, file, TI81XX_GPIO0_BASE);

    printf("Toggle from user space...");
    fflush(stdout);

    pGpIoRegisters[OMAP4_GPIO_OE >> 2] &= ~pin;
    for (i = 0; i < 30000000; ++i)
    {
        pGpIoRegisters[OMAP4_GPIO_SETDATAOUT >> 2] = pin;
        pGpIoRegisters[OMAP4_GPIO_CLEARDATAOUT >> 2] = pin;
    }
    pGpIoRegisters[OMAP4_GPIO_OE >> 2] |= pin;

    printf(" done\n");
    fflush(stdout);

    munmap(pGpIoRegisters, 0x400);
    close(file);
    return 0;
}
This is because ioremap_nocache() still enables the CPU write buffer in your VM mapping whereas pgprot_noncached() disables both bufferability and cacheability.
An apples-to-apples comparison would be to use ioremap_strongly_ordered() instead.
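Going the other way is also an option (this is my own experiment idea, not what this answer suggests, and it assumes pgprot_writecombine() behaves on this ARM kernel the way it usually does): make the user-space mapping non-cached but bufferable, so both mappings keep the CPU write buffer.

/* Sketch (hypothetical variant of the handler above): map the registers to
 * user space as non-cached but bufferable, so the user-space mapping keeps
 * the CPU write buffer, similar to what ioremap_nocache() gives the
 * kernel-space test. */
static int ti81xx_usmap_mmap_wc(struct file* pFile, struct vm_area_struct* pVma)
{
    pVma->vm_flags |= VM_RESERVED;
    pVma->vm_page_prot = pgprot_writecombine(pVma->vm_page_prot);

    if (io_remap_pfn_range(pVma, pVma->vm_start, pVma->vm_pgoff,
                           pVma->vm_end - pVma->vm_start, pVma->vm_page_prot))
        return -EAGAIN;

    pVma->vm_ops = &ti81xx_usmap_vm_ops;
    return 0;
}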
My guess would be that since mmap has to check to make sure you're writing to memory you're allowed to write to, it's going to be slower than the kernel version (which I believe doesn't do that kind of checking; with a kernel module you're responsible for testing until you're very sure you're not breaking things).
Try using do_mmap (I believe that's the one) to use mmap from kernel space, and see how that compares. If it's comparably faster, then I'm right. If it's not, it's something else.
