why the fd still available after shm_unlink() - linux

I'm reading the source code in https://wayland-book.com/surfaces/shared-memory.html .
The author creates a shared memory object using shm_open() and shm_unlink()s it immediately, then ftruncate()s the fd to a specific size, mmap()s the fd and fills the region with pixels.
I'm confused about why the fd is still available after shm_unlink().
according to the man page:
The operation of shm_unlink() is analogous to unlink(2): it removes a shared memory object name, and, once all processes have unmapped the object, de-allocates and destroys the contents of the associated memory region. After a successful shm_unlink(), attempts to shm_open() an object with the same name will fail (unless O_CREAT was specified, in which case a new, distinct object is created).
So shm_unlink() should cause the memory to be destroyed, because no process has mmap()ed
the region yet. But how is the fd still available?
here is the code:
/*
 * Create a POSIX shared-memory object under a throw-away name and return a
 * descriptor for it, or -1 on failure.
 *
 * Up to 100 names are tried; only a name collision (EEXIST) triggers another
 * attempt. The name is removed with shm_unlink() as soon as the object has
 * been created, and the still-open descriptor is handed to the caller.
 */
static int
create_shm_file(void)
{
	int attempts_left = 100;

	while (attempts_left > 0) {
		char name[] = "/wl_shm-XXXXXX";

		/* randname() receives a pointer to the trailing "XXXXXX"
		 * placeholder; presumably it fills it in with random characters. */
		randname(name + sizeof(name) - 7);
		attempts_left--;

		int fd = shm_open(name, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR);
		if (fd >= 0) {
			shm_unlink(name); // unlink immediately
			return fd;
		}

		/* Anything other than "name already taken" is final. */
		if (errno != EEXIST)
			break;
	}
	return -1;
}
/*
 * Obtain a shared-memory descriptor from create_shm_file() and resize the
 * underlying object to `size` bytes.
 *
 * Returns the descriptor on success. Returns -1 if no descriptor could be
 * created or the resize failed; in the latter case the descriptor is closed
 * before returning.
 */
static int
allocate_shm_file(size_t size)
{
	const int fd = create_shm_file();
	if (fd < 0)
		return -1;

	/* Repeat the resize for as long as it is interrupted by a signal. */
	int rc;
	while ((rc = ftruncate(fd, size)) < 0 && errno == EINTR) //why the fd still available?
		continue;

	if (rc >= 0)
		return fd;

	close(fd);
	return -1;
}
// After the code above, the descriptor is mapped: `size` bytes from offset 0,
// readable and writable, as a shared mapping.
uint32_t *data = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

Related

mmap failed to allocate virtual memory

I got the following output in ftrace:
mmap(0x200000000000, 17179869184, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = -1 ENOMEM (Cannot allocate memory)
My code:
/*
 * Run the two allocation passes in order: 24575 chunks through
 * mmap_chunks_higher(), then 8190 chunks through mmap_chunks_lower().
 * Both are called with a second argument of 0.
 */
void alloc_page_full_reverse()
{
	fputs("Allocating default pagesize pages > 128TB \n", stdout);
	mmap_chunks_higher(24575, 0);

	fputs("Allocating default pagesize pages < 128TB \n", stdout);
	/* Note: one 16GB chunk fewer is requested here, because heap space
	   is required for other mappings */
	mmap_chunks_lower(8190, 0);
}
/*
 * hind_addr() is defined after this function. Without a prior declaration
 * the call below would be an implicit declaration, i.e. assumed to return
 * int, and the 64-bit hint address would be truncated.
 */
static char *hind_addr(void);

/*
 * Map `no_of_chunks` anonymous, private, read/write chunks of MAP_CHUNK_SIZE
 * bytes each, asking for each one at a fresh hint obtained from hind_addr().
 *
 * no_of_chunks: number of mappings to create.
 * hugetlb_arg:  extra flag bits OR-ed into the mmap() flags (0 for none).
 *
 * Returns 0 once every chunk has been mapped. The process exits with
 * status -1 as soon as a mapping fails or validate_addr() returns non-zero
 * for a mapped address.
 */
int mmap_chunks_higher(unsigned long no_of_chunks, unsigned long hugetlb_arg)
{
	unsigned long i;

	for (i = 0; i < no_of_chunks; i++) {
		char *hint = hind_addr();
		char *hptr = mmap(hint, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE,
				  MAP_PRIVATE | MAP_ANONYMOUS | (int)hugetlb_arg, -1, 0); // MAP_CHUNK_SIZE = 16GB

		if (hptr == MAP_FAILED) {
			/* Report the requested hint: hptr is only MAP_FAILED here. */
			printf("\n Map failed at address %p < 384TB in iteration = %lu \n", (void *)hint, i);
			exit(-1);
		}
		if (validate_addr(hptr, 1)) {
			printf("\n Address failed, not in > 128Tb iterator = %lu\n", i);
			exit(-1);
		}
	}
	printf("> 128Tb: \n chunks allocated= %lu \n", i);
	return 0;
}
/*
 * Produce a random address hint with exactly one bit set.
 * The bit position is drawn with rand() from the range 48..62 inclusive.
 */
static char *hind_addr(void)
{
	const int shift = 48 + rand() % 15;
	unsigned long hint = 1UL << shift;

	return (char *) hint;
}
I need to understand how to validate the arguments before calling mmap: void *mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset); and how all of its arguments are validated,
e.g. how size_t length is validated.
I still want to make sure I have enough memory before doing a mmap
There isn't an interface that allows a process to check this, and for good reason. Suppose such a syscall existed, and the kernel told a process it could allocate 1 GB of memory. It is still possible that the kernel would not be able to allocate that memory by the time the process actually requests the allocation. So, this information would not be useful.
Instead, you should attempt to allocate memory, and handle ENOMEM.

resizing file size with ftruncate() after mmap()

The code snippet works fine on my machine(Linux/x86-64)
/*
 * Demonstration: map two pages of a file that is shorter than one page,
 * then grow the file with ftruncate() while the mapping is in place.
 */
int main()
{
	const size_t page_sz = 4096; // assuming the page size is 4096
	char fill[page_sz];
	char *map;
	int fd;

	memset(fill, 'x', sizeof(fill));

	// return values are deliberately not checked; this is a demonstration only
	fd = open("abc", O_RDWR | O_CREAT, S_IWUSR | S_IRUSR);
	ftruncate(fd, 0);
	write(fd, fill, 4090);

	// the file is shorter than one page, yet two pages of address space are mapped
	map = mmap(NULL, page_sz * 2, PROT_WRITE, MAP_SHARED, fd, 0);
	// reading or writing map[4096] at this point would crash

	// grow the file after it has been mapped
	ftruncate(fd, page_sz * 2);
	// map[4096] ... map[4096*2 - 1] can now be read and written

	munmap(map, page_sz * 2);
	close(fd);
	exit(EXIT_SUCCESS);
}
But POSIX says:
If the size of the mapped file changes after the call to mmap() as a result of some other operation on the mapped file, the effect of references to portions of the mapped region that correspond to added or removed portions of the file is unspecified.
So I guess this is not a portable way. But is it guaranteed to work on Linux?

Why msync() doesn't change the st_mtime of file

I have a question when I use msync. Thank you very much for your help!
In brief, I mmap file A, modify it, and then call msync, but the st_mtime doesn't change. Even after I munmap file A and exit the process, the st_mtime is still unchanged.
The following is the code.
/*
 * Demonstration: map the file "txt" shared and writable, increment its first
 * int ten times, and print st_mtime after each msync(MS_ASYNC), after
 * munmap() and after fsync().
 *
 * Returns 0 on success, -1 if open, fstat, mmap, msync or munmap fails.
 */
int main() {
    const char *file_name = "txt";
    int ret = -1;
    int fd = open(file_name, O_RDWR, 0666);
    if (fd < 0) {
        printf("FATAL, Fail to open file[%s]\n", file_name);
        return -1;
    }
    struct stat st;
    /* st.st_size determines the mapping length, so this call must succeed. */
    if (0 != fstat(fd, &st)) {
        printf("FATAL, Fail to fstat, file[%s]\n", file_name);
        return -1;
    }
    void * buffer = mmap(NULL, st.st_size,
                         PROT_READ | PROT_WRITE,
                         MAP_SHARED, fd, 0);
    if (MAP_FAILED == buffer) {
        /* off_t and time_t have no printf conversion of their own;
           cast to long long and print with %lld. */
        printf("FATAL, Fail to mmap, file[%s], size[%lld]\n",
               file_name, (long long)st.st_size);
        return -1;
    }
    printf("m_time[%lld]\n", (long long)st.st_mtime);
    for (int i=0; i<10;i++) {
        int *ptr = (int *)buffer;
        printf("%d\n", *ptr);
        *ptr += 1;
        sleep(1);
        ret = msync(buffer, st.st_size, MS_ASYNC);
        if (0 != ret) {
            printf("FATAL, Fail to msync, file[%s], size[%lld]\n",
                   file_name, (long long)st.st_size);
            return -1;
        }
        fstat(fd, &st);
        printf("m_time[%lld]\n", (long long)st.st_mtime);
    }
    ret = munmap(buffer, st.st_size);
    if (0 != ret) {
        printf("FATAL, Fail to munmap, file[%s], size[%lld]\n",
               file_name, (long long)st.st_size);
        return -1;
    }
    fstat(fd, &st);
    printf("m_time[%lld]\n", (long long)st.st_mtime);
    fsync(fd);
    fstat(fd, &st);
    printf("m_time[%lld]\n", (long long)st.st_mtime);
    return 0;
}
The relevant excerpt of the mmap manpage is:
The st_ctime and st_mtime field for a file mapped with PROT_WRITE and MAP_SHARED will be updated after a write to the mapped region, and before a subsequent msync(2) with the MS_SYNC or MS_ASYNC flag, if one occurs.
That means that, in your program, st_mtime might be updated anytime between the line which reads
*ptr += 1;
and the line which reads
ret = msync(buffer, st.st_size, MS_ASYNC);
Your sleep(1) is in between those lines, which means that by the time the sleep occurs, the st_mtime might have already been modified. So when you fstat the file a second time, you might be getting the same value as when you statted it the first time, just because not enough time has elapsed.
Try putting your sleep(1) before the *ptr += 1. This should guarantee that at least one second elapses between the original fstat and the update to the st_mtime.
Looks like a fix is on the way.
http://thread.gmane.org/gmane.linux.kernel/1549524/focus=55700

shared memory between two process using mutex but missing data

Following an example, I have created two processes, a master and a slave, to test shared-memory IPC. The master creates the shared memory and starts writing, and after some time the slave connects. This works, but once the slave connects it does not receive all the data that the master writes to shared memory.
master code looks like this:
/*
 * Record that the master places in the "/A_CUSTOM_DATA" shared-memory
 * object. The object is sized and mapped as exactly one of these.
 */
typedef struct custom_data_s {
/* values the master rewrites on every iteration while holding ipc_mutex */
int min;
int max;
/* synchronisation objects; the master initialises both as PTHREAD_PROCESS_SHARED */
pthread_mutex_t ipc_mutex;
pthread_cond_t ipc_condvar;
} custom_data_t;
/*
 * Master: create the "/A_CUSTOM_DATA" shared-memory object, initialise the
 * process-shared mutex and condition variable inside it, publish 100000
 * (min, max) updates, then tear everything down.
 *
 * Returns 0 on success; exits with status 1 on any reported failure.
 */
int main(void) {
    int fd = -1;
    int i;
    custom_data_t *this_custom_data;
    pthread_mutexattr_t mutex_attr;
    pthread_condattr_t cond_attr;

    fd = shm_open("/A_CUSTOM_DATA", O_RDWR | O_CREAT | O_EXCL , (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH));
    if(fd == -1) {
        printf("ERROR fd %d %s\n",fd,strerror(errno));
        /* Nothing below can work without a descriptor. */
        exit(1);
    }
    if (ftruncate (fd,sizeof(custom_data_t)) == -1) {
        printf("ERROR trucate fd %d %s\n",fd,strerror(errno));
        /* The object was created by this process (O_EXCL); remove its name
           so that the next run does not fail with EEXIST. */
        close(fd);
        shm_unlink("/A_CUSTOM_DATA");
        exit(1);
    }
    this_custom_data = (custom_data_t *) mmap(NULL, sizeof(custom_data_t), PROT_READ | PROT_WRITE , MAP_SHARED ,fd ,0);
    if(this_custom_data == MAP_FAILED) {
        printf("ERROR mapping fd %d %s\n",fd,strerror(errno));
        close(fd);
        shm_unlink("/A_CUSTOM_DATA");
        exit(1);
    }
    /* The mapping stays valid after the descriptor is closed. */
    close(fd);

    pthread_mutexattr_init(&mutex_attr);
    pthread_mutexattr_setpshared(&mutex_attr, PTHREAD_PROCESS_SHARED);
    pthread_mutex_init(&this_custom_data->ipc_mutex, &mutex_attr);
    pthread_condattr_init(&cond_attr);
    pthread_condattr_setpshared(&cond_attr, PTHREAD_PROCESS_SHARED);
    pthread_cond_init(&this_custom_data->ipc_condvar, &cond_attr);

    /* Publish each update under the mutex and signal the condition variable.
       A dedicated counter is used instead of recycling the closed fd. */
    for (i = 0; i != 100000; i++) {
        pthread_mutex_lock(&this_custom_data->ipc_mutex);
        this_custom_data->min = i;
        this_custom_data->max = i+5;
        pthread_cond_signal(&this_custom_data->ipc_condvar);
        pthread_mutex_unlock(&this_custom_data->ipc_mutex);
    }

    /* Clean up and exit should check exit codes of all*/
    pthread_mutexattr_destroy(&mutex_attr);
    pthread_condattr_destroy(&cond_attr);
    pthread_cond_destroy(&this_custom_data->ipc_condvar);
    pthread_mutex_destroy(&this_custom_data->ipc_mutex);
    if(0 != munmap(this_custom_data, sizeof(custom_data_t))) {
        printf("ERROR unmapping %s\n",strerror(errno));
        shm_unlink("/A_CUSTOM_DATA");
        exit(1);
    }
    if (0 != shm_unlink("/A_CUSTOM_DATA")){
        printf("ERROR unlinking %s\n",strerror(errno));
        exit(1);
    }
    return 0;
}
For example, the master starts writing min and max to shared memory, counting from 1 to 10000, and after some time the slave connects. Once the slave connects it should read all the data written by the master, but with this code the slave still does not read all of it. What am I doing wrong? Should there be another condition variable that the slave sets? I am trying to learn shared memory, and I think I am doing something wrong or do not understand how mutexes and shared memory work together. In the slave I wait for the condition variable to be signalled; here is the code for the slave.
/*
 * Slave-side view of the record stored in the "/A_CUSTOM_DATA"
 * shared-memory object; the slave maps the object as exactly one of these.
 */
typedef struct custom_data_s {
/* values the slave reads while holding ipc_mutex */
int min;
int max;
/* synchronisation objects living inside the shared mapping */
pthread_mutex_t ipc_mutex;
pthread_cond_t ipc_condvar;
} custom_data_t;
int main(void) {
int fd = -1;
custom_data_t *this_custom_data_ptr;
custom_data_t this_data;
int prv_packet = 0;
fd = shm_open("/A_CUSTOM_DATA", O_RDWR , (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH));
if(fd == -1) {
printf("ERROR fd %d %s\n",fd,strerror(errno));
}
if (ftruncate (fd,sizeof(custom_data_t)) == -1) {
printf("ERROR trucate fd %d %s\n",fd,strerror(errno));
exit(1);
}
this_custom_data_ptr = (custom_data_t *) mmap(NULL, sizeof(custom_data_t), PROT_READ | PROT_WRITE , MAP_SHARED ,fd ,0);
if(this_custom_data_ptr ==(custom_data_t *) -1) {
printf("ERROR mapping fd %d %s\n",fd,strerror(errno));
exit(1);
}
close(fd);
while (1) {
pthread_mutex_lock(&this_custom_data_ptr->ipc_mutex);
pthread_cond_wait(&this_custom_data_ptr->ipc_condvar, &this_custom_data_ptr->ipc_mutex);
memcpy(&this_data, this_custom_data_ptr, sizeof(this_custom_data_ptr));
if (prv_packet == 0){
printf ("got first ");
prv_packet = this_data.min;
}
if ((prv_packet +1) != this_data.min){
printf ("error prv:%d this:%d\n", prv_packet, this_data.min);
}
pthread_mutex_unlock(&this_custom_data_ptr->ipc_mutex);
prv_packet = this_data.min;
}
return 0;
}
What am I doing wrong? How do I synchronize so that once the slave is connected it will not lose any data, but if it is not connected the master will not be blocked either?

Berkeley DB: stuck at futex_wait because of previous abnormal quit during c api call

I'm programming in C, using berkeley db 4.3 (/usr/lib64/libdb-4.3.so) on RHEL5.6 with kernel 2.6.18-238_xen_AMD64.
In my tests (writing 1,000,000 key/value pairs), if one process quit abnormally (ctrl + c, kill, or a failed assert) while an operation on the db was in progress, later operations on that db would block upon opening. Strace shows that the process is stuck at a futex(ptr_to_something, FUTEX_WAIT, 2, NULL) call after opening the __db.00x (e.g. __db.001, __db.002, __db.003) files.
The only way I know to clear the lock is to remove __db.00x files, and following tests showed that the database is not damaged. It meets my requirement, but I'm just wondering whether there's a better(or more elegant) way to solve this problem.
Here I listed some strace stderr and the code to operate the database which may help.
some of the strace stderr
...
open("__db.001", O_RDWR) = 3
fcntl(3, F_SETFD, FD_CLOEXEC) = 0
fstat(3, {st_mode=S_IFREG|0640, st_size=24576, ...}) = 0
close(3) = 0
open("__db.001", O_RDWR) = 3
fcntl(3, F_SETFD, FD_CLOEXEC) = 0
mmap(NULL, 24576, PROT_READ|PROT_WRITE, MAP_SHARED, 3, 0) = 0x2afcc4149000
close(3) = 0
futex(0x2afcc4149000, FUTEX_WAIT, 2, NULL **[[stuck here]]**
code to operate the database
typedef DB* db_handle;

/*
 * Create a Berkeley DB environment in the default home (NULL) and open the
 * B-tree database `filename` inside it.
 *
 * filename:      database file to open, created if it does not exist.
 * cache_size_mb: cache size in megabytes; split into the gigabyte and byte
 *                parts that set_cachesize() takes.
 *
 * Returns the open database handle. Any failure is reported on stderr and
 * terminates the process with EXIT_FAILURE.
 */
db_handle bdb_open(const char *filename, u_int32_t cache_size_mb)
{
    int ret;
    DB_ENV *env;
    db_handle dbp;
    /* Subsystem selection (DB_INIT_*) belongs to DB_ENV->open only. */
    const u_int32_t env_flags = DB_CREATE | DB_THREAD | DB_INIT_LOCK | DB_INIT_MPOOL;
    /* DB->open has its own flag set; the DB_INIT_* bits are not part of it,
       so the environment flags must not be reused for the database. */
    const u_int32_t db_flags = DB_CREATE | DB_THREAD;
    u_int32_t gb = cache_size_mb / 1024, mb = cache_size_mb % 1024;

    if ((ret = db_env_create(&env, 0)) != 0) {
        fprintf(stderr, "db_env_create:%d, %s\n", ret, db_strerror(ret));
        exit(EXIT_FAILURE);
    }
    if ((ret = env->set_timeout(env, 3 * 1000000, DB_SET_LOCK_TIMEOUT)) != 0) {
        fprintf(stderr, "env->set_timeout:%d, %s\n", ret, db_strerror(ret));
        exit(EXIT_FAILURE);
    }
    if ((ret = env->set_lk_detect(env, DB_LOCK_DEFAULT)) != 0) { /* this seems to be of no use in my case */
        fprintf(stderr, "env->set_lk_detect:%d, %s\n", ret, db_strerror(ret));
        exit(EXIT_FAILURE);
    }
    if ((ret = env->set_cachesize(env, gb, mb * 1024 * 1024, 0)) != 0) {
        fprintf(stderr, "env->set_cachesize:%d, %s\n", ret, db_strerror(ret));
        exit(EXIT_FAILURE);
    }
    if ((ret = env->open(env, NULL, env_flags, 0)) != 0) {
        fprintf(stderr, "db_env_open:%d, %s\n", ret, db_strerror(ret));
        exit(EXIT_FAILURE);
    }
    if ((ret = db_create(&dbp, env, 0)) != 0) {
        fprintf(stderr, "db_create:%d, %s\n", ret, db_strerror(ret));
        exit(EXIT_FAILURE);
    }
    if ((ret = dbp->open(dbp, NULL, filename, NULL, DB_BTREE, db_flags, 0664)) != 0) {
        fprintf(stderr, "dbp->open:%d, %s\n", ret, db_strerror(ret));
        exit(EXIT_FAILURE);
    }
    return dbp;
}
/*
 * Store one key/value pair in `db` outside of any transaction.
 *
 * key/keylen and val/vallen describe the caller's buffers; they are handed
 * to DB->put through zero-initialised DBT descriptors.
 *
 * Returns the result of DB->put unchanged.
 */
int bdb_put(db_handle db, void* key, u_int32_t keylen, void* val, u_int32_t vallen)
{
    DBT k;
    DBT v;

    bzero(&k, sizeof(k));
    k.data = key;
    k.size = keylen;

    bzero(&v, sizeof(v));
    v.data = val;
    v.size = vallen;

    return db->put(db, NULL, &k, &v, 0);
}
/*
 * Look up `key` in `db` outside of any transaction and have the value
 * written into the caller-supplied buffer `buf` of `buflen` bytes
 * (DB_DBT_USERMEM).
 *
 * nwrite: if non-NULL and the lookup succeeded, receives the value size
 *         reported by DB->get.
 *
 * Returns the result of DB->get unchanged.
 */
int bdb_get(db_handle db, void* key, const u_int32_t keylen,
void* buf, u_int32_t buflen, u_int32_t* nwrite)
{
    DBT k;
    DBT v;
    int rc;

    bzero(&k, sizeof(k));
    k.data = key;
    k.size = keylen;

    bzero(&v, sizeof(v));
    v.data = buf;
    v.ulen = buflen;
    v.flags = DB_DBT_USERMEM;

    rc = db->get(db, NULL, &k, &v, 0);
    if (rc != 0 || nwrite == NULL)
        return rc;

    *nwrite = v.size;
    return rc;
}
The __db* files contain the names of locks, but not the locks themselves.
The pthread locks are implemented on top of kernel futexes. The process
that was killed likely had an active lock when killed.
Try running "db_recover -h $DBHOME" to clear stale locks.
There is also a callback that can be added to automate "stale lock"
remover.
(aside)
Your code almost certainly needs to handle the DB_RUNRECOVERY and DB_VERSION_MISMATCH
error codes from env->open for a robust implementation. Reopening with DB_RECOVER
set will handle them.
On older linux, there is also the possibility of "stale futexes" (i.e. a futex
that was locked by a process that died) that can only be cleared by
rebooting (there is another way, using "robust mutexes", that sets
a flag that permits another process to unlock previously locked mutexes,
but that isn't implemented in Berkeley DB).

Resources