How does generating an interrupt (calling irqfd) calls an interrupt on the KVM VM? - linux

The KVM irqfd ioctl starts the irqfd for a file descriptor.
It does this:
case KVM_IRQFD: {
struct kvm_irqfd data;
r = -EFAULT;
if (copy_from_user(&data, argp, sizeof(data)))
goto out;
r = kvm_irqfd(kvm, &data);
break;
}
where kvm_irqfd is here
and calls kvm_irqfd_assign which initiates a wakeup queue:
init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
That is, irqfd_wakeup does this:
if (flags & EPOLLIN) {
u64 cnt;
eventfd_ctx_do_read(irqfd->eventfd, &cnt);
idx = srcu_read_lock(&kvm->irq_srcu);
do {
seq = read_seqcount_begin(&irqfd->irq_entry_sc);
irq = irqfd->irq_entry;
} while (read_seqcount_retry(&irqfd->irq_entry_sc, seq));
/* An event has been signaled, inject an interrupt */
if (kvm_arch_set_irq_inatomic(&irq, kvm,
KVM_USERSPACE_IRQ_SOURCE_ID, 1,
false) == -EWOULDBLOCK)
schedule_work(&irqfd->inject);
srcu_read_unlock(&kvm->irq_srcu, idx);
ret = 1;
}
As you can see in schedule_work(&irqfd->inject), it schedules the inject function, which is here:
static void
irqfd_inject(struct work_struct *work)
{
struct kvm_kernel_irqfd *irqfd =
container_of(work, struct kvm_kernel_irqfd, inject);
struct kvm *kvm = irqfd->kvm;
if (!irqfd->resampler) {
kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1,
false);
kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0,
false);
} else
kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
irqfd->gsi, 1, false);
}
It calls kvm_set_irq defined here which does this:
int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
bool line_status)
{
struct kvm_kernel_irq_routing_entry irq_set[KVM_NR_IRQCHIPS];
int ret = -1, i, idx;
trace_kvm_set_irq(irq, level, irq_source_id);
/* Not possible to detect if the guest uses the PIC or the
* IOAPIC. So set the bit in both. The guest will ignore
* writes to the unused one.
*/
idx = srcu_read_lock(&kvm->irq_srcu);
i = kvm_irq_map_gsi(kvm, irq_set, irq);
srcu_read_unlock(&kvm->irq_srcu, idx);
while (i--) {
int r;
r = irq_set[i].set(&irq_set[i], kvm, irq_source_id, level,
line_status);
if (r < 0)
continue;
ret = r + ((ret < 0) ? 0 : ret);
}
return ret;
}
It looks like it finally calls something at:
r = irq_set[i].set(&irq_set[i], kvm, irq_source_id, level,
line_status);
This set function is filled by this.
It sets to this function:
static int vgic_irqfd_set_irq(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id,
int level, bool line_status)
{
unsigned int spi_id = e->irqchip.pin + VGIC_NR_PRIVATE_IRQS;
if (!vgic_valid_spi(kvm, spi_id))
return -EINVAL;
return kvm_vgic_inject_irq(kvm, 0, spi_id, level, NULL);
}
which calls kvm_vgic_inject_irq which finally calls vgic_put_irq which calls this:
void __vgic_put_lpi_locked(struct kvm *kvm, struct vgic_irq *irq)
{
struct vgic_dist *dist = &kvm->arch.vgic;
if (!kref_put(&irq->refcount, vgic_irq_release))
return;
list_del(&irq->lpi_list);
dist->lpi_list_count--;
kfree(irq);
}
but I don't see how the GIC is called here, I only see the list being deleted.
I thought here it would send the interrupt to the GIC, which would then call the VM somehow.
I'm trying to understand how calling the irqfd file descriptor ends up calling an interrupt in the VM.

VGIC is for arm, you should check arm support file. While x86 is using either APIC or PIC. mostly APIC now.
You can check the specification of how those IRQ chip works to transfer the external signal to the destination core(vcpu).
For example, if you were using a x86 virtual machine(I have no idea of VGIC) which is using IOAPIC, there are 24 pins for example(emulated), and you should understand the APIC(hardware), then you know how it works.
https://elixir.bootlin.com/linux/v5.2.12/source/arch/x86/kvm/irq_comm.c#L271
https://elixir.bootlin.com/linux/v5.2.12/source/arch/x86/kvm/irq_comm.c#L38

Related

How to handle more than one SIGSEGV occurrence in linux?

I have written a program to scan kernel memory for a pattern from user space. I run it from root. I expect that it will generate SIGSEGVs when it hits pages that aren't accessible; I would like to ignore those faults and just jump to the next page to continue the search. I have set up a signal handler that works fine for the first occurrence, and it continues onward as expected. However, when a second SIGSEGV occurs, the handler is ignored (it was reregistered after the first occurrence) and the program terminates. The relevant portions of the code are:
jmp_buf restore_point;
void segv_handler(int sig, siginfo_t* info, void* ucontext)
{
longjmp(restore_point, SIGSEGV);
}
void setup_segv_handler()
{
struct sigaction sa;
sa.sa_flags = SA_SIGINFO|SA_RESTART|SA_RESETHAND;
sigemptyset (&sa.sa_mask);
sa.sa_sigaction = &segv_handler;
if (sigaction(SIGSEGV, &sa, NULL) == -1) {
fprintf(stderr, "failed to setup SIGSEGV handler\n");
}
}
unsigned long search_kernel_memory_area(unsigned long start_address, size_t area_len, const void* pattern, size_t pattern_len)
{
int fd;
char* kernel_mem;
fd = open("/dev/kmem", O_RDONLY);
if (fd < 0)
{
perror("open /dev/kmem failed");
return -1;
}
unsigned long page_size = sysconf(_SC_PAGESIZE);
unsigned long page_aligned_offset = (start_address/page_size)*page_size;
unsigned long area_pages = area_len/page_size + (area_len%page_size ? 1 : 0);
kernel_mem =
mmap(0, area_pages,
PROT_READ, MAP_SHARED,
fd, page_aligned_offset);
if (kernel_mem == MAP_FAILED)
{
perror("mmap failed");
return -1;
}
if (!mlock((const void*)kernel_mem,area_len))
{
perror("mlock failed");
return -1;
}
unsigned long offset_into_page = start_address-page_aligned_offset;
unsigned long start_area_address = (unsigned long)kernel_mem + offset_into_page;
unsigned long end_area_address = start_area_address+area_len-pattern_len+1;
unsigned long addr;
setup_segv_handler();
for (addr = start_area_address; addr < end_area_address;addr++)
{
unsigned char* kmp = (unsigned char*)addr;
unsigned char* pmp = (unsigned char*)pattern;
size_t index = 0;
for (index = 0; index < pattern_len; index++)
{
if (setjmp(restore_point) == 0)
{
unsigned char p = *pmp;
unsigned char k = *kmp;
if (k != p)
{
break;
}
pmp++;
kmp++;
}
else
{
addr += page_size -1;
setup_segv_handler();
break;
}
}
if (index >= pattern_len)
{
return addr;
}
}
munmap(kernel_mem,area_pages);
close(fd);
return 0;
}
I realize I can use functions like memcmp to avoid programming the matching part directly (I did this initially), but I subsequently wanted to insure the finest grained control for recovering from the faults so I could see exactly what was happening.
I scoured the Internet to find information about this behavior, and came up empty. The linux system I am running this under is arm 3.12.30.
If what I am trying to do is not possible under linux, is there some way I can get the current state of the kernel pages from user space (which would allow me to avoid trying to search pages that are inaccessible.) I searched for calls that might provide such information, but also came up empty.
Thanks for your help!
While longjmp is perfectly allowed to be used in the signal handler (the function is known as async-signal-safe, see man signal-safety) and effectively exits from the signal handling, it doesn't restore signal mask. The mask is automatically modified at the time when signal handler is called to block new SIGSEGV signal to interrupt the handler.
While one may restore signal mask manually, it is better (and simpler) to use siglongjmp function instead: aside from the effect of longjmp, it also restores the signal mask. Of course, in that case sigsetjmp function should be used instead of setjmp:
// ... in main() function
if(sigsetjmp(restore_point, 1)) // Aside from other things, store signal mask
// ...
// ... in the signal handler
siglongjmp(restore_point); // Also restore signal mask as it was at sigsetjmp() call

Linux - Control Flow in a linux kernel module

I am learning to write kernel modules and in one of the examples I had to make sure that a thread executed 10 times and exits, so I wrote this according to what I have studied:
#include <linux/module.h>
#include <linux/kthread.h>
struct task_struct *ts;
int flag = 0;
int id = 10;
int function(void *data) {
int n = *(int*)data;
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(n*HZ); // after doing this it executed infinitely and i had to reboot
while(!kthread_should_stop()) {
printk(KERN_EMERG "Ding");
}
flag = 1;
return 0;
}
int init_module (void) {
ts = kthread_run(function, (void *)&id, "spawn");
return 0;
}
void cleanup_module(void) {
if (flag==1) { return; }
else { if (ts != NULL) kthread_stop(ts);
}
return;
}
MODULE_LICENSE("GPL");
What I want to know is :
a) How to make thread execute 10 times like a loop
b) How does the control flows in these kind of processes that is if we make it to execute 10 times then does it go back and forth between function and cleanup_module or init_module or what exactly happens?
If you control kthread with kthread_stop, the kthread shouldn't exit until be ing stopped (see also that answer). So, after executing all operations, kthread should wait until stopped.
Kernel already implements kthread_worker mechanism, when kthread just executes works, added to it.
DEFINE_KTHREAD_WORKER(worker);
struct my_work
{
struct kthread_work *work; // 'Base' class
int n;
};
void do_work(struct kthread_work *work)
{
struct my_work* w = container_of(work, struct my_work, work);
printk(KERN_EMERG "Ding %d", w->n);
// And free work struct at the end
kfree(w);
}
int init_module (void) {
int i;
for(i = 0; i < 10; i++)
{
struct my_work* w = kmalloc(sizeof(struct my_work), GFP_KERNEL);
init_kthread_work(&w->work, &do_work);
w->n = i + 1;
queue_kthread_work(&worker, &w->work);
}
ts = kthread_run(&kthread_worker_fn, &worker, "spawn");
return 0;
}
void cleanup_module(void) {
kthread_stop(ts);
}

How to implement poll in linux driver that notices FD closed

I'm implementing a misc device driver for linux.
This driver implements file_operations::poll and
I want to make it so that poll(2) would return POLLHUP if the descriptor is closed.
Supposed driver client code(userland code) follows.
void ThreadA(int fd){
// initialization codes...
pfd[0].fd = fd;
pfd[0].event = POLLIN;
int r = poll(pfd, 1, -1);
if(r > 0 && pfd[0].revent & POLLHUP){
// Detect fd is closed
return; // Exit thread
}
}
void ThreadB(int fd){
// waiting some events. ex.signals
// I expect close(fd) will cause poll(2) return and ThreadA will exit.
close(fd);
return;
}
But I could not implement this behavior to in my driver code.
Calling poll(2) never returns even if descriptor is closed. so threadA never exits.
Trivial test driver code follows.
static wait_queue_head_t q;
static int CLOSED = 0;
int my_open(struct inode *a, struct file *b){
printk(KERN_DEBUG "my_open");
return 0;
}
int my_release(struct inode *a, struct file *b){
printk(KERN_DEBUG "my_release");
CLOSED = 1;
wake_up_interruptible(&q);
// I expect this call will wake up q and recall my_poll.
// but doesn't
return 0;
}
unsigned int my_poll(struct file *a, struct poll_table_struct *b){
printk(KERN_DEBUG "my_poll");
poll_wait(file, &q, a);
if(CLOSED != 0)
return POLLHUP;
return 0;
}
static const struct file_operations my_fops = {
.owner = THIS_MODULE,
.open = &my_open,
.release = &my_release,
.poll = &my_poll
};
static struct miscdevice mydevice =
{
.minor = MISC_DYNAMIC_MINOR,
.name = "TESTDEV",
.fops = &my_fops
};
static int __init myinit(void){
init_waitqueue_head(&q);
misc_register(&mydevice);
return 0;
}
static void __exit myexit(void){
misc_deregister(&mydevice);
}
module_init(myinit);
module_exit(myexit);
MODULE_LICENSE("GPL");
I think calling wake_up_interruptible() doesn't effect in my_release().
so that my_poll() will never be recalled and poll(2) will never return.
How should I implement my_poll() in a correct manner?
My test environment:
kernel is linux-3.10.20
The manual page for close warns:
It is probably unwise to close file descriptors while they may be in use by
system calls in other threads in the same process. Since a file descriptor
may be reused, there are some obscure race conditions that may cause unintended
side effects.
I suspect that something in the higher levels of the kernel is cancelling your poll operation when you execute close, before the release() function actually gets called. I'd think about solving your problem a different way.

Call to ioctl() on a usb device using usbfs does not work

I am trying to create my own driver for my Gamepad right now, I found out the original reason why I wanted to create it does not exist but I still want to do it for the experience. So please don't tell me there is a better way to do this than writing my own driver.
The part in kernelspace with the ioctl function that should be called is:
static int xpad_ioctl (struct usb_interface *intf, unsigned int code,void *buf) {
//struct usb_xpad *xpad = usb_get_intfdata(intf);
printk(KERN_INFO"(Ongy)IOCTL called\n");
//if (_IOC_TYPE(code) != XPAD_IOMAGIC) return -ENOTTY;
//if (_IOC_NR(code) > XPAD_IOMAX) return -ENOTTY;
switch(code){
case XPAD_IORMAP:
printk(KERN_INFO"(Ongy)IORMAP called\n");
break;
default:
return -EINVAL;
}
return 0;
}
static struct usb_driver xpad_driver =
{
.name = "Cyborg-V5-driver",
.probe = xpad_probe,
.disconnect = xpad_disconnect,
.unlocked_ioctl = xpad_ioctl,
.id_table = xpad_table,
};
The part in userspace to call it is (this is part of a Qt-application):
int openfile() {
char *device = "/dev/bus/usb/005/009";
printf("Opening device %s\n", device);
return open(device, /*O_RDONLY*/O_WRONLY | O_NONBLOCK );
}
[...] the closefile(int file_desc) is missing here, this and the openfile functions exist because of me not knowing one can call "::open()" when Qt overrides function calls.
void MainContainer::callioctl() {
int file_desc, ret_val;
errno = 0;
file_desc = openfile();
if (file_desc==-1){
printf("Ioctl notcalled because of: error %s\n", strerror(errno));
}
else
{
errno = 0;
//struct usbdevfs_getdriver* driver = (usbdevfs_getdriver*)malloc(sizeof(struct usbdevfs_getdriver));
struct mappingpair* pair = (mappingpair*)malloc(sizeof(struct mappingpair));
ret_val = ioctl(file_desc, XPAD_IORMAP, pair);
//printf("Drivername %s\n", driver->driver);
closefile(file_desc);
if (ret_val==-1) printf("Ioctl failed with error %s\n", strerror(errno));
else printf("Ioctl call successfull\n");
}
}
ok, the string to the file I open I get with a call to lsusb and change it by hand in the code, this is only for debugging and until I get the ioctl calls working
When I call the callioctl() it prints:
Ioctl failed with error Unpassender IOCTL (I/O-Control) für das Gerät
The German part means "wrong ioctl (I/O-Control) for the device" and nothing appears in dmesg, that is why I think my ioctl function in the driver is not called.
If you look at http://www.hep.by/gnu/kernel/usb/usbfs.html it says that to send an ioctl to the usb_driver device you need to do:
struct usbdevfs_ioctl {
int ifno;
int ioctl_code;
void *data;
};
/* user mode call looks like this.
* 'request' becomes the driver->ioctl() 'code' parameter.
* the size of 'param' is encoded in 'request', and that data
* is copied to or from the driver->ioctl() 'buf' parameter.
*/
static int
usbdev_ioctl (int fd, int ifno, unsigned request, void *param)
{
struct usbdevfs_ioctl wrapper;
wrapper.ifno = ifno;
wrapper.ioctl_code = request;
wrapper.data = param;
return ioctl (fd, USBDEVFS_IOCTL, &wrapper);
}
The documentation is listing usb device under /proc/bus so admittedly this may have changed.

Kernel Panic after changes in sys_close

I'm doing a course on operating systems and we work in Linux Red Hat 8.0
AS part of an assignment I had to change sys close and sys open. Changes to sys close passed without an incident, but when I introduce the changes to sys close suddenly the OS encounters an error during booting, claiming it cannot mount root fs, and invokes panic. EIP is reportedly at sys close when this happens.
Here are the changes I made (look for the "HW1 additions" comment):
In fs/open.c:
asmlinkage long sys_open(const char * filename, int flags, int mode)
{
char * tmp;
int fd, error;
event_t* new_event;
#if BITS_PER_LONG != 32
flags |= O_LARGEFILE;
#endif
tmp = getname(filename);
fd = PTR_ERR(tmp);
if (!IS_ERR(tmp)) {
fd = get_unused_fd();
if (fd >= 0) {
struct file *f = filp_open(tmp, flags, mode);
error = PTR_ERR(f);
if (IS_ERR(f))
goto out_error;
fd_install(fd, f);
}
/* HW1 additions */
if (current->record_flag==1){
new_event=(event_t*)kmalloc(sizeof(event_t), GFP_KERNEL);
if (!new_event){
new_event->type=Open;
strcpy(new_event->filename, tmp);
file_queue_add(*new_event, current->queue);
}
}
/* End HW1 additions */
out:
putname(tmp);
}
return fd;
out_error:
put_unused_fd(fd);
fd = error;
goto out;
}
asmlinkage long sys_close(unsigned int fd)
{
struct file * filp;
struct files_struct *files = current->files;
event_t* new_event;
char* tmp = files->fd[fd]->f_dentry->d_name.name;
write_lock(&files->file_lock);
if (fd >= files->max_fds)
goto out_unlock;
filp = files->fd[fd];
if (!filp)
goto out_unlock;
files->fd[fd] = NULL;
FD_CLR(fd, files->close_on_exec);
__put_unused_fd(files, fd);
write_unlock(&files->file_lock);
/* HW1 additions */
if(current->record_flag == 1){
new_event=(event_t*)kmalloc(sizeof(event_t), GFP_KERNEL);
if (!new_event){
new_event->type=Close;
strcpy(new_event->filename, tmp);
file_queue_add(*new_event, current->queue);
}
}
/* End HW1 additions */
return filp_close(filp, files);
out_unlock:
write_unlock(&files->file_lock);
return -EBADF;
}
The task_struct defined in schedule.h was changed at the end to include:
unsigned int record_flag; /* when zero: do not record. when one: record. */
file_queue* queue;
And file queue as well as event t are defined in a separate file as follows:
typedef enum {Open, Close} EventType;
typedef struct event_t{
EventType type;
char filename[256];
}event_t;
typedef struct file_quque_t{
event_t queue[101];
int head, tail;
}file_queue;
file queue add works like this:
void file_queue_add(event_t event, file_queue* queue){
queue->queue[queue->head]=event;
queue->head = (queue->head+1) % 101;
if (queue->head==queue->tail){
queue->tail=(queue->tail+1) % 101;
}
}
if (!new_event) {
new_event->type = …
That's equivalent to if (new_event == NULL). I think you mean if (new_event != NULL), which the kernel folks typically write as if (new_event).
Can you please post the stackdump of the error. I don't see a place where queue_info structure is allocated memory. One more thing is you cannot be sure that process record_flag will be always zero if unassigned in kernel, because kernel is a long running program and memory contains garbage.
Its also possible to check the exact location in the function is occurring by looking at the stack trace.

Resources