From db03b428278501e4da4339d7528d8408f050d0ac Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Mon, 17 Aug 2020 14:45:55 +0530 Subject: [PATCH 01/32] um: Convert tasklets to use new tasklet_setup() API In preparation for unconditionally passing the struct tasklet_struct pointer to all tasklet callbacks, switch to using the new tasklet_setup() and from_tasklet() to pass the tasklet pointer explicitly. Signed-off-by: Romain Perier Signed-off-by: Allen Pais Acked-By: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/drivers/vector_kern.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c index 555203e3e7b4..aa6ba9d61e9b 100644 --- a/arch/um/drivers/vector_kern.c +++ b/arch/um/drivers/vector_kern.c @@ -1196,9 +1196,9 @@ static int vector_net_close(struct net_device *dev) /* TX tasklet */ -static void vector_tx_poll(unsigned long data) +static void vector_tx_poll(struct tasklet_struct *t) { - struct vector_private *vp = (struct vector_private *)data; + struct vector_private *vp = from_tasklet(vp, t, tx_poll); vp->estats.tx_kicks++; vector_send(vp->tx_queue); @@ -1629,7 +1629,7 @@ static void vector_eth_configure( }); dev->features = dev->hw_features = (NETIF_F_SG | NETIF_F_FRAGLIST); - tasklet_init(&vp->tx_poll, vector_tx_poll, (unsigned long)vp); + tasklet_setup(&vp->tx_poll, vector_tx_poll); INIT_WORK(&vp->reset_tx, vector_reset_tx); timer_setup(&vp->tl, vector_timer_expire, 0); From 72d3e093afae79611fa38f8f2cfab9a888fe66f2 Mon Sep 17 00:00:00 2001 From: Christopher Obbard Date: Tue, 27 Oct 2020 15:30:22 +0000 Subject: [PATCH 02/32] um: random: Register random as hwrng-core device The UML random driver creates a dummy device under the guest, /dev/hw_random. When this file is read from the guest, the driver reads from the host machine's /dev/random, in-turn reading from the host kernel's entropy pool. This entropy pool could have been filled by a hardware random number generator or just the host kernel's internal software entropy generator. Currently the driver does not fill the guests kernel entropy pool, this requires a userspace tool running inside the guest (like rng-tools) to read from the dummy device provided by this driver, which then would fill the guest's internal entropy pool. This all seems quite pointless when we are already reading from an entropy pool, so this patch aims to register the device as a hwrng device using the hwrng-core framework. This not only improves and cleans up the driver, but also fills the guest's entropy pool without having to resort to using extra userspace tools in the guest. This is typically a nuisance when booting a guest: the random pool takes a long time (~200s) to build up enough entropy since the dummy hwrng is not used to fill the guest's pool. This port was originally attempted by Alexander Neville "dark" (in CC, discussion in Link), but the conversation there stalled since the handling of -EAGAIN errors were no removed and longer handled by the driver. This patch attempts to use the existing method of error handling but utilises the new hwrng core. The issue can be noticed when booting a UML guest: [ 2.560000] random: fast init done [ 214.000000] random: crng init done With the patch applied, filling the pool becomes a lot quicker: [ 2.560000] random: fast init done [ 12.000000] random: crng init done Cc: Alexander Neville Link: https://lore.kernel.org/lkml/20190828204609.02a7ff70@TheDarkness/ Link: https://lore.kernel.org/lkml/20190829135001.6a5ff940@TheDarkness.local/ Cc: Sjoerd Simons Signed-off-by: Christopher Obbard Acked-by: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/drivers/random.c | 101 ++++++++------------------------- drivers/char/hw_random/Kconfig | 16 +++--- 2 files changed, 33 insertions(+), 84 deletions(-) diff --git a/arch/um/drivers/random.c b/arch/um/drivers/random.c index ce115fce52f0..e4b9b2ce9abf 100644 --- a/arch/um/drivers/random.c +++ b/arch/um/drivers/random.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -18,9 +19,8 @@ #include /* - * core module and version information + * core module information */ -#define RNG_VERSION "1.0.0" #define RNG_MODULE_NAME "hw_random" /* Changed at init time, in the non-modular case, and at module load @@ -28,88 +28,36 @@ * protects against a module being loaded twice at the same time. */ static int random_fd = -1; -static DECLARE_WAIT_QUEUE_HEAD(host_read_wait); +static struct hwrng hwrng = { 0, }; +static DECLARE_COMPLETION(have_data); -static int rng_dev_open (struct inode *inode, struct file *filp) +static int rng_dev_read(struct hwrng *rng, void *buf, size_t max, bool block) { - /* enforce read-only access to this chrdev */ - if ((filp->f_mode & FMODE_READ) == 0) - return -EINVAL; - if ((filp->f_mode & FMODE_WRITE) != 0) - return -EINVAL; + int ret; - return 0; -} - -static atomic_t host_sleep_count = ATOMIC_INIT(0); - -static ssize_t rng_dev_read (struct file *filp, char __user *buf, size_t size, - loff_t *offp) -{ - u32 data; - int n, ret = 0, have_data; - - while (size) { - n = os_read_file(random_fd, &data, sizeof(data)); - if (n > 0) { - have_data = n; - while (have_data && size) { - if (put_user((u8) data, buf++)) { - ret = ret ? : -EFAULT; - break; - } - size--; - ret++; - have_data--; - data >>= 8; - } - } - else if (n == -EAGAIN) { - DECLARE_WAITQUEUE(wait, current); - - if (filp->f_flags & O_NONBLOCK) - return ret ? : -EAGAIN; - - atomic_inc(&host_sleep_count); + for (;;) { + ret = os_read_file(random_fd, buf, max); + if (block && ret == -EAGAIN) { add_sigio_fd(random_fd); - add_wait_queue(&host_read_wait, &wait); - set_current_state(TASK_INTERRUPTIBLE); + ret = wait_for_completion_killable(&have_data); - schedule(); - remove_wait_queue(&host_read_wait, &wait); + ignore_sigio_fd(random_fd); + deactivate_fd(random_fd, RANDOM_IRQ); - if (atomic_dec_and_test(&host_sleep_count)) { - ignore_sigio_fd(random_fd); - deactivate_fd(random_fd, RANDOM_IRQ); - } + if (ret < 0) + break; + } else { + break; } - else - return n; - - if (signal_pending (current)) - return ret ? : -ERESTARTSYS; } - return ret; + + return ret != -EAGAIN ? ret : 0; } -static const struct file_operations rng_chrdev_ops = { - .owner = THIS_MODULE, - .open = rng_dev_open, - .read = rng_dev_read, - .llseek = noop_llseek, -}; - -/* rng_init shouldn't be called more than once at boot time */ -static struct miscdevice rng_miscdev = { - HWRNG_MINOR, - RNG_MODULE_NAME, - &rng_chrdev_ops, -}; - static irqreturn_t random_interrupt(int irq, void *data) { - wake_up(&host_read_wait); + complete(&have_data); return IRQ_HANDLED; } @@ -126,18 +74,19 @@ static int __init rng_init (void) goto out; random_fd = err; - err = um_request_irq(RANDOM_IRQ, random_fd, IRQ_READ, random_interrupt, 0, "random", NULL); if (err) goto err_out_cleanup_hw; sigio_broken(random_fd, 1); + hwrng.name = RNG_MODULE_NAME; + hwrng.read = rng_dev_read; + hwrng.quality = 1024; - err = misc_register (&rng_miscdev); + err = hwrng_register(&hwrng); if (err) { - printk (KERN_ERR RNG_MODULE_NAME ": misc device register " - "failed\n"); + pr_err(RNG_MODULE_NAME " registering failed (%d)\n", err); goto err_out_cleanup_hw; } out: @@ -161,8 +110,8 @@ static void cleanup(void) static void __exit rng_cleanup(void) { + hwrng_unregister(&hwrng); os_close_file(random_fd); - misc_deregister (&rng_miscdev); } module_init (rng_init); diff --git a/drivers/char/hw_random/Kconfig b/drivers/char/hw_random/Kconfig index e92c4d9469d8..5952210526aa 100644 --- a/drivers/char/hw_random/Kconfig +++ b/drivers/char/hw_random/Kconfig @@ -540,15 +540,15 @@ endif # HW_RANDOM config UML_RANDOM depends on UML - tristate "Hardware random number generator" + select HW_RANDOM + tristate "UML Random Number Generator support" help This option enables UML's "hardware" random number generator. It attaches itself to the host's /dev/random, supplying as much entropy as the host has, rather than the small amount the UML gets from its - own drivers. It registers itself as a standard hardware random number - generator, major 10, minor 183, and the canonical device name is - /dev/hwrng. - The way to make use of this is to install the rng-tools package - (check your distro, or download from - http://sourceforge.net/projects/gkernel/). rngd periodically reads - /dev/hwrng and injects the entropy into /dev/random. + own drivers. It registers itself as a rng-core driver thus providing + a device which is usually called /dev/hwrng. This hardware random + number generator does feed into the kernel's random number generator + entropy pool. + + If unsure, say Y. From 09041c92f0aacbb6f5a252351d6e0a9e0ee9bcc5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 29 Oct 2020 10:23:12 -0600 Subject: [PATCH 03/32] um: Add support for TIF_NOTIFY_SIGNAL Wire up TIF_NOTIFY_SIGNAL handling for um. Cc: linux-um@lists.infradead.org Signed-off-by: Jens Axboe Acked-By: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/include/asm/thread_info.h | 2 ++ arch/um/kernel/process.c | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/um/include/asm/thread_info.h b/arch/um/include/asm/thread_info.h index 4c19ce4c49f1..3b1cb8b3b186 100644 --- a/arch/um/include/asm/thread_info.h +++ b/arch/um/include/asm/thread_info.h @@ -57,6 +57,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_SYSCALL_TRACE 0 /* syscall trace active */ #define TIF_SIGPENDING 1 /* signal pending */ #define TIF_NEED_RESCHED 2 /* rescheduling necessary */ +#define TIF_NOTIFY_SIGNAL 3 /* signal notifications exist */ #define TIF_RESTART_BLOCK 4 #define TIF_MEMDIE 5 /* is terminating due to OOM killer */ #define TIF_SYSCALL_AUDIT 6 @@ -67,6 +68,7 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) +#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) #define _TIF_MEMDIE (1 << TIF_MEMDIE) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 9505a7e87396..0fcdc374a9a1 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -99,7 +99,8 @@ void interrupt_end(void) if (need_resched()) schedule(); - if (test_thread_flag(TIF_SIGPENDING)) + if (test_thread_flag(TIF_SIGPENDING) || + test_thread_flag(TIF_NOTIFY_SIGNAL)) do_signal(regs); if (test_thread_flag(TIF_NOTIFY_RESUME)) tracehook_notify_resume(regs); From 97be7ceaf7fea68104824b6aa874cff235333ac1 Mon Sep 17 00:00:00 2001 From: Anton Ivanov Date: Fri, 13 Nov 2020 10:26:17 +0000 Subject: [PATCH 04/32] um: Remove use of asprinf in umid.c asprintf is not compatible with the existing uml memory allocation mechanism. Its use on the "user" side of UML results in a corrupt slab state. Fixes: 0d4e5ac7e780 ("um: remove uses of variable length arrays") Cc: stable@vger.kernel.org Signed-off-by: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/os-Linux/umid.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/arch/um/os-Linux/umid.c b/arch/um/os-Linux/umid.c index 1d7558dac75f..a3dd61521d24 100644 --- a/arch/um/os-Linux/umid.c +++ b/arch/um/os-Linux/umid.c @@ -137,20 +137,13 @@ static inline int is_umdir_used(char *dir) { char pid[sizeof("nnnnnnnnn")], *end, *file; int dead, fd, p, n, err; - size_t filelen; + size_t filelen = strlen(dir) + sizeof("/pid") + 1; - err = asprintf(&file, "%s/pid", dir); - if (err < 0) - return 0; + file = malloc(filelen); + if (!file) + return -ENOMEM; - filelen = strlen(file); - - n = snprintf(file, filelen, "%s/pid", dir); - if (n >= filelen) { - printk(UM_KERN_ERR "is_umdir_used - pid filename too long\n"); - err = -E2BIG; - goto out; - } + snprintf(file, filelen, "%s/pid", dir); dead = 0; fd = open(file, O_RDONLY); From ff9632d2a66512436d616ef4c380a0e73f748db1 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 20 Nov 2020 21:08:51 +0100 Subject: [PATCH 05/32] um: Fix time-travel mode Since the time-travel rework, basic time-travel mode hasn't worked properly, but there's no longer a need for this WARN_ON() so just remove it and thereby fix things. Cc: stable@vger.kernel.org Fixes: 4b786e24ca80 ("um: time-travel: Rewrite as an event scheduler") Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/kernel/time.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index 3d109ff3309b..8dafc3f2add4 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -260,11 +260,6 @@ static void __time_travel_add_event(struct time_travel_event *e, struct time_travel_event *tmp; bool inserted = false; - if (WARN(time_travel_mode == TT_MODE_BASIC && - e != &time_travel_timer_event, - "only timer events can be handled in basic mode")) - return; - if (e->pending) return; From fc6b6a872dcd48c6f39c7975836d75113db67d37 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Sat, 21 Nov 2020 23:13:56 -0500 Subject: [PATCH 06/32] um: ubd: Submit all data segments atomically Internally, UBD treats each physical IO segment as a separate command to be submitted in the execution pipe. If the pipe returns a transient error after a few segments have already been written, UBD will tell the block layer to requeue the request, but there is no way to reclaim the segments already submitted. When a new attempt to dispatch the request is done, those segments already submitted will get duplicated, causing the WARN_ON below in the best case, and potentially data corruption. In my system, running a UML instance with 2GB of RAM and a 50M UBD disk, I can reproduce the WARN_ON by simply running mkfs.fvat against the disk on a freshly booted system. There are a few ways to around this, like reducing the pressure on the pipe by reducing the queue depth, which almost eliminates the occurrence of the problem, increasing the pipe buffer size on the host system, or by limiting the request to one physical segment, which causes the block layer to submit way more requests to resolve a single operation. Instead, this patch modifies the format of a UBD command, such that all segments are sent through a single element in the communication pipe, turning the command submission atomic from the point of view of the block layer. The new format has a variable size, depending on the number of elements, and looks like this: +------------+-----------+-----------+------------ | cmd_header | segment 0 | segment 1 | segment ... +------------+-----------+-----------+------------ With this format, we push a pointer to cmd_header in the submission pipe. This has the advantage of reducing the memory footprint of executing a single request, since it allow us to merge some fields in the header. It is possible to reduce even further each segment memory footprint, by merging bitmap_words and cow_offset, for instance, but this is not the focus of this patch and is left as future work. One issue with the patch is that for a big number of segments, we now perform one big memory allocation instead of multiple small ones, but I wasn't able to trigger any real issues or -ENOMEM because of this change, that wouldn't be reproduced otherwise. This was tested using fio with the verify-crc32 option, and by running an ext4 filesystem over this UBD device. The original WARN_ON was: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 0 at lib/refcount.c:28 refcount_warn_saturate+0x13f/0x141 refcount_t: underflow; use-after-free. Modules linked in: CPU: 0 PID: 0 Comm: swapper Not tainted 5.5.0-rc6-00002-g2a5bb2cf75c8 #346 Stack: 6084eed0 6063dc77 00000009 6084ef60 00000000 604b8d9f 6084eee0 6063dcbc 6084ef40 6006ab8d e013d780 1c00000000 Call Trace: [<600a0c1c>] ? printk+0x0/0x94 [<6004a888>] show_stack+0x13b/0x155 [<6063dc77>] ? dump_stack_print_info+0xdf/0xe8 [<604b8d9f>] ? refcount_warn_saturate+0x13f/0x141 [<6063dcbc>] dump_stack+0x2a/0x2c [<6006ab8d>] __warn+0x107/0x134 [<6008da6c>] ? wake_up_process+0x17/0x19 [<60487628>] ? blk_queue_max_discard_sectors+0x0/0xd [<6006b05f>] warn_slowpath_fmt+0xd1/0xdf [<6006af8e>] ? warn_slowpath_fmt+0x0/0xdf [<600acc14>] ? raw_read_seqcount_begin.constprop.0+0x0/0x15 [<600619ae>] ? os_nsecs+0x1d/0x2b [<604b8d9f>] refcount_warn_saturate+0x13f/0x141 [<6048bc8f>] refcount_sub_and_test.constprop.0+0x2f/0x37 [<6048c8de>] blk_mq_free_request+0xf1/0x10d [<6048ca06>] __blk_mq_end_request+0x10c/0x114 [<6005ac0f>] ubd_intr+0xb5/0x169 [<600a1a37>] __handle_irq_event_percpu+0x6b/0x17e [<600a1b70>] handle_irq_event_percpu+0x26/0x69 [<600a1bd9>] handle_irq_event+0x26/0x34 [<600a1bb3>] ? handle_irq_event+0x0/0x34 [<600a5186>] ? unmask_irq+0x0/0x37 [<600a57e6>] handle_edge_irq+0xbc/0xd6 [<600a131a>] generic_handle_irq+0x21/0x29 [<60048f6e>] do_IRQ+0x39/0x54 [...] ---[ end trace c6e7444e55386c0f ]--- Cc: Christopher Obbard Reported-by: Martyn Welch Signed-off-by: Gabriel Krisman Bertazi Tested-by: Christopher Obbard Acked-by: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/drivers/ubd_kern.c | 193 ++++++++++++++++++++++--------------- 1 file changed, 116 insertions(+), 77 deletions(-) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index eae8c83364f7..b12c1b0d3e1d 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -47,18 +47,25 @@ /* Max request size is determined by sector mask - 32K */ #define UBD_MAX_REQUEST (8 * sizeof(long)) +struct io_desc { + char *buffer; + unsigned long length; + unsigned long sector_mask; + unsigned long long cow_offset; + unsigned long bitmap_words[2]; +}; + struct io_thread_req { struct request *req; int fds[2]; unsigned long offsets[2]; unsigned long long offset; - unsigned long length; - char *buffer; int sectorsize; - unsigned long sector_mask; - unsigned long long cow_offset; - unsigned long bitmap_words[2]; int error; + + int desc_cnt; + /* io_desc has to be the last element of the struct */ + struct io_desc io_desc[]; }; @@ -525,12 +532,7 @@ static void ubd_handler(void) blk_queue_max_write_zeroes_sectors(io_req->req->q, 0); blk_queue_flag_clear(QUEUE_FLAG_DISCARD, io_req->req->q); } - if ((io_req->error) || (io_req->buffer == NULL)) - blk_mq_end_request(io_req->req, io_req->error); - else { - if (!blk_update_request(io_req->req, io_req->error, io_req->length)) - __blk_mq_end_request(io_req->req, io_req->error); - } + blk_mq_end_request(io_req->req, io_req->error); kfree(io_req); } } @@ -946,6 +948,7 @@ static int ubd_add(int n, char **error_out) blk_queue_write_cache(ubd_dev->queue, true, false); blk_queue_max_segments(ubd_dev->queue, MAX_SG); + blk_queue_segment_boundary(ubd_dev->queue, PAGE_SIZE - 1); err = ubd_disk_register(UBD_MAJOR, ubd_dev->size, n, &ubd_gendisk[n]); if(err){ *error_out = "Failed to register device"; @@ -1289,37 +1292,74 @@ static void cowify_bitmap(__u64 io_offset, int length, unsigned long *cow_mask, *cow_offset += bitmap_offset; } -static void cowify_req(struct io_thread_req *req, unsigned long *bitmap, +static void cowify_req(struct io_thread_req *req, struct io_desc *segment, + unsigned long offset, unsigned long *bitmap, __u64 bitmap_offset, __u64 bitmap_len) { - __u64 sector = req->offset >> SECTOR_SHIFT; + __u64 sector = offset >> SECTOR_SHIFT; int i; - if (req->length > (sizeof(req->sector_mask) * 8) << SECTOR_SHIFT) + if (segment->length > (sizeof(segment->sector_mask) * 8) << SECTOR_SHIFT) panic("Operation too long"); if (req_op(req->req) == REQ_OP_READ) { - for (i = 0; i < req->length >> SECTOR_SHIFT; i++) { + for (i = 0; i < segment->length >> SECTOR_SHIFT; i++) { if(ubd_test_bit(sector + i, (unsigned char *) bitmap)) ubd_set_bit(i, (unsigned char *) - &req->sector_mask); + &segment->sector_mask); } + } else { + cowify_bitmap(offset, segment->length, &segment->sector_mask, + &segment->cow_offset, bitmap, bitmap_offset, + segment->bitmap_words, bitmap_len); } - else cowify_bitmap(req->offset, req->length, &req->sector_mask, - &req->cow_offset, bitmap, bitmap_offset, - req->bitmap_words, bitmap_len); } -static int ubd_queue_one_vec(struct blk_mq_hw_ctx *hctx, struct request *req, - u64 off, struct bio_vec *bvec) +static void ubd_map_req(struct ubd *dev, struct io_thread_req *io_req, + struct request *req) { - struct ubd *dev = hctx->queue->queuedata; - struct io_thread_req *io_req; - int ret; + struct bio_vec bvec; + struct req_iterator iter; + int i = 0; + unsigned long byte_offset = io_req->offset; + int op = req_op(req); - io_req = kmalloc(sizeof(struct io_thread_req), GFP_ATOMIC); + if (op == REQ_OP_WRITE_ZEROES || op == REQ_OP_DISCARD) { + io_req->io_desc[0].buffer = NULL; + io_req->io_desc[0].length = blk_rq_bytes(req); + } else { + rq_for_each_segment(bvec, req, iter) { + BUG_ON(i >= io_req->desc_cnt); + + io_req->io_desc[i].buffer = + page_address(bvec.bv_page) + bvec.bv_offset; + io_req->io_desc[i].length = bvec.bv_len; + i++; + } + } + + if (dev->cow.file) { + for (i = 0; i < io_req->desc_cnt; i++) { + cowify_req(io_req, &io_req->io_desc[i], byte_offset, + dev->cow.bitmap, dev->cow.bitmap_offset, + dev->cow.bitmap_len); + byte_offset += io_req->io_desc[i].length; + } + + } +} + +static struct io_thread_req *ubd_alloc_req(struct ubd *dev, struct request *req, + int desc_cnt) +{ + struct io_thread_req *io_req; + int i; + + io_req = kmalloc(sizeof(*io_req) + + (desc_cnt * sizeof(struct io_desc)), + GFP_ATOMIC); if (!io_req) - return -ENOMEM; + return NULL; io_req->req = req; if (dev->cow.file) @@ -1327,26 +1367,41 @@ static int ubd_queue_one_vec(struct blk_mq_hw_ctx *hctx, struct request *req, else io_req->fds[0] = dev->fd; io_req->error = 0; - - if (bvec != NULL) { - io_req->buffer = page_address(bvec->bv_page) + bvec->bv_offset; - io_req->length = bvec->bv_len; - } else { - io_req->buffer = NULL; - io_req->length = blk_rq_bytes(req); - } - io_req->sectorsize = SECTOR_SIZE; io_req->fds[1] = dev->fd; - io_req->cow_offset = -1; - io_req->offset = off; - io_req->sector_mask = 0; + io_req->offset = (u64) blk_rq_pos(req) << SECTOR_SHIFT; io_req->offsets[0] = 0; io_req->offsets[1] = dev->cow.data_offset; - if (dev->cow.file) - cowify_req(io_req, dev->cow.bitmap, - dev->cow.bitmap_offset, dev->cow.bitmap_len); + for (i = 0 ; i < desc_cnt; i++) { + io_req->io_desc[i].sector_mask = 0; + io_req->io_desc[i].cow_offset = -1; + } + + return io_req; +} + +static int ubd_submit_request(struct ubd *dev, struct request *req) +{ + int segs = 0; + struct io_thread_req *io_req; + int ret; + int op = req_op(req); + + if (op == REQ_OP_FLUSH) + segs = 0; + else if (op == REQ_OP_WRITE_ZEROES || op == REQ_OP_DISCARD) + segs = 1; + else + segs = blk_rq_nr_phys_segments(req); + + io_req = ubd_alloc_req(dev, req, segs); + if (!io_req) + return -ENOMEM; + + io_req->desc_cnt = segs; + if (segs) + ubd_map_req(dev, io_req, req); ret = os_write_file(thread_fd, &io_req, sizeof(io_req)); if (ret != sizeof(io_req)) { @@ -1357,22 +1412,6 @@ static int ubd_queue_one_vec(struct blk_mq_hw_ctx *hctx, struct request *req, return ret; } -static int queue_rw_req(struct blk_mq_hw_ctx *hctx, struct request *req) -{ - struct req_iterator iter; - struct bio_vec bvec; - int ret; - u64 off = (u64)blk_rq_pos(req) << SECTOR_SHIFT; - - rq_for_each_segment(bvec, req, iter) { - ret = ubd_queue_one_vec(hctx, req, off, &bvec); - if (ret < 0) - return ret; - off += bvec.bv_len; - } - return 0; -} - static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { @@ -1385,17 +1424,12 @@ static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx, spin_lock_irq(&ubd_dev->lock); switch (req_op(req)) { - /* operations with no lentgth/offset arguments */ case REQ_OP_FLUSH: - ret = ubd_queue_one_vec(hctx, req, 0, NULL); - break; case REQ_OP_READ: case REQ_OP_WRITE: - ret = queue_rw_req(hctx, req); - break; case REQ_OP_DISCARD: case REQ_OP_WRITE_ZEROES: - ret = ubd_queue_one_vec(hctx, req, (u64)blk_rq_pos(req) << 9, NULL); + ret = ubd_submit_request(ubd_dev, req); break; default: WARN_ON_ONCE(1); @@ -1483,22 +1517,22 @@ static int map_error(int error_code) * will result in unpredictable behaviour and/or crashes. */ -static int update_bitmap(struct io_thread_req *req) +static int update_bitmap(struct io_thread_req *req, struct io_desc *segment) { int n; - if(req->cow_offset == -1) + if (segment->cow_offset == -1) return map_error(0); - n = os_pwrite_file(req->fds[1], &req->bitmap_words, - sizeof(req->bitmap_words), req->cow_offset); - if (n != sizeof(req->bitmap_words)) + n = os_pwrite_file(req->fds[1], &segment->bitmap_words, + sizeof(segment->bitmap_words), segment->cow_offset); + if (n != sizeof(segment->bitmap_words)) return map_error(-n); return map_error(0); } -static void do_io(struct io_thread_req *req) +static void do_io(struct io_thread_req *req, struct io_desc *desc) { char *buf = NULL; unsigned long len; @@ -1513,21 +1547,20 @@ static void do_io(struct io_thread_req *req) return; } - nsectors = req->length / req->sectorsize; + nsectors = desc->length / req->sectorsize; start = 0; do { - bit = ubd_test_bit(start, (unsigned char *) &req->sector_mask); + bit = ubd_test_bit(start, (unsigned char *) &desc->sector_mask); end = start; while((end < nsectors) && - (ubd_test_bit(end, (unsigned char *) - &req->sector_mask) == bit)) + (ubd_test_bit(end, (unsigned char *) &desc->sector_mask) == bit)) end++; off = req->offset + req->offsets[bit] + start * req->sectorsize; len = (end - start) * req->sectorsize; - if (req->buffer != NULL) - buf = &req->buffer[start * req->sectorsize]; + if (desc->buffer != NULL) + buf = &desc->buffer[start * req->sectorsize]; switch (req_op(req->req)) { case REQ_OP_READ: @@ -1567,7 +1600,8 @@ static void do_io(struct io_thread_req *req) start = end; } while(start < nsectors); - req->error = update_bitmap(req); + req->offset += len; + req->error = update_bitmap(req, desc); } /* Changed in start_io_thread, which is serialized by being called only @@ -1600,8 +1634,13 @@ int io_thread(void *arg) } for (count = 0; count < n/sizeof(struct io_thread_req *); count++) { + struct io_thread_req *req = (*io_req_buffer)[count]; + int i; + io_count++; - do_io((*io_req_buffer)[count]); + for (i = 0; !req->error && i < req->desc_cnt; i++) + do_io(req, &(req->io_desc[i])); + } written = 0; From f4ab7818ef7add1e10b33d8c3a4fe44858b7f6e9 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 4 Dec 2020 16:22:44 +0100 Subject: [PATCH 07/32] um: line: Don't free winch (with IRQ) under spinlock Lockdep correctly complains that one shouldn't call um_free_irq() with free_irq() inside under a spinlock since that will attempt to acquire a mutex. Rearrange the code to keep the list manipulations under the lock while moving the actual freeing outside of it, to avoid this. In particular, this removes the lockdep complaint at shutdown that I was seeing with lockdep enabled. Signed-off-by: Johannes Berg Acked-By: anton.ivanov@cambridgegreys.com Signed-off-by: Richard Weinberger --- arch/um/drivers/line.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c index 14ad9f495fe6..77ce63a57070 100644 --- a/arch/um/drivers/line.c +++ b/arch/um/drivers/line.c @@ -608,7 +608,6 @@ static void free_winch(struct winch *winch) winch->fd = -1; if (fd != -1) os_close_file(fd); - list_del(&winch->list); __free_winch(&winch->work); } @@ -709,6 +708,8 @@ static void unregister_winch(struct tty_struct *tty) winch = list_entry(ele, struct winch, list); wtty = tty_port_tty_get(winch->port); if (wtty == tty) { + list_del(&winch->list); + spin_unlock(&winch_handler_lock); free_winch(winch); break; } @@ -719,14 +720,17 @@ static void unregister_winch(struct tty_struct *tty) static void winch_cleanup(void) { - struct list_head *ele, *next; struct winch *winch; spin_lock(&winch_handler_lock); + while ((winch = list_first_entry_or_null(&winch_handlers, + struct winch, list))) { + list_del(&winch->list); + spin_unlock(&winch_handler_lock); - list_for_each_safe(ele, next, &winch_handlers) { - winch = list_entry(ele, struct winch, list); free_winch(winch); + + spin_lock(&winch_handler_lock); } spin_unlock(&winch_handler_lock); From 517f60206ee5d5f75c44bd9c8b1683d1d18a616a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 9 Dec 2020 20:19:15 +0200 Subject: [PATCH 08/32] um: Increase stack frame size threshold for signal.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The signal.c can't use heap for bit data located on stack. However, by default a compiler warns us about overstepping stack frame size threshold: arch/um/os-Linux/signal.c: In function ‘sig_handler_common’: arch/um/os-Linux/signal.c:51:1: warning: the frame size of 2960 bytes is larger than 2048 bytes [-Wframe-larger-than=] 51 | } | ^ arch/um/os-Linux/signal.c: In function ‘timer_real_alarm_handler’: arch/um/os-Linux/signal.c:95:1: warning: the frame size of 2960 bytes is larger than 2048 bytes [-Wframe-larger-than=] 95 | } | ^ Due to above increase stack frame size threshold explicitly for signal.c to avoid unnecessary warning. Signed-off-by: Andy Shevchenko Tested-by: David Gow Signed-off-by: Richard Weinberger --- arch/um/os-Linux/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/um/os-Linux/Makefile b/arch/um/os-Linux/Makefile index 839915b8c31c..77ac50baa3f8 100644 --- a/arch/um/os-Linux/Makefile +++ b/arch/um/os-Linux/Makefile @@ -10,6 +10,8 @@ obj-y = execvp.o file.o helper.o irq.o main.o mem.o process.o \ registers.o sigio.o signal.o start_up.o time.o tty.o \ umid.o user_syms.o util.o drivers/ skas/ +CFLAGS_signal.o += -Wframe-larger-than=4096 + obj-$(CONFIG_ARCH_REUSE_HOST_VSYSCALL_AREA) += elf_aux.o USER_OBJS := $(user-objs-y) elf_aux.o execvp.o file.o helper.o irq.o \ From ef3ba87cb7c911bb5073e9ad30c4b37369e1a060 Mon Sep 17 00:00:00 2001 From: Christopher Obbard Date: Mon, 23 Nov 2020 18:31:37 +0000 Subject: [PATCH 09/32] um: ubd: Set device serial attribute from cmdline Adds the ability to set the UBD device serial number from the commandline, disabling the serial number functionality by default. In some cases it may be useful to set a serial to the UBD device, such that downstream users (i.e. udev) can use this information to better describe the hardware to the user from the UML cmdline. In our case we use this parameter to create some entries under /dev/disk/by-ubd-id/ for each of the UBD devices passed through the UML cmdline. Signed-off-by: Christopher Obbard Signed-off-by: Richard Weinberger --- arch/um/drivers/ubd_kern.c | 78 ++++++++++++++++++++++++++++++-------- 1 file changed, 62 insertions(+), 16 deletions(-) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index b12c1b0d3e1d..3b48640f89d5 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -155,6 +155,7 @@ struct ubd { /* name (and fd, below) of the file opened for writing, either the * backing or the cow file. */ char *file; + char *serial; int count; int fd; __u64 size; @@ -180,6 +181,7 @@ struct ubd { #define DEFAULT_UBD { \ .file = NULL, \ + .serial = NULL, \ .count = 0, \ .fd = -1, \ .size = -1, \ @@ -272,7 +274,7 @@ static int ubd_setup_common(char *str, int *index_out, char **error_out) { struct ubd *ubd_dev; struct openflags flags = global_openflags; - char *backing_file; + char *file, *backing_file, *serial; int n, err = 0, i; if(index_out) *index_out = -1; @@ -368,24 +370,27 @@ static int ubd_setup_common(char *str, int *index_out, char **error_out) goto out; break_loop: - backing_file = strchr(str, ','); + file = strsep(&str, ",:"); + if (*file == '\0') + file = NULL; - if (backing_file == NULL) - backing_file = strchr(str, ':'); + backing_file = strsep(&str, ",:"); + if (*backing_file == '\0') + backing_file = NULL; - if(backing_file != NULL){ - if(ubd_dev->no_cow){ - *error_out = "Can't specify both 'd' and a cow file"; - goto out; - } - else { - *backing_file = '\0'; - backing_file++; - } + serial = strsep(&str, ",:"); + if (*serial == '\0') + serial = NULL; + + if (backing_file && ubd_dev->no_cow) { + *error_out = "Can't specify both 'd' and a cow file"; + goto out; } + err = 0; - ubd_dev->file = str; + ubd_dev->file = file; ubd_dev->cow.file = backing_file; + ubd_dev->serial = serial; ubd_dev->boot_openflags = flags; out: mutex_unlock(&ubd_lock); @@ -406,7 +411,7 @@ static int ubd_setup(char *str) __setup("ubd", ubd_setup); __uml_help(ubd_setup, -"ubd=[(:|,)]\n" +"ubd=[(:|,)][(:|,)]\n" " This is used to associate a device with a file in the underlying\n" " filesystem. When specifying two filenames, the first one is the\n" " COW name and the second is the backing file name. As separator you can\n" @@ -429,6 +434,12 @@ __uml_help(ubd_setup, " UMLs and file locking will be turned off - this is appropriate for a\n" " cluster filesystem and inappropriate at almost all other times.\n\n" " 't' will disable trim/discard support on the device (enabled by default).\n\n" +" An optional device serial number can be exposed using the serial parameter\n" +" on the cmdline which is exposed as a sysfs entry. This is particularly\n" +" useful when a unique number should be given to the device. Note when\n" +" specifying a label, the filename2 must be also presented. It can be\n" +" an empty string, in which case the backing file is not used:\n" +" ubd0=File,,Serial\n" ); static int udb_setup(char *str) @@ -868,6 +879,41 @@ static void ubd_device_release(struct device *dev) *ubd_dev = ((struct ubd) DEFAULT_UBD); } +static ssize_t serial_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + struct ubd *ubd_dev = disk->private_data; + + if (!ubd_dev) + return 0; + + return sprintf(buf, "%s", ubd_dev->serial); +} + +static DEVICE_ATTR_RO(serial); + +static struct attribute *ubd_attrs[] = { + &dev_attr_serial.attr, + NULL, +}; + +static umode_t ubd_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + return a->mode; +} + +static const struct attribute_group ubd_attr_group = { + .attrs = ubd_attrs, + .is_visible = ubd_attrs_are_visible, +}; + +static const struct attribute_group *ubd_attr_groups[] = { + &ubd_attr_group, + NULL, +}; + static int ubd_disk_register(int major, u64 size, int unit, struct gendisk **disk_out) { @@ -899,7 +945,7 @@ static int ubd_disk_register(int major, u64 size, int unit, disk->private_data = &ubd_devs[unit]; disk->queue = ubd_devs[unit].queue; - device_add_disk(parent, disk, NULL); + device_add_disk(parent, disk, ubd_attr_groups); *disk_out = disk; return 0; From d66c91836b8d7df3b6f0fe7f0c7617d28ebfcb4c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 23 Nov 2020 20:44:02 +0100 Subject: [PATCH 10/32] um: sigio: Return error from add_sigio_fd() If we run out of space, return an error instead of 0. Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/os-Linux/sigio.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/um/os-Linux/sigio.c b/arch/um/os-Linux/sigio.c index 75558080d0bf..f91fd16e9911 100644 --- a/arch/um/os-Linux/sigio.c +++ b/arch/um/os-Linux/sigio.c @@ -167,15 +167,17 @@ static void update_thread(void) int add_sigio_fd(int fd) { struct pollfd *p; - int err = 0, i, n; + int err, i, n; sigio_lock(); for (i = 0; i < all_sigio_fds.used; i++) { if (all_sigio_fds.poll[i].fd == fd) break; } - if (i == all_sigio_fds.used) + if (i == all_sigio_fds.used) { + err = -ENOSPC; goto out; + } p = &all_sigio_fds.poll[i]; From 36d46a5907ba170965307c9d106cc35517acbf33 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 2 Dec 2020 12:59:50 +0100 Subject: [PATCH 11/32] um: Support dynamic IRQ allocation It's cumbersome and error-prone to keep adding fixed IRQ numbers, and for proper device wakeup support for the virtio/vhost-user support we need to have different IRQs for each device. Even if in theory two IRQs (with and without wake) might be sufficient, it's much easier to reason about it when we have dynamic number assignment. It also makes it easier to add new devices that may dynamically exist or depending on the configuration, etc. Add support for this, up to 64 IRQs (the same limit as epoll FDs we have right now). Since it's not easy to port all the existing places to dynamic allocation (some data is statically initialized) keep the low numbers are reserved for the existing hard-coded IRQ numbers. Acked-By: Anton Ivanov Signed-off-by: Johannes Berg Acked-By: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/drivers/line.c | 18 ++++++++++----- arch/um/drivers/mconsole_kern.c | 2 +- arch/um/drivers/net_kern.c | 2 +- arch/um/drivers/port_kern.c | 4 ++-- arch/um/drivers/random.c | 2 +- arch/um/drivers/ubd_kern.c | 2 +- arch/um/drivers/vector_kern.c | 4 ++-- arch/um/drivers/virtio_uml.c | 4 ++-- arch/um/drivers/xterm_kern.c | 2 +- arch/um/include/asm/irq.h | 6 ++--- arch/um/include/shared/irq_kern.h | 12 +++++----- arch/um/kernel/irq.c | 37 ++++++++++++++++++++++++++----- arch/um/kernel/sigio.c | 2 +- 13 files changed, 65 insertions(+), 32 deletions(-) diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c index 77ce63a57070..1c70a31e7c5b 100644 --- a/arch/um/drivers/line.c +++ b/arch/um/drivers/line.c @@ -262,19 +262,25 @@ static irqreturn_t line_write_interrupt(int irq, void *data) int line_setup_irq(int fd, int input, int output, struct line *line, void *data) { const struct line_driver *driver = line->driver; - int err = 0; + int err; - if (input) + if (input) { err = um_request_irq(driver->read_irq, fd, IRQ_READ, line_interrupt, IRQF_SHARED, driver->read_irq_name, data); - if (err) - return err; - if (output) + if (err < 0) + return err; + } + + if (output) { err = um_request_irq(driver->write_irq, fd, IRQ_WRITE, line_write_interrupt, IRQF_SHARED, driver->write_irq_name, data); - return err; + if (err < 0) + return err; + } + + return 0; } static int line_activate(struct tty_port *port, struct tty_struct *tty) diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index a2e680f7d39f..6d00af25ec6b 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -738,7 +738,7 @@ static int __init mconsole_init(void) err = um_request_irq(MCONSOLE_IRQ, sock, IRQ_READ, mconsole_interrupt, IRQF_SHARED, "mconsole", (void *)sock); - if (err) { + if (err < 0) { printk(KERN_ERR "Failed to get IRQ for management console\n"); goto out; } diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c index 1802cf4ef5a5..2fc0b038ff8a 100644 --- a/arch/um/drivers/net_kern.c +++ b/arch/um/drivers/net_kern.c @@ -160,7 +160,7 @@ static int uml_net_open(struct net_device *dev) err = um_request_irq(dev->irq, lp->fd, IRQ_READ, uml_net_interrupt, IRQF_SHARED, dev->name, dev); - if (err != 0) { + if (err < 0) { printk(KERN_ERR "uml_net_open: failed to get irq(%d)\n", err); err = -ENETUNREACH; goto out_close; diff --git a/arch/um/drivers/port_kern.c b/arch/um/drivers/port_kern.c index a47ca5376d9d..efa8b7304090 100644 --- a/arch/um/drivers/port_kern.c +++ b/arch/um/drivers/port_kern.c @@ -100,7 +100,7 @@ static int port_accept(struct port_list *port) .port = port }); if (um_request_irq(TELNETD_IRQ, socket[0], IRQ_READ, pipe_interrupt, - IRQF_SHARED, "telnetd", conn)) { + IRQF_SHARED, "telnetd", conn) < 0) { printk(KERN_ERR "port_accept : failed to get IRQ for " "telnetd\n"); goto out_free; @@ -182,7 +182,7 @@ void *port_data(int port_num) } if (um_request_irq(ACCEPT_IRQ, fd, IRQ_READ, port_interrupt, - IRQF_SHARED, "port", port)) { + IRQF_SHARED, "port", port) < 0) { printk(KERN_ERR "Failed to get IRQ for port %d\n", port_num); goto out_close; } diff --git a/arch/um/drivers/random.c b/arch/um/drivers/random.c index e4b9b2ce9abf..bd3059adc2fb 100644 --- a/arch/um/drivers/random.c +++ b/arch/um/drivers/random.c @@ -76,7 +76,7 @@ static int __init rng_init (void) random_fd = err; err = um_request_irq(RANDOM_IRQ, random_fd, IRQ_READ, random_interrupt, 0, "random", NULL); - if (err) + if (err < 0) goto err_out_cleanup_hw; sigio_broken(random_fd, 1); diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 3b48640f89d5..7e07b321e847 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -1253,7 +1253,7 @@ static int __init ubd_driver_init(void){ } err = um_request_irq(UBD_IRQ, thread_fd, IRQ_READ, ubd_intr, 0, "ubd", ubd_devs); - if(err != 0) + if(err < 0) printk(KERN_ERR "um_request_irq failed - errno = %d\n", -err); return 0; } diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c index aa6ba9d61e9b..47a02e60898d 100644 --- a/arch/um/drivers/vector_kern.c +++ b/arch/um/drivers/vector_kern.c @@ -1271,7 +1271,7 @@ static int vector_net_open(struct net_device *dev) irq_rr + VECTOR_BASE_IRQ, vp->fds->rx_fd, IRQ_READ, vector_rx_interrupt, IRQF_SHARED, dev->name, dev); - if (err != 0) { + if (err < 0) { netdev_err(dev, "vector_open: failed to get rx irq(%d)\n", err); err = -ENETUNREACH; goto out_close; @@ -1286,7 +1286,7 @@ static int vector_net_open(struct net_device *dev) irq_rr + VECTOR_BASE_IRQ, vp->fds->tx_fd, IRQ_WRITE, vector_tx_interrupt, IRQF_SHARED, dev->name, dev); - if (err != 0) { + if (err < 0) { netdev_err(dev, "vector_open: failed to get tx irq(%d)\n", err); err = -ENETUNREACH; diff --git a/arch/um/drivers/virtio_uml.c b/arch/um/drivers/virtio_uml.c index a6c4bb6c2c01..f76b8da28d20 100644 --- a/arch/um/drivers/virtio_uml.c +++ b/arch/um/drivers/virtio_uml.c @@ -412,7 +412,7 @@ static int vhost_user_init_slave_req(struct virtio_uml_device *vu_dev) rc = um_request_irq(VIRTIO_IRQ, vu_dev->req_fd, IRQ_READ, vu_req_interrupt, IRQF_SHARED, vu_dev->pdev->name, vu_dev); - if (rc) + if (rc < 0) goto err_close; rc = vhost_user_send_no_payload_fd(vu_dev, VHOST_USER_SET_SLAVE_REQ_FD, @@ -854,7 +854,7 @@ static int vu_setup_vq_call_fd(struct virtio_uml_device *vu_dev, info->call_fd = call_fds[0]; rc = um_request_irq(VIRTIO_IRQ, info->call_fd, IRQ_READ, vu_interrupt, IRQF_SHARED, info->name, vq); - if (rc) + if (rc < 0) goto close_both; rc = vhost_user_set_vring_call(vu_dev, vq->index, call_fds[1]); diff --git a/arch/um/drivers/xterm_kern.c b/arch/um/drivers/xterm_kern.c index d64ef6d0d463..50f11b7b4774 100644 --- a/arch/um/drivers/xterm_kern.c +++ b/arch/um/drivers/xterm_kern.c @@ -51,7 +51,7 @@ int xterm_fd(int socket, int *pid_out) err = um_request_irq(XTERM_IRQ, socket, IRQ_READ, xterm_interrupt, IRQF_SHARED, "xterm", data); - if (err) { + if (err < 0) { printk(KERN_ERR "xterm_fd : failed to get IRQ for xterm, " "err = %d\n", err); ret = err; diff --git a/arch/um/include/asm/irq.h b/arch/um/include/asm/irq.h index 42c6205e2dc4..b6fa6301c75b 100644 --- a/arch/um/include/asm/irq.h +++ b/arch/um/include/asm/irq.h @@ -24,14 +24,14 @@ #define VECTOR_BASE_IRQ (VIRTIO_IRQ + 1) #define VECTOR_IRQ_SPACE 8 -#define LAST_IRQ (VECTOR_IRQ_SPACE + VECTOR_BASE_IRQ - 1) +#define UM_FIRST_DYN_IRQ (VECTOR_IRQ_SPACE + VECTOR_BASE_IRQ) #else -#define LAST_IRQ VIRTIO_IRQ +#define UM_FIRST_DYN_IRQ (VIRTIO_IRQ + 1) #endif -#define NR_IRQS (LAST_IRQ + 1) +#define NR_IRQS 64 #endif diff --git a/arch/um/include/shared/irq_kern.h b/arch/um/include/shared/irq_kern.h index 7cd1a10c6244..7c04a0fd3a27 100644 --- a/arch/um/include/shared/irq_kern.h +++ b/arch/um/include/shared/irq_kern.h @@ -9,10 +9,10 @@ #include #include -extern int um_request_irq(unsigned int irq, int fd, int type, - irq_handler_t handler, - unsigned long irqflags, const char * devname, - void *dev_id); -void um_free_irq(unsigned int irq, void *dev); -#endif +#define UM_IRQ_ALLOC -1 +int um_request_irq(int irq, int fd, int type, irq_handler_t handler, + unsigned long irqflags, const char * devname, + void *dev_id); +void um_free_irq(int irq, void *dev_id); +#endif diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index 3577118bb4a5..b94c72f56617 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -19,6 +19,7 @@ #include #include #include +#include extern void free_irqs(void); @@ -38,6 +39,7 @@ struct irq_entry { static struct irq_entry *active_fds; static DEFINE_SPINLOCK(irq_lock); +static DECLARE_BITMAP(irqs_allocated, NR_IRQS); static void irq_io_loop(struct irq_fd *irq, struct uml_pt_regs *regs) { @@ -421,27 +423,52 @@ unsigned int do_IRQ(int irq, struct uml_pt_regs *regs) return 1; } -void um_free_irq(unsigned int irq, void *dev) +void um_free_irq(int irq, void *dev) { + if (WARN(irq < 0 || irq > NR_IRQS, "freeing invalid irq %d", irq)) + return; + free_irq_by_irq_and_dev(irq, dev); free_irq(irq, dev); + clear_bit(irq, irqs_allocated); } EXPORT_SYMBOL(um_free_irq); -int um_request_irq(unsigned int irq, int fd, int type, +int um_request_irq(int irq, int fd, int type, irq_handler_t handler, unsigned long irqflags, const char * devname, void *dev_id) { int err; + if (irq == UM_IRQ_ALLOC) { + int i; + + for (i = UM_FIRST_DYN_IRQ; i < NR_IRQS; i++) { + if (!test_and_set_bit(i, irqs_allocated)) { + irq = i; + break; + } + } + } + + if (irq < 0) + return -ENOSPC; + if (fd != -1) { err = activate_fd(irq, fd, type, dev_id); if (err) - return err; + goto error; } - return request_irq(irq, handler, irqflags, devname, dev_id); + err = request_irq(irq, handler, irqflags, devname, dev_id); + if (err < 0) + goto error; + + return irq; +error: + clear_bit(irq, irqs_allocated); + return err; } EXPORT_SYMBOL(um_request_irq); @@ -480,7 +507,7 @@ void __init init_IRQ(void) irq_set_chip_and_handler(TIMER_IRQ, &SIGVTALRM_irq_type, handle_edge_irq); - for (i = 1; i <= LAST_IRQ; i++) + for (i = 1; i < NR_IRQS; i++) irq_set_chip_and_handler(i, &normal_irq_type, handle_edge_irq); /* Initialize EPOLL Loop */ os_setup_epoll(); diff --git a/arch/um/kernel/sigio.c b/arch/um/kernel/sigio.c index d1cffc2a7f21..5085a50c3b8c 100644 --- a/arch/um/kernel/sigio.c +++ b/arch/um/kernel/sigio.c @@ -25,7 +25,7 @@ int write_sigio_irq(int fd) err = um_request_irq(SIGIO_WRITE_IRQ, fd, IRQ_READ, sigio_interrupt, 0, "write sigio", NULL); - if (err) { + if (err < 0) { printk(KERN_ERR "write_sigio_irq : um_request_irq failed, " "err = %d\n", err); return -1; From aaf5800e249fc4f4a89d1025ef7f0b330fb64cb8 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 2 Dec 2020 12:59:51 +0100 Subject: [PATCH 12/32] um: virtio: Use dynamic IRQ allocation This separates the devices, which is better for debug and for later suspend/resume and wakeup support, since there we'll have to separate which IRQs can wake up the system and which cannot. Acked-By: Anton Ivanov Signed-off-by: Johannes Berg Acked-By: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/drivers/virtio_uml.c | 22 ++++++++++++++-------- arch/um/include/asm/irq.h | 5 ++--- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/arch/um/drivers/virtio_uml.c b/arch/um/drivers/virtio_uml.c index f76b8da28d20..94b112749d5b 100644 --- a/arch/um/drivers/virtio_uml.c +++ b/arch/um/drivers/virtio_uml.c @@ -55,7 +55,7 @@ struct virtio_uml_device { struct platform_device *pdev; spinlock_t sock_lock; - int sock, req_fd; + int sock, req_fd, irq; u64 features; u64 protocol_features; u8 status; @@ -409,12 +409,14 @@ static int vhost_user_init_slave_req(struct virtio_uml_device *vu_dev) return rc; vu_dev->req_fd = req_fds[0]; - rc = um_request_irq(VIRTIO_IRQ, vu_dev->req_fd, IRQ_READ, + rc = um_request_irq(UM_IRQ_ALLOC, vu_dev->req_fd, IRQ_READ, vu_req_interrupt, IRQF_SHARED, vu_dev->pdev->name, vu_dev); if (rc < 0) goto err_close; + vu_dev->irq = rc; + rc = vhost_user_send_no_payload_fd(vu_dev, VHOST_USER_SET_SLAVE_REQ_FD, req_fds[1]); if (rc) @@ -423,7 +425,7 @@ static int vhost_user_init_slave_req(struct virtio_uml_device *vu_dev) goto out; err_free_irq: - um_free_irq(VIRTIO_IRQ, vu_dev); + um_free_irq(vu_dev->irq, vu_dev); err_close: os_close_file(req_fds[0]); out: @@ -802,7 +804,11 @@ static void vu_del_vq(struct virtqueue *vq) struct virtio_uml_vq_info *info = vq->priv; if (info->call_fd >= 0) { - um_free_irq(VIRTIO_IRQ, vq); + struct virtio_uml_device *vu_dev; + + vu_dev = to_virtio_uml_device(vq->vdev); + + um_free_irq(vu_dev->irq, vq); os_close_file(info->call_fd); } @@ -852,7 +858,7 @@ static int vu_setup_vq_call_fd(struct virtio_uml_device *vu_dev, return rc; info->call_fd = call_fds[0]; - rc = um_request_irq(VIRTIO_IRQ, info->call_fd, IRQ_READ, + rc = um_request_irq(vu_dev->irq, info->call_fd, IRQ_READ, vu_interrupt, IRQF_SHARED, info->name, vq); if (rc < 0) goto close_both; @@ -864,7 +870,7 @@ static int vu_setup_vq_call_fd(struct virtio_uml_device *vu_dev, goto out; release_irq: - um_free_irq(VIRTIO_IRQ, vq); + um_free_irq(vu_dev->irq, vq); close_both: os_close_file(call_fds[0]); out: @@ -969,7 +975,7 @@ static struct virtqueue *vu_setup_vq(struct virtio_device *vdev, error_setup: if (info->call_fd >= 0) { - um_free_irq(VIRTIO_IRQ, vq); + um_free_irq(vu_dev->irq, vq); os_close_file(info->call_fd); } error_call: @@ -1078,7 +1084,7 @@ static void virtio_uml_release_dev(struct device *d) /* might not have been opened due to not negotiating the feature */ if (vu_dev->req_fd >= 0) { - um_free_irq(VIRTIO_IRQ, vu_dev); + um_free_irq(vu_dev->irq, vu_dev); os_close_file(vu_dev->req_fd); } diff --git a/arch/um/include/asm/irq.h b/arch/um/include/asm/irq.h index b6fa6301c75b..547bff7b3a89 100644 --- a/arch/um/include/asm/irq.h +++ b/arch/um/include/asm/irq.h @@ -17,18 +17,17 @@ #define TELNETD_IRQ 12 #define XTERM_IRQ 13 #define RANDOM_IRQ 14 -#define VIRTIO_IRQ 15 #ifdef CONFIG_UML_NET_VECTOR -#define VECTOR_BASE_IRQ (VIRTIO_IRQ + 1) +#define VECTOR_BASE_IRQ (RANDOM_IRQ + 1) #define VECTOR_IRQ_SPACE 8 #define UM_FIRST_DYN_IRQ (VECTOR_IRQ_SPACE + VECTOR_BASE_IRQ) #else -#define UM_FIRST_DYN_IRQ (VIRTIO_IRQ + 1) +#define UM_FIRST_DYN_IRQ (RANDOM_IRQ + 1) #endif From 0ede3c05eec875d05a397d16808857492d206dcf Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 2 Dec 2020 12:59:52 +0100 Subject: [PATCH 13/32] um: Clean up alarm IRQ chip name We don't use "SIGVTALRM", it's just SIGALRM. Clean up the naming. While at it, fix the comment's grammar. Acked-By: Anton Ivanov Signed-off-by: Johannes Berg Acked-By: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/kernel/irq.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index b94c72f56617..97ff77c297c8 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -481,7 +481,7 @@ static void dummy(struct irq_data *d) { } -/* This is used for everything else than the timer. */ +/* This is used for everything other than the timer. */ static struct irq_chip normal_irq_type = { .name = "SIGIO", .irq_disable = dummy, @@ -491,8 +491,8 @@ static struct irq_chip normal_irq_type = { .irq_unmask = dummy, }; -static struct irq_chip SIGVTALRM_irq_type = { - .name = "SIGVTALRM", +static struct irq_chip alarm_irq_type = { + .name = "SIGALRM", .irq_disable = dummy, .irq_enable = dummy, .irq_ack = dummy, @@ -504,8 +504,7 @@ void __init init_IRQ(void) { int i; - irq_set_chip_and_handler(TIMER_IRQ, &SIGVTALRM_irq_type, handle_edge_irq); - + irq_set_chip_and_handler(TIMER_IRQ, &alarm_irq_type, handle_edge_irq); for (i = 1; i < NR_IRQS; i++) irq_set_chip_and_handler(i, &normal_irq_type, handle_edge_irq); From 458e1f7da004ec264cf2a9252822955ba4f7e9a0 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 2 Dec 2020 12:59:53 +0100 Subject: [PATCH 14/32] um: irq: Clean up and rename struct irq_fd This really shouldn't be called "irq_fd" since it doesn't carry an fd. Well, it used to, apparently, but that struct member is unused. Rename it to "irq_reg" since it more accurately reflects a registered interrupt, and remove the unused 'next' and 'fd' members from the struct as well. While at it, also move it to the implementation, it's not used anywhere else, and the header file is shared with the userspace components. Acked-By: Anton Ivanov Signed-off-by: Johannes Berg Acked-By: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/include/shared/irq_user.h | 14 ------------- arch/um/kernel/irq.c | 34 ++++++++++++++++++++----------- 2 files changed, 22 insertions(+), 26 deletions(-) diff --git a/arch/um/include/shared/irq_user.h b/arch/um/include/shared/irq_user.h index 107751dce153..2dd5fd7d9443 100644 --- a/arch/um/include/shared/irq_user.h +++ b/arch/um/include/shared/irq_user.h @@ -9,25 +9,11 @@ #include #include -struct irq_fd { - struct irq_fd *next; - void *id; - int fd; - int type; - int irq; - int events; - bool active; - bool pending; - bool purge; -}; - #define IRQ_READ 0 #define IRQ_WRITE 1 #define IRQ_NONE 2 #define MAX_IRQ_TYPE (IRQ_NONE + 1) - - struct siginfo; extern void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs); extern void free_irq_by_fd(int fd); diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index 97ff77c297c8..923a80c9808a 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -26,14 +26,24 @@ extern void free_irqs(void); /* When epoll triggers we do not know why it did so * we can also have different IRQs for read and write. - * This is why we keep a small irq_fd array for each fd - + * This is why we keep a small irq_reg array for each fd - * one entry per IRQ type */ +struct irq_reg { + void *id; + int type; + int irq; + int events; + bool active; + bool pending; + bool purge; +}; + struct irq_entry { struct irq_entry *next; int fd; - struct irq_fd *irq_array[MAX_IRQ_TYPE + 1]; + struct irq_reg *irq_array[MAX_IRQ_TYPE + 1]; }; static struct irq_entry *active_fds; @@ -41,7 +51,7 @@ static struct irq_entry *active_fds; static DEFINE_SPINLOCK(irq_lock); static DECLARE_BITMAP(irqs_allocated, NR_IRQS); -static void irq_io_loop(struct irq_fd *irq, struct uml_pt_regs *regs) +static void irq_io_loop(struct irq_reg *irq, struct uml_pt_regs *regs) { /* * irq->active guards against reentry @@ -65,7 +75,7 @@ static void irq_io_loop(struct irq_fd *irq, struct uml_pt_regs *regs) void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) { struct irq_entry *irq_entry; - struct irq_fd *irq; + struct irq_reg *irq; int n, i, j; @@ -86,7 +96,7 @@ void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) } for (i = 0; i < n ; i++) { - /* Epoll back reference is the entry with 3 irq_fd + /* Epoll back reference is the entry with 3 irq_reg * leaves - one for each irq type. */ irq_entry = (struct irq_entry *) @@ -112,7 +122,7 @@ static int assign_epoll_events_to_irq(struct irq_entry *irq_entry) { int i; int events = 0; - struct irq_fd *irq; + struct irq_reg *irq; for (i = 0; i < MAX_IRQ_TYPE ; i++) { irq = irq_entry->irq_array[i]; @@ -131,7 +141,7 @@ static int assign_epoll_events_to_irq(struct irq_entry *irq_entry) static int activate_fd(int irq, int fd, int type, void *dev_id) { - struct irq_fd *new_fd; + struct irq_reg *new_fd; struct irq_entry *irq_entry; int i, err, events; unsigned long flags; @@ -182,13 +192,13 @@ static int activate_fd(int irq, int fd, int type, void *dev_id) /* New entry for this fd */ err = -ENOMEM; - new_fd = kmalloc(sizeof(struct irq_fd), GFP_ATOMIC); + new_fd = kmalloc(sizeof(struct irq_reg), GFP_ATOMIC); if (new_fd == NULL) goto out_unlock; events = os_event_mask(type); - *new_fd = ((struct irq_fd) { + *new_fd = ((struct irq_reg) { .id = dev_id, .irq = irq, .type = type, @@ -273,8 +283,8 @@ static struct irq_entry *get_irq_entry_by_fd(int fd) /* * Walk the IRQ list and dispose of an entry for a specific - * device, fd and number. Note - if sharing an IRQ for read - * and writefor the same FD it will be disposed in either case. + * device and number. Note - if sharing an IRQ for read + * and write for the same FD it will be disposed in either case. * If this behaviour is undesirable use different IRQ ids. */ @@ -289,7 +299,7 @@ static void do_free_by_irq_and_dev( ) { int i; - struct irq_fd *to_free; + struct irq_reg *to_free; for (i = 0; i < MAX_IRQ_TYPE ; i++) { if (irq_entry->irq_array[i] != NULL) { From 0737402f42d3cdc7b7ef27e8cc7caf1e9ba2a2bc Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 2 Dec 2020 12:59:54 +0100 Subject: [PATCH 15/32] um: irq: Reduce irq_reg allocation We don't need an array of 4 entries to capture three and the name 'MAX_IRQ_TYPE' really gets confusing as well. Remove it and add a correct NUM_IRQ_TYPES, and use that correctly. Signed-off-by: Johannes Berg Acked-By: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/include/shared/irq_user.h | 2 +- arch/um/kernel/irq.c | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/um/include/shared/irq_user.h b/arch/um/include/shared/irq_user.h index 2dd5fd7d9443..5e975a9e8354 100644 --- a/arch/um/include/shared/irq_user.h +++ b/arch/um/include/shared/irq_user.h @@ -12,7 +12,7 @@ #define IRQ_READ 0 #define IRQ_WRITE 1 #define IRQ_NONE 2 -#define MAX_IRQ_TYPE (IRQ_NONE + 1) +#define NUM_IRQ_TYPES (IRQ_NONE + 1) struct siginfo; extern void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs); diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index 923a80c9808a..93eb742ecafe 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -43,7 +43,7 @@ struct irq_reg { struct irq_entry { struct irq_entry *next; int fd; - struct irq_reg *irq_array[MAX_IRQ_TYPE + 1]; + struct irq_reg *irq_array[NUM_IRQ_TYPES]; }; static struct irq_entry *active_fds; @@ -101,7 +101,7 @@ void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) */ irq_entry = (struct irq_entry *) os_epoll_get_data_pointer(i); - for (j = 0; j < MAX_IRQ_TYPE ; j++) { + for (j = 0; j < NUM_IRQ_TYPES ; j++) { irq = irq_entry->irq_array[j]; if (irq == NULL) continue; @@ -124,7 +124,7 @@ static int assign_epoll_events_to_irq(struct irq_entry *irq_entry) int events = 0; struct irq_reg *irq; - for (i = 0; i < MAX_IRQ_TYPE ; i++) { + for (i = 0; i < NUM_IRQ_TYPES ; i++) { irq = irq_entry->irq_array[i]; if (irq != NULL) events = irq->events | events; @@ -172,7 +172,7 @@ static int activate_fd(int irq, int fd, int type, void *dev_id) goto out_unlock; } irq_entry->fd = fd; - for (i = 0; i < MAX_IRQ_TYPE; i++) + for (i = 0; i < NUM_IRQ_TYPES; i++) irq_entry->irq_array[i] = NULL; irq_entry->next = active_fds; active_fds = irq_entry; @@ -244,7 +244,7 @@ static void garbage_collect_irq_entries(void) walk = active_fds; while (walk != NULL) { reap = true; - for (i = 0; i < MAX_IRQ_TYPE ; i++) { + for (i = 0; i < NUM_IRQ_TYPES ; i++) { if (walk->irq_array[i] != NULL) { reap = false; break; @@ -301,7 +301,7 @@ static void do_free_by_irq_and_dev( int i; struct irq_reg *to_free; - for (i = 0; i < MAX_IRQ_TYPE ; i++) { + for (i = 0; i < NUM_IRQ_TYPES ; i++) { if (irq_entry->irq_array[i] != NULL) { if ( ((flags & IGNORE_IRQ) || From 2fccfcc0c742625c01e6a3913f4fc2d330541fbb Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 2 Dec 2020 12:59:55 +0100 Subject: [PATCH 16/32] um: Remove IRQ_NONE type We don't actually use this in um_request_irq(), so it can never be assigned. It's also not clear what that would be useful for, so just remove it. This results in quite a number of cleanups, all the way to removing the "SIGIO on close" startup check, since the data it assigns (pty_close_sigio) is not used anymore. While at it, also make this an enum so we get a minimum of type checking, and remove the IRQ_NONE hack in virtio since we now no longer have the name twice. Acked-By: Anton Ivanov Signed-off-by: Johannes Berg Acked-By: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/drivers/random.c | 2 +- arch/um/drivers/virtio_uml.c | 5 ----- arch/um/include/shared/irq_kern.h | 7 ++++--- arch/um/include/shared/irq_user.h | 9 +++++---- arch/um/include/shared/os.h | 6 +++--- arch/um/kernel/irq.c | 15 +++++++-------- arch/um/os-Linux/irq.c | 2 +- arch/um/os-Linux/sigio.c | 25 +++++-------------------- 8 files changed, 26 insertions(+), 45 deletions(-) diff --git a/arch/um/drivers/random.c b/arch/um/drivers/random.c index bd3059adc2fb..433a3f8f2ef3 100644 --- a/arch/um/drivers/random.c +++ b/arch/um/drivers/random.c @@ -79,7 +79,7 @@ static int __init rng_init (void) if (err < 0) goto err_out_cleanup_hw; - sigio_broken(random_fd, 1); + sigio_broken(random_fd); hwrng.name = RNG_MODULE_NAME; hwrng.read = rng_dev_read; hwrng.quality = 1024; diff --git a/arch/um/drivers/virtio_uml.c b/arch/um/drivers/virtio_uml.c index 94b112749d5b..27e92d3881ff 100644 --- a/arch/um/drivers/virtio_uml.c +++ b/arch/um/drivers/virtio_uml.c @@ -33,11 +33,6 @@ #include #include "vhost_user.h" -/* Workaround due to a conflict between irq_user.h and irqreturn.h */ -#ifdef IRQ_NONE -#undef IRQ_NONE -#endif - #define MAX_SUPPORTED_QUEUE_SIZE 256 #define to_virtio_uml_device(_vdev) \ diff --git a/arch/um/include/shared/irq_kern.h b/arch/um/include/shared/irq_kern.h index 7c04a0fd3a27..7807de593bda 100644 --- a/arch/um/include/shared/irq_kern.h +++ b/arch/um/include/shared/irq_kern.h @@ -8,11 +8,12 @@ #include #include +#include "irq_user.h" #define UM_IRQ_ALLOC -1 -int um_request_irq(int irq, int fd, int type, irq_handler_t handler, - unsigned long irqflags, const char * devname, - void *dev_id); +int um_request_irq(int irq, int fd, enum um_irq_type type, + irq_handler_t handler, unsigned long irqflags, + const char *devname, void *dev_id); void um_free_irq(int irq, void *dev_id); #endif diff --git a/arch/um/include/shared/irq_user.h b/arch/um/include/shared/irq_user.h index 5e975a9e8354..07239e801a5b 100644 --- a/arch/um/include/shared/irq_user.h +++ b/arch/um/include/shared/irq_user.h @@ -9,10 +9,11 @@ #include #include -#define IRQ_READ 0 -#define IRQ_WRITE 1 -#define IRQ_NONE 2 -#define NUM_IRQ_TYPES (IRQ_NONE + 1) +enum um_irq_type { + IRQ_READ, + IRQ_WRITE, + NUM_IRQ_TYPES, +}; struct siginfo; extern void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs); diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index f467d28fc0b4..e2bb7e488d59 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -299,7 +299,7 @@ extern void reboot_skas(void); extern int os_waiting_for_events_epoll(void); extern void *os_epoll_get_data_pointer(int index); extern int os_epoll_triggered(int index, int events); -extern int os_event_mask(int irq_type); +extern int os_event_mask(enum um_irq_type irq_type); extern int os_setup_epoll(void); extern int os_add_epoll_fd(int events, int fd, void *data); extern int os_mod_epoll_fd(int events, int fd, void *data); @@ -310,8 +310,8 @@ extern void os_close_epoll_fd(void); /* sigio.c */ extern int add_sigio_fd(int fd); extern int ignore_sigio_fd(int fd); -extern void maybe_sigio_broken(int fd, int read); -extern void sigio_broken(int fd, int read); +extern void maybe_sigio_broken(int fd); +extern void sigio_broken(int fd); /* prctl.c */ extern int os_arch_prctl(int pid, int option, unsigned long *arg2); diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index 93eb742ecafe..9e8f776bb43a 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -32,7 +32,7 @@ extern void free_irqs(void); struct irq_reg { void *id; - int type; + enum um_irq_type type; int irq; int events; bool active; @@ -96,7 +96,7 @@ void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) } for (i = 0; i < n ; i++) { - /* Epoll back reference is the entry with 3 irq_reg + /* Epoll back reference is the entry with 2 irq_reg * leaves - one for each irq type. */ irq_entry = (struct irq_entry *) @@ -139,7 +139,7 @@ static int assign_epoll_events_to_irq(struct irq_entry *irq_entry) -static int activate_fd(int irq, int fd, int type, void *dev_id) +static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id) { struct irq_reg *new_fd; struct irq_entry *irq_entry; @@ -217,7 +217,7 @@ static int activate_fd(int irq, int fd, int type, void *dev_id) /* Turn back IO on with the correct (new) IO event mask */ assign_epoll_events_to_irq(irq_entry); spin_unlock_irqrestore(&irq_lock, flags); - maybe_sigio_broken(fd, (type != IRQ_NONE)); + maybe_sigio_broken(fd); return 0; out_unlock: @@ -444,10 +444,9 @@ void um_free_irq(int irq, void *dev) } EXPORT_SYMBOL(um_free_irq); -int um_request_irq(int irq, int fd, int type, - irq_handler_t handler, - unsigned long irqflags, const char * devname, - void *dev_id) +int um_request_irq(int irq, int fd, enum um_irq_type type, + irq_handler_t handler, unsigned long irqflags, + const char *devname, void *dev_id) { int err; diff --git a/arch/um/os-Linux/irq.c b/arch/um/os-Linux/irq.c index d508310ee5e1..aa90a05b3d78 100644 --- a/arch/um/os-Linux/irq.c +++ b/arch/um/os-Linux/irq.c @@ -45,7 +45,7 @@ int os_epoll_triggered(int index, int events) * access to the right includes/defines for EPOLL constants. */ -int os_event_mask(int irq_type) +int os_event_mask(enum um_irq_type irq_type) { if (irq_type == IRQ_READ) return EPOLLIN | EPOLLPRI; diff --git a/arch/um/os-Linux/sigio.c b/arch/um/os-Linux/sigio.c index f91fd16e9911..79cd6d6d6211 100644 --- a/arch/um/os-Linux/sigio.c +++ b/arch/um/os-Linux/sigio.c @@ -338,7 +338,7 @@ out_close1: close(l_write_sigio_fds[1]); } -void sigio_broken(int fd, int read) +void sigio_broken(int fd) { int err; @@ -354,7 +354,7 @@ void sigio_broken(int fd, int read) all_sigio_fds.poll[all_sigio_fds.used++] = ((struct pollfd) { .fd = fd, - .events = read ? POLLIN : POLLOUT, + .events = POLLIN, .revents = 0 }); out: sigio_unlock(); @@ -362,17 +362,16 @@ out: /* Changed during early boot */ static int pty_output_sigio; -static int pty_close_sigio; -void maybe_sigio_broken(int fd, int read) +void maybe_sigio_broken(int fd) { if (!isatty(fd)) return; - if ((read || pty_output_sigio) && (!read || pty_close_sigio)) + if (pty_output_sigio) return; - sigio_broken(fd, read); + sigio_broken(fd); } static void sigio_cleanup(void) @@ -516,19 +515,6 @@ static void tty_output(int master, int slave) printk(UM_KERN_CONT "tty_output : read failed, err = %d\n", n); } -static void tty_close(int master, int slave) -{ - printk(UM_KERN_INFO "Checking that host ptys support SIGIO on " - "close..."); - - close(slave); - if (got_sigio) { - printk(UM_KERN_CONT "Yes\n"); - pty_close_sigio = 1; - } else - printk(UM_KERN_CONT "No, enabling workaround\n"); -} - static void __init check_sigio(void) { if ((access("/dev/ptmx", R_OK) < 0) && @@ -538,7 +524,6 @@ static void __init check_sigio(void) return; } check_one_sigio(tty_output); - check_one_sigio(tty_close); } /* Here because it only does the SIGIO testing for now */ From 3032b94587c78c52173a9b8488d15528481ffcdb Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 2 Dec 2020 12:59:56 +0100 Subject: [PATCH 17/32] um: Simplify IRQ handling code Reduce dynamic allocations (and thereby cache misses) by simply embedding the registration data for IRQs in the irq_entry, we never supported these being really dynamic anyway as only one was ever allowed ("Trying to reregister ..."). Lockless behaviour is preserved by removing the FD from the poll set appropriately, but we use reg->events to indicate whether or not this entry is used, rather than dynamically allocating them. Also port the list of IRQ entries to list_head instead of the current open-coded singly-linked list implementation, just for sanity. Signed-off-by: Johannes Berg Acked-By: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/kernel/irq.c | 370 +++++++++++++++---------------------------- 1 file changed, 128 insertions(+), 242 deletions(-) diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index 9e8f776bb43a..482269580b79 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -29,26 +29,23 @@ extern void free_irqs(void); * This is why we keep a small irq_reg array for each fd - * one entry per IRQ type */ - struct irq_reg { void *id; - enum um_irq_type type; int irq; + /* it's cheaper to store this than to query it */ int events; bool active; bool pending; - bool purge; }; struct irq_entry { - struct irq_entry *next; + struct list_head list; int fd; - struct irq_reg *irq_array[NUM_IRQ_TYPES]; + struct irq_reg reg[NUM_IRQ_TYPES]; }; -static struct irq_entry *active_fds; - static DEFINE_SPINLOCK(irq_lock); +static LIST_HEAD(active_fds); static DECLARE_BITMAP(irqs_allocated, NR_IRQS); static void irq_io_loop(struct irq_reg *irq, struct uml_pt_regs *regs) @@ -61,12 +58,13 @@ static void irq_io_loop(struct irq_reg *irq, struct uml_pt_regs *regs) */ if (irq->active) { irq->active = false; + do { irq->pending = false; do_IRQ(irq->irq, regs); - } while (irq->pending && (!irq->purge)); - if (!irq->purge) - irq->active = true; + } while (irq->pending); + + irq->active = true; } else { irq->pending = true; } @@ -75,9 +73,7 @@ static void irq_io_loop(struct irq_reg *irq, struct uml_pt_regs *regs) void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) { struct irq_entry *irq_entry; - struct irq_reg *irq; - - int n, i, j; + int n, i; while (1) { /* This is now lockless - epoll keeps back-referencesto the irqs @@ -96,21 +92,18 @@ void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) } for (i = 0; i < n ; i++) { - /* Epoll back reference is the entry with 2 irq_reg - * leaves - one for each irq type. - */ - irq_entry = (struct irq_entry *) - os_epoll_get_data_pointer(i); - for (j = 0; j < NUM_IRQ_TYPES ; j++) { - irq = irq_entry->irq_array[j]; - if (irq == NULL) + enum um_irq_type t; + + irq_entry = os_epoll_get_data_pointer(i); + + for (t = 0; t < NUM_IRQ_TYPES; t++) { + int events = irq_entry->reg[t].events; + + if (!events) continue; - if (os_epoll_triggered(i, irq->events) > 0) - irq_io_loop(irq, regs); - if (irq->purge) { - irq_entry->irq_array[j] = NULL; - kfree(irq); - } + + if (os_epoll_triggered(i, events) > 0) + irq_io_loop(&irq_entry->reg[t], regs); } } } @@ -118,32 +111,59 @@ void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) free_irqs(); } -static int assign_epoll_events_to_irq(struct irq_entry *irq_entry) +static struct irq_entry *get_irq_entry_by_fd(int fd) { - int i; - int events = 0; - struct irq_reg *irq; + struct irq_entry *walk; - for (i = 0; i < NUM_IRQ_TYPES ; i++) { - irq = irq_entry->irq_array[i]; - if (irq != NULL) - events = irq->events | events; + lockdep_assert_held(&irq_lock); + + list_for_each_entry(walk, &active_fds, list) { + if (walk->fd == fd) + return walk; } - if (events > 0) { - /* os_add_epoll will call os_mod_epoll if this already exists */ - return os_add_epoll_fd(events, irq_entry->fd, irq_entry); - } - /* No events - delete */ - return os_del_epoll_fd(irq_entry->fd); + + return NULL; } +static void free_irq_entry(struct irq_entry *to_free, bool remove) +{ + if (!to_free) + return; + if (remove) + os_del_epoll_fd(to_free->fd); + list_del(&to_free->list); + kfree(to_free); +} + +static bool update_irq_entry(struct irq_entry *entry) +{ + enum um_irq_type i; + int events = 0; + + for (i = 0; i < NUM_IRQ_TYPES; i++) + events |= entry->reg[i].events; + + if (events) { + /* will modify (instead of add) if needed */ + os_add_epoll_fd(events, entry->fd, entry); + return true; + } + + os_del_epoll_fd(entry->fd); + return false; +} + +static void update_or_free_irq_entry(struct irq_entry *entry) +{ + if (!update_irq_entry(entry)) + free_irq_entry(entry, false); +} static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id) { - struct irq_reg *new_fd; struct irq_entry *irq_entry; - int i, err, events; + int err, events = os_event_mask(type); unsigned long flags; err = os_set_fd_async(fd); @@ -151,73 +171,34 @@ static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id) goto out; spin_lock_irqsave(&irq_lock, flags); + irq_entry = get_irq_entry_by_fd(fd); + if (irq_entry) { + /* cannot register the same FD twice with the same type */ + if (WARN_ON(irq_entry->reg[type].events)) { + err = -EALREADY; + goto out_unlock; + } - /* Check if we have an entry for this fd */ - - err = -EBUSY; - for (irq_entry = active_fds; - irq_entry != NULL; irq_entry = irq_entry->next) { - if (irq_entry->fd == fd) - break; - } - - if (irq_entry == NULL) { - /* This needs to be atomic as it may be called from an - * IRQ context. - */ - irq_entry = kmalloc(sizeof(struct irq_entry), GFP_ATOMIC); - if (irq_entry == NULL) { - printk(KERN_ERR - "Failed to allocate new IRQ entry\n"); + /* temporarily disable to avoid IRQ-side locking */ + os_del_epoll_fd(fd); + } else { + irq_entry = kzalloc(sizeof(*irq_entry), GFP_ATOMIC); + if (!irq_entry) { + err = -ENOMEM; goto out_unlock; } irq_entry->fd = fd; - for (i = 0; i < NUM_IRQ_TYPES; i++) - irq_entry->irq_array[i] = NULL; - irq_entry->next = active_fds; - active_fds = irq_entry; + list_add_tail(&irq_entry->list, &active_fds); + maybe_sigio_broken(fd); } - /* Check if we are trying to re-register an interrupt for a - * particular fd - */ + irq_entry->reg[type].id = dev_id; + irq_entry->reg[type].irq = irq; + irq_entry->reg[type].active = true; + irq_entry->reg[type].events = events; - if (irq_entry->irq_array[type] != NULL) { - printk(KERN_ERR - "Trying to reregister IRQ %d FD %d TYPE %d ID %p\n", - irq, fd, type, dev_id - ); - goto out_unlock; - } else { - /* New entry for this fd */ - - err = -ENOMEM; - new_fd = kmalloc(sizeof(struct irq_reg), GFP_ATOMIC); - if (new_fd == NULL) - goto out_unlock; - - events = os_event_mask(type); - - *new_fd = ((struct irq_reg) { - .id = dev_id, - .irq = irq, - .type = type, - .events = events, - .active = true, - .pending = false, - .purge = false - }); - /* Turn off any IO on this fd - allows us to - * avoid locking the IRQ loop - */ - os_del_epoll_fd(irq_entry->fd); - irq_entry->irq_array[type] = new_fd; - } - - /* Turn back IO on with the correct (new) IO event mask */ - assign_epoll_events_to_irq(irq_entry); + WARN_ON(!update_irq_entry(irq_entry)); spin_unlock_irqrestore(&irq_lock, flags); - maybe_sigio_broken(fd); return 0; out_unlock: @@ -227,104 +208,10 @@ out: } /* - * Walk the IRQ list and dispose of any unused entries. - * Should be done under irq_lock. + * Remove the entry or entries for a specific FD, if you + * don't want to remove all the possible entries then use + * um_free_irq() or deactivate_fd() instead. */ - -static void garbage_collect_irq_entries(void) -{ - int i; - bool reap; - struct irq_entry *walk; - struct irq_entry *previous = NULL; - struct irq_entry *to_free; - - if (active_fds == NULL) - return; - walk = active_fds; - while (walk != NULL) { - reap = true; - for (i = 0; i < NUM_IRQ_TYPES ; i++) { - if (walk->irq_array[i] != NULL) { - reap = false; - break; - } - } - if (reap) { - if (previous == NULL) - active_fds = walk->next; - else - previous->next = walk->next; - to_free = walk; - } else { - to_free = NULL; - } - walk = walk->next; - kfree(to_free); - } -} - -/* - * Walk the IRQ list and get the descriptor for our FD - */ - -static struct irq_entry *get_irq_entry_by_fd(int fd) -{ - struct irq_entry *walk = active_fds; - - while (walk != NULL) { - if (walk->fd == fd) - return walk; - walk = walk->next; - } - return NULL; -} - - -/* - * Walk the IRQ list and dispose of an entry for a specific - * device and number. Note - if sharing an IRQ for read - * and write for the same FD it will be disposed in either case. - * If this behaviour is undesirable use different IRQ ids. - */ - -#define IGNORE_IRQ 1 -#define IGNORE_DEV (1<<1) - -static void do_free_by_irq_and_dev( - struct irq_entry *irq_entry, - unsigned int irq, - void *dev, - int flags -) -{ - int i; - struct irq_reg *to_free; - - for (i = 0; i < NUM_IRQ_TYPES ; i++) { - if (irq_entry->irq_array[i] != NULL) { - if ( - ((flags & IGNORE_IRQ) || - (irq_entry->irq_array[i]->irq == irq)) && - ((flags & IGNORE_DEV) || - (irq_entry->irq_array[i]->id == dev)) - ) { - /* Turn off any IO on this fd - allows us to - * avoid locking the IRQ loop - */ - os_del_epoll_fd(irq_entry->fd); - to_free = irq_entry->irq_array[i]; - irq_entry->irq_array[i] = NULL; - assign_epoll_events_to_irq(irq_entry); - if (to_free->active) - to_free->purge = true; - else - kfree(to_free); - } - } - } -} - void free_irq_by_fd(int fd) { struct irq_entry *to_free; @@ -332,58 +219,64 @@ void free_irq_by_fd(int fd) spin_lock_irqsave(&irq_lock, flags); to_free = get_irq_entry_by_fd(fd); - if (to_free != NULL) { - do_free_by_irq_and_dev( - to_free, - -1, - NULL, - IGNORE_IRQ | IGNORE_DEV - ); - } - garbage_collect_irq_entries(); + free_irq_entry(to_free, true); spin_unlock_irqrestore(&irq_lock, flags); } EXPORT_SYMBOL(free_irq_by_fd); static void free_irq_by_irq_and_dev(unsigned int irq, void *dev) { - struct irq_entry *to_free; + struct irq_entry *entry; unsigned long flags; spin_lock_irqsave(&irq_lock, flags); - to_free = active_fds; - while (to_free != NULL) { - do_free_by_irq_and_dev( - to_free, - irq, - dev, - 0 - ); - to_free = to_free->next; + list_for_each_entry(entry, &active_fds, list) { + enum um_irq_type i; + + for (i = 0; i < NUM_IRQ_TYPES; i++) { + struct irq_reg *reg = &entry->reg[i]; + + if (!reg->events) + continue; + if (reg->irq != irq) + continue; + if (reg->id != dev) + continue; + + os_del_epoll_fd(entry->fd); + reg->events = 0; + update_or_free_irq_entry(entry); + goto out; + } } - garbage_collect_irq_entries(); +out: spin_unlock_irqrestore(&irq_lock, flags); } - void deactivate_fd(int fd, int irqnum) { - struct irq_entry *to_free; + struct irq_entry *entry; unsigned long flags; + enum um_irq_type i; os_del_epoll_fd(fd); + spin_lock_irqsave(&irq_lock, flags); - to_free = get_irq_entry_by_fd(fd); - if (to_free != NULL) { - do_free_by_irq_and_dev( - to_free, - irqnum, - NULL, - IGNORE_DEV - ); + entry = get_irq_entry_by_fd(fd); + if (!entry) + goto out; + + for (i = 0; i < NUM_IRQ_TYPES; i++) { + if (!entry->reg[i].events) + continue; + if (entry->reg[i].irq == irqnum) + entry->reg[i].events = 0; } - garbage_collect_irq_entries(); + + update_or_free_irq_entry(entry); +out: spin_unlock_irqrestore(&irq_lock, flags); + ignore_sigio_fd(fd); } EXPORT_SYMBOL(deactivate_fd); @@ -396,24 +289,17 @@ EXPORT_SYMBOL(deactivate_fd); */ int deactivate_all_fds(void) { - struct irq_entry *to_free; + struct irq_entry *entry; /* Stop IO. The IRQ loop has no lock so this is our * only way of making sure we are safe to dispose * of all IRQ handlers */ os_set_ioignore(); - to_free = active_fds; - while (to_free != NULL) { - do_free_by_irq_and_dev( - to_free, - -1, - NULL, - IGNORE_IRQ | IGNORE_DEV - ); - to_free = to_free->next; - } - /* don't garbage collect - we can no longer call kfree() here */ + + /* we can no longer call kfree() here so just deactivate */ + list_for_each_entry(entry, &active_fds, list) + os_del_epoll_fd(entry->fd); os_close_epoll_fd(); return 0; } From 49da38a3ef330b7a1643e12c51913d58158e5abe Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 2 Dec 2020 20:58:04 +0100 Subject: [PATCH 18/32] um: Simplify os_idle_sleep() and sleep longer There really is no reason to pass the amount of time we should sleep, especially since it's just hard-coded to one second. Additionally, one second isn't really all that long, and as we are expecting to be woken up by a signal, we can sleep longer and avoid doing some work every second, so replace the current clock_nanosleep() with just an empty select() that can _only_ be woken up by a signal. We can also remove the deliver_alarm() since we don't need to do that when we got e.g. SIGIO that woke us up, and if we got SIGALRM the signal handler will actually (have) run, so it's just unnecessary extra work. Similarly, in time-travel mode, just program the wakeup event from idle to be S64_MAX, which is basically the most you could ever simulate to. Of course, you should already have an event in the list that's earlier and will cause a wakeup, normally that's the regular timer interrupt, though in suspend it may (later) also be an RTC event. Since actually getting to this point would be a bug and you can't ever get out again, panic() on it in the time control code. Signed-off-by: Johannes Berg Acked-By: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/include/linux/time-internal.h | 4 ++-- arch/um/include/shared/os.h | 2 +- arch/um/kernel/process.c | 11 ++++------- arch/um/kernel/time.c | 12 ++++++++++-- arch/um/os-Linux/time.c | 17 ++++------------- 5 files changed, 21 insertions(+), 25 deletions(-) diff --git a/arch/um/include/linux/time-internal.h b/arch/um/include/linux/time-internal.h index f3b03d39a854..68e45e950137 100644 --- a/arch/um/include/linux/time-internal.h +++ b/arch/um/include/linux/time-internal.h @@ -28,7 +28,7 @@ struct time_travel_event { extern enum time_travel_mode time_travel_mode; -void time_travel_sleep(unsigned long long duration); +void time_travel_sleep(void); static inline void time_travel_set_event_fn(struct time_travel_event *e, @@ -60,7 +60,7 @@ struct time_travel_event { #define time_travel_mode TT_MODE_OFF -static inline void time_travel_sleep(unsigned long long duration) +static inline void time_travel_sleep(void) { } diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index e2bb7e488d59..0f7fb8bad728 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -256,7 +256,7 @@ extern void os_warn(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); /* time.c */ -extern void os_idle_sleep(unsigned long long nsecs); +extern void os_idle_sleep(void); extern int os_timer_create(void); extern int os_timer_set_interval(unsigned long long nsecs); extern int os_timer_one_shot(unsigned long long nsecs); diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 0fcdc374a9a1..a85c48ac2b27 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -205,13 +205,10 @@ void initial_thread_cb(void (*proc)(void *), void *arg) static void um_idle_sleep(void) { - unsigned long long duration = UM_NSEC_PER_SEC; - - if (time_travel_mode != TT_MODE_OFF) { - time_travel_sleep(duration); - } else { - os_idle_sleep(duration); - } + if (time_travel_mode != TT_MODE_OFF) + time_travel_sleep(); + else + os_idle_sleep(); } void arch_cpu_idle(void) diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index 8dafc3f2add4..8e8eb8ba04a4 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -46,6 +46,9 @@ static void time_travel_set_time(unsigned long long ns) if (unlikely(ns < time_travel_time)) panic("time-travel: time goes backwards %lld -> %lld\n", time_travel_time, ns); + else if (unlikely(ns >= S64_MAX)) + panic("The system was going to sleep forever, aborting"); + time_travel_time = ns; } @@ -399,9 +402,14 @@ static void time_travel_oneshot_timer(struct time_travel_event *e) deliver_alarm(); } -void time_travel_sleep(unsigned long long duration) +void time_travel_sleep(void) { - unsigned long long next = time_travel_time + duration; + /* + * Wait "forever" (using S64_MAX because there are some potential + * wrapping issues, especially with the current TT_MODE_EXTERNAL + * controller application. + */ + unsigned long long next = S64_MAX; if (time_travel_mode == TT_MODE_BASIC) os_timer_disable(); diff --git a/arch/um/os-Linux/time.c b/arch/um/os-Linux/time.c index 90f6de224c70..a61cbf73a179 100644 --- a/arch/um/os-Linux/time.c +++ b/arch/um/os-Linux/time.c @@ -7,6 +7,7 @@ */ #include +#include #include #include #include @@ -99,19 +100,9 @@ long long os_nsecs(void) } /** - * os_idle_sleep() - sleep for a given time of nsecs - * @nsecs: nanoseconds to sleep + * os_idle_sleep() - sleep until interrupted */ -void os_idle_sleep(unsigned long long nsecs) +void os_idle_sleep(void) { - struct timespec ts = { - .tv_sec = nsecs / UM_NSEC_PER_SEC, - .tv_nsec = nsecs % UM_NSEC_PER_SEC - }; - - /* - * Relay the signal if clock_nanosleep is interrupted. - */ - if (clock_nanosleep(CLOCK_MONOTONIC, 0, &ts, NULL)) - deliver_alarm(); + pause(); } From 2701c1bd91dda815b8541aa8c23e1e548cdb6349 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 2 Dec 2020 20:58:05 +0100 Subject: [PATCH 19/32] um: time: Fix read_persistent_clock64() in time-travel In time-travel mode, we've relied on read_persistent_clock64() being called only once at system startup, but this is both the right thing to call from the pseudo-RTC, and also gets called by the timekeeping core during suspend/resume. Thus, fix this to always fall make use of the time_travel_time in any time-travel mode, initializing time_travel_start at boot to the right value depending on the time-travel mode. Signed-off-by: Johannes Berg Acked-By: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/kernel/time.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index 8e8eb8ba04a4..80d33735cfd2 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -676,10 +676,8 @@ void read_persistent_clock64(struct timespec64 *ts) { long long nsecs; - if (time_travel_start_set) + if (time_travel_mode != TT_MODE_OFF) nsecs = time_travel_start + time_travel_time; - else if (time_travel_mode == TT_MODE_EXTERNAL) - nsecs = time_travel_ext_req(UM_TIMETRAVEL_GET_TOD, -1); else nsecs = os_persistent_clock_emulation(); @@ -689,6 +687,25 @@ void read_persistent_clock64(struct timespec64 *ts) void __init time_init(void) { +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT + switch (time_travel_mode) { + case TT_MODE_EXTERNAL: + time_travel_start = time_travel_ext_req(UM_TIMETRAVEL_GET_TOD, -1); + /* controller gave us the *current* time, so adjust by that */ + time_travel_ext_get_time(); + time_travel_start -= time_travel_time; + break; + case TT_MODE_INFCPU: + case TT_MODE_BASIC: + if (!time_travel_start_set) + time_travel_start = os_persistent_clock_emulation(); + break; + case TT_MODE_OFF: + /* we just read the host clock with os_persistent_clock_emulation() */ + break; + } +#endif + timer_set_signal_handler(); late_time_init = um_timer_setup; } From 92dcd3d31843fbe1a95d880dc912e1f6beac6632 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 2 Dec 2020 20:58:06 +0100 Subject: [PATCH 20/32] um: Allow PM with suspend-to-idle In order to be able to experiment with suspend in UML, add the minimal work to be able to suspend (s2idle) an instance of UML, and be able to wake it back up from that state with the USR1 signal sent to the main UML process. Signed-off-by: Johannes Berg Acked-By: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/Kconfig | 5 +++++ arch/um/include/shared/kern_util.h | 2 ++ arch/um/include/shared/os.h | 1 + arch/um/kernel/um_arch.c | 25 +++++++++++++++++++++++++ arch/um/os-Linux/signal.c | 14 +++++++++++++- 5 files changed, 46 insertions(+), 1 deletion(-) diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 4b799fad8b48..1c57599b82fa 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -192,3 +192,8 @@ config UML_TIME_TRAVEL_SUPPORT endmenu source "arch/um/drivers/Kconfig" + +config ARCH_SUSPEND_POSSIBLE + def_bool y + +source "kernel/power/Kconfig" diff --git a/arch/um/include/shared/kern_util.h b/arch/um/include/shared/kern_util.h index ccafb62e8cce..9c08e728a675 100644 --- a/arch/um/include/shared/kern_util.h +++ b/arch/um/include/shared/kern_util.h @@ -39,6 +39,8 @@ extern int is_syscall(unsigned long addr); extern void timer_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs); +extern void uml_pm_wake(void); + extern int start_uml(void); extern void paging_init(void); diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index 0f7fb8bad728..78250a05394a 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -241,6 +241,7 @@ extern int set_signals(int enable); extern int set_signals_trace(int enable); extern int os_is_signal_stack(void); extern void deliver_alarm(void); +extern void register_pm_wake_signal(void); /* util.c */ extern void stack_protections(unsigned long address); diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 76b37297b7d4..237a8d73a096 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -377,3 +378,27 @@ void *text_poke(void *addr, const void *opcode, size_t len) void text_poke_sync(void) { } + +#ifdef CONFIG_PM_SLEEP +void uml_pm_wake(void) +{ + pm_system_wakeup(); +} + +static int init_pm_wake_signal(void) +{ + /* + * In external time-travel mode we can't use signals to wake up + * since that would mess with the scheduling. We'll have to do + * some additional work to support wakeup on virtio devices or + * similar, perhaps implementing a fake RTC controller that can + * trigger wakeup (and request the appropriate scheduling from + * the external scheduler when going to suspend.) + */ + if (time_travel_mode != TT_MODE_EXTERNAL) + register_pm_wake_signal(); + return 0; +} + +late_initcall(init_pm_wake_signal); +#endif diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c index b58bc68cbe64..0a2ea84033b4 100644 --- a/arch/um/os-Linux/signal.c +++ b/arch/um/os-Linux/signal.c @@ -136,6 +136,16 @@ void set_sigstack(void *sig_stack, int size) panic("enabling signal stack failed, errno = %d\n", errno); } +static void sigusr1_handler(int sig, struct siginfo *unused_si, mcontext_t *mc) +{ + uml_pm_wake(); +} + +void register_pm_wake_signal(void) +{ + set_handler(SIGUSR1); +} + static void (*handlers[_NSIG])(int sig, struct siginfo *si, mcontext_t *mc) = { [SIGSEGV] = sig_handler, [SIGBUS] = sig_handler, @@ -145,7 +155,9 @@ static void (*handlers[_NSIG])(int sig, struct siginfo *si, mcontext_t *mc) = { [SIGIO] = sig_handler, [SIGWINCH] = sig_handler, - [SIGALRM] = timer_alarm_handler + [SIGALRM] = timer_alarm_handler, + + [SIGUSR1] = sigusr1_handler, }; static void hard_handler(int sig, siginfo_t *si, void *p) From a374b7cb1ea648a27ceaa2dea19aa967725e938b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 2 Dec 2020 20:58:07 +0100 Subject: [PATCH 21/32] um: Support suspend to RAM With all the previous bits in place, we can now also support suspend to RAM, in the sense that everything is suspended, not just most, including userspace, processes like in s2idle. Since um_idle_sleep() now waits forever, we can simply call that to "suspend" the system. As before, you can wake it up using SIGUSR1 since we're just in a pause() call that only needs to return. In order to implement selective resume from certain devices, and not have any arbitrary device interrupt wake up, suspend interrupts by removing SIGIO notification (O_ASYNC) from all the FDs that are not supposed to wake up the system. However, swap out the handler so we don't actually handle the SIGIO as an interrupt. Since we're in pause(), the mere act of receiving SIGIO wakes us up, and then after things have been restored enough, re-set O_ASYNC for all previously suspended FDs, reinstall the proper SIGIO handler, and send SIGIO to self to process anything that might now be pending. Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/include/shared/kern_util.h | 1 + arch/um/include/shared/os.h | 3 + arch/um/kernel/irq.c | 88 +++++++++++++++++++++++++++++- arch/um/kernel/process.c | 2 +- arch/um/kernel/um_arch.c | 42 ++++++++++++++ arch/um/os-Linux/signal.c | 5 ++ 6 files changed, 139 insertions(+), 2 deletions(-) diff --git a/arch/um/include/shared/kern_util.h b/arch/um/include/shared/kern_util.h index 9c08e728a675..2888ec812f6e 100644 --- a/arch/um/include/shared/kern_util.h +++ b/arch/um/include/shared/kern_util.h @@ -68,5 +68,6 @@ extern void bus_handler(int sig, struct siginfo *si, struct uml_pt_regs *regs); extern void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs); extern void fatal_sigsegv(void) __attribute__ ((noreturn)); +void um_idle_sleep(void); #endif diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index 78250a05394a..cd750d4edfb5 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -233,6 +233,7 @@ extern void timer_set_signal_handler(void); extern void set_sigstack(void *sig_stack, int size); extern void remove_sigstack(void); extern void set_handler(int sig); +extern void send_sigio_to_self(void); extern int change_sig(int signal, int on); extern void block_signals(void); extern void unblock_signals(void); @@ -307,6 +308,8 @@ extern int os_mod_epoll_fd(int events, int fd, void *data); extern int os_del_epoll_fd(int fd); extern void os_set_ioignore(void); extern void os_close_epoll_fd(void); +extern void um_irqs_suspend(void); +extern void um_irqs_resume(void); /* sigio.c */ extern int add_sigio_fd(int fd); diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index 482269580b79..ea43312cbfd3 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -20,6 +20,7 @@ #include #include #include +#include extern void free_irqs(void); @@ -36,12 +37,14 @@ struct irq_reg { int events; bool active; bool pending; + bool wakeup; }; struct irq_entry { struct list_head list; int fd; struct irq_reg reg[NUM_IRQ_TYPES]; + bool suspended; }; static DEFINE_SPINLOCK(irq_lock); @@ -70,6 +73,11 @@ static void irq_io_loop(struct irq_reg *irq, struct uml_pt_regs *regs) } } +void sigio_handler_suspend(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) +{ + /* nothing */ +} + void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) { struct irq_entry *irq_entry; @@ -365,9 +373,86 @@ error: clear_bit(irq, irqs_allocated); return err; } - EXPORT_SYMBOL(um_request_irq); +#ifdef CONFIG_PM_SLEEP +void um_irqs_suspend(void) +{ + struct irq_entry *entry; + unsigned long flags; + + sig_info[SIGIO] = sigio_handler_suspend; + + spin_lock_irqsave(&irq_lock, flags); + list_for_each_entry(entry, &active_fds, list) { + enum um_irq_type t; + bool wake = false; + + for (t = 0; t < NUM_IRQ_TYPES; t++) { + if (!entry->reg[t].events) + continue; + + if (entry->reg[t].wakeup) { + wake = true; + break; + } + } + + if (!wake) { + entry->suspended = true; + os_clear_fd_async(entry->fd); + } + } + spin_unlock_irqrestore(&irq_lock, flags); +} + +void um_irqs_resume(void) +{ + struct irq_entry *entry; + unsigned long flags; + + spin_lock_irqsave(&irq_lock, flags); + list_for_each_entry(entry, &active_fds, list) { + if (entry->suspended) { + int err = os_set_fd_async(entry->fd); + + WARN(err < 0, "os_set_fd_async returned %d\n", err); + entry->suspended = false; + } + } + spin_unlock_irqrestore(&irq_lock, flags); + + sig_info[SIGIO] = sigio_handler; + send_sigio_to_self(); +} + +static int normal_irq_set_wake(struct irq_data *d, unsigned int on) +{ + struct irq_entry *entry; + unsigned long flags; + + spin_lock_irqsave(&irq_lock, flags); + list_for_each_entry(entry, &active_fds, list) { + enum um_irq_type t; + + for (t = 0; t < NUM_IRQ_TYPES; t++) { + if (!entry->reg[t].events) + continue; + + if (entry->reg[t].irq != d->irq) + continue; + entry->reg[t].wakeup = on; + goto unlock; + } + } +unlock: + spin_unlock_irqrestore(&irq_lock, flags); + return 0; +} +#else +#define normal_irq_set_wake NULL +#endif + /* * irq_chip must define at least enable/disable and ack when * the edge handler is used. @@ -384,6 +469,7 @@ static struct irq_chip normal_irq_type = { .irq_ack = dummy, .irq_mask = dummy, .irq_unmask = dummy, + .irq_set_wake = normal_irq_set_wake, }; static struct irq_chip alarm_irq_type = { diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index a85c48ac2b27..81d508daf67c 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -203,7 +203,7 @@ void initial_thread_cb(void (*proc)(void *), void *arg) kmalloc_ok = save_kmalloc_ok; } -static void um_idle_sleep(void) +void um_idle_sleep(void) { if (time_travel_mode != TT_MODE_OFF) time_travel_sleep(); diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 237a8d73a096..9c7e6d7ea1b3 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -385,6 +385,45 @@ void uml_pm_wake(void) pm_system_wakeup(); } +static int um_suspend_valid(suspend_state_t state) +{ + return state == PM_SUSPEND_MEM; +} + +static int um_suspend_prepare(void) +{ + um_irqs_suspend(); + return 0; +} + +static int um_suspend_enter(suspend_state_t state) +{ + if (WARN_ON(state != PM_SUSPEND_MEM)) + return -EINVAL; + + /* + * This is identical to the idle sleep, but we've just + * (during suspend) turned off all interrupt sources + * except for the ones we want, so now we can only wake + * up on something we actually want to wake up on. All + * timing has also been suspended. + */ + um_idle_sleep(); + return 0; +} + +static void um_suspend_finish(void) +{ + um_irqs_resume(); +} + +const struct platform_suspend_ops um_suspend_ops = { + .valid = um_suspend_valid, + .prepare = um_suspend_prepare, + .enter = um_suspend_enter, + .finish = um_suspend_finish, +}; + static int init_pm_wake_signal(void) { /* @@ -397,6 +436,9 @@ static int init_pm_wake_signal(void) */ if (time_travel_mode != TT_MODE_EXTERNAL) register_pm_wake_signal(); + + suspend_set_ops(&um_suspend_ops); + return 0; } diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c index 0a2ea84033b4..510e956b4320 100644 --- a/arch/um/os-Linux/signal.c +++ b/arch/um/os-Linux/signal.c @@ -234,6 +234,11 @@ void set_handler(int sig) panic("sigprocmask failed - errno = %d\n", errno); } +void send_sigio_to_self(void) +{ + kill(os_getpid(), SIGIO); +} + int change_sig(int signal, int on) { sigset_t sigset; From 3c6ac61bc91ea39031f020c973a91db0aee10fde Mon Sep 17 00:00:00 2001 From: Anton Ivanov Date: Tue, 10 Nov 2020 13:02:21 +0000 Subject: [PATCH 22/32] um: Fetch registers only for signals which need them UML userspace fetches siginfo and passes it to signal handlers in UML. This is needed only for some of the signals, because key handlers like SIGIO make no use of this variable. Signed-off-by: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/os-Linux/skas/process.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c index 4fb877b99dde..0621d521208e 100644 --- a/arch/um/os-Linux/skas/process.c +++ b/arch/um/os-Linux/skas/process.c @@ -400,7 +400,20 @@ void userspace(struct uml_pt_regs *regs, unsigned long *aux_fp_regs) if (WIFSTOPPED(status)) { int sig = WSTOPSIG(status); - ptrace(PTRACE_GETSIGINFO, pid, 0, (struct siginfo *)&si); + /* These signal handlers need the si argument. + * The SIGIO and SIGALARM handlers which constitute the + * majority of invocations, do not use it. + */ + switch (sig) { + case SIGSEGV: + case SIGTRAP: + case SIGILL: + case SIGBUS: + case SIGFPE: + case SIGWINCH: + ptrace(PTRACE_GETSIGINFO, pid, 0, (struct siginfo *)&si); + break; + } switch (sig) { case SIGSEGV: From 58b09f68697066dfde948153c82dd5d85e10f127 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 4 Dec 2020 12:34:34 +0100 Subject: [PATCH 23/32] um: time-travel: avoid multiple identical propagations If there is some kind of interrupt negotation or such then it may happen that we send an update message multiple times, avoid that in the interest of efficiency by storing the last transmitted value and only sending a new update if it's not the same as the last update. Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/kernel/time.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index 80d33735cfd2..204ddb141b01 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -190,7 +190,13 @@ static void time_travel_ext_update_request(unsigned long long time) void __time_travel_propagate_time(void) { + static unsigned long long last_propagated; + + if (last_propagated == time_travel_time) + return; + time_travel_ext_req(UM_TIMETRAVEL_UPDATE, time_travel_time); + last_propagated = time_travel_time; } EXPORT_SYMBOL_GPL(__time_travel_propagate_time); From 963285b0b47a1b8e1dfa5481717855a7057ccec6 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 5 Dec 2020 21:50:17 +0100 Subject: [PATCH 24/32] um: support some of ARCH_HAS_SET_MEMORY For now, only support set_memory_ro()/rw() which we need for the stack protection in the next patch. Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/Kconfig | 1 + arch/um/include/asm/pgtable.h | 3 ++ arch/um/include/asm/set_memory.h | 1 + arch/um/kernel/tlb.c | 54 ++++++++++++++++++++++++++++++++ 4 files changed, 59 insertions(+) create mode 100644 arch/um/include/asm/set_memory.h diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 1c57599b82fa..70ee19cc6ec6 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -15,6 +15,7 @@ config UML select HAVE_DEBUG_KMEMLEAK select HAVE_DEBUG_BUGVERBOSE select NO_DMA + select ARCH_HAS_SET_MEMORY select GENERIC_IRQ_SHOW select GENERIC_CPU_DEVICES select GENERIC_CLOCKEVENTS diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h index def376194dce..39376bb63abf 100644 --- a/arch/um/include/asm/pgtable.h +++ b/arch/um/include/asm/pgtable.h @@ -55,12 +55,15 @@ extern unsigned long end_iomem; #define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY) #define __PAGE_KERNEL_EXEC \ (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED) +#define __PAGE_KERNEL_RO \ + (_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED) #define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) #define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED) #define PAGE_COPY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) #define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) #define PAGE_KERNEL __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED) #define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) +#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) /* * The i386 can't do page protection for execute, and considers that the same diff --git a/arch/um/include/asm/set_memory.h b/arch/um/include/asm/set_memory.h new file mode 100644 index 000000000000..24266c63720d --- /dev/null +++ b/arch/um/include/asm/set_memory.h @@ -0,0 +1 @@ +#include diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c index 61776790cd67..437d1f1cc5ec 100644 --- a/arch/um/kernel/tlb.c +++ b/arch/um/kernel/tlb.c @@ -608,3 +608,57 @@ void force_flush_all(void) vma = vma->vm_next; } } + +struct page_change_data { + unsigned int set_mask, clear_mask; +}; + +static int change_page_range(pte_t *ptep, unsigned long addr, void *data) +{ + struct page_change_data *cdata = data; + pte_t pte = READ_ONCE(*ptep); + + pte_clear_bits(pte, cdata->clear_mask); + pte_set_bits(pte, cdata->set_mask); + + set_pte(ptep, pte); + return 0; +} + +static int change_memory(unsigned long start, unsigned long pages, + unsigned int set_mask, unsigned int clear_mask) +{ + unsigned long size = pages * PAGE_SIZE; + struct page_change_data data; + int ret; + + data.set_mask = set_mask; + data.clear_mask = clear_mask; + + ret = apply_to_page_range(&init_mm, start, size, change_page_range, + &data); + + flush_tlb_kernel_range(start, start + size); + + return ret; +} + +int set_memory_ro(unsigned long addr, int numpages) +{ + return change_memory(addr, numpages, 0, _PAGE_RW); +} + +int set_memory_rw(unsigned long addr, int numpages) +{ + return change_memory(addr, numpages, _PAGE_RW, 0); +} + +int set_memory_nx(unsigned long addr, int numpages) +{ + return -EOPNOTSUPP; +} + +int set_memory_x(unsigned long addr, int numpages) +{ + return -EOPNOTSUPP; +} From ef4459a6da0955b533ebfc97a7d756ac090f50c9 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 5 Dec 2020 21:50:18 +0100 Subject: [PATCH 25/32] um: allocate a guard page to helper threads We've been running into stack overflows in helper threads corrupting memory (e.g. because somebody put printf() or os_info() there), so to avoid those causing hard-to-debug issues later on, allocate a guard page for helper thread stacks and mark it read-only. Unfortunately, the crash dump at that point is useless as the stack tracer will try to backtrace the *kernel* thread, not the helper thread, but at least we don't survive to a random issue caused by corruption. Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/drivers/ubd_kern.c | 2 +- arch/um/include/shared/kern_util.h | 2 +- arch/um/kernel/process.c | 11 +++++++---- arch/um/os-Linux/helper.c | 4 ++-- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 7e07b321e847..13b1fe694b90 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -1241,7 +1241,7 @@ static int __init ubd_driver_init(void){ /* Letting ubd=sync be like using ubd#s= instead of ubd#= is * enough. So use anyway the io thread. */ } - stack = alloc_stack(0, 0); + stack = alloc_stack(0); io_pid = start_io_thread(stack + PAGE_SIZE - sizeof(void *), &thread_fd); if(io_pid < 0){ diff --git a/arch/um/include/shared/kern_util.h b/arch/um/include/shared/kern_util.h index 2888ec812f6e..d8c279e3312f 100644 --- a/arch/um/include/shared/kern_util.h +++ b/arch/um/include/shared/kern_util.h @@ -19,7 +19,7 @@ extern int kmalloc_ok; #define UML_ROUND_UP(addr) \ ((((unsigned long) addr) + PAGE_SIZE - 1) & PAGE_MASK) -extern unsigned long alloc_stack(int order, int atomic); +extern unsigned long alloc_stack(int atomic); extern void free_stack(unsigned long stack, int order); struct pt_regs; diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 81d508daf67c..2a986ece5478 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -32,6 +32,7 @@ #include #include #include +#include /* * This is a per-cpu array. A processor only modifies its entry and it only @@ -62,16 +63,18 @@ void free_stack(unsigned long stack, int order) free_pages(stack, order); } -unsigned long alloc_stack(int order, int atomic) +unsigned long alloc_stack(int atomic) { - unsigned long page; + unsigned long addr; gfp_t flags = GFP_KERNEL; if (atomic) flags = GFP_ATOMIC; - page = __get_free_pages(flags, order); + addr = __get_free_pages(flags, 1); - return page; + set_memory_ro(addr, 1); + + return addr + PAGE_SIZE; } static inline void set_current(struct task_struct *task) diff --git a/arch/um/os-Linux/helper.c b/arch/um/os-Linux/helper.c index 9fa6e4187d4f..feb48d796e00 100644 --- a/arch/um/os-Linux/helper.c +++ b/arch/um/os-Linux/helper.c @@ -45,7 +45,7 @@ int run_helper(void (*pre_exec)(void *), void *pre_data, char **argv) unsigned long stack, sp; int pid, fds[2], ret, n; - stack = alloc_stack(0, __cant_sleep()); + stack = alloc_stack(__cant_sleep()); if (stack == 0) return -ENOMEM; @@ -116,7 +116,7 @@ int run_helper_thread(int (*proc)(void *), void *arg, unsigned int flags, unsigned long stack, sp; int pid, status, err; - stack = alloc_stack(0, __cant_sleep()); + stack = alloc_stack(__cant_sleep()); if (stack == 0) return -ENOMEM; From e3a01cbee9c5f2c6fc813dd6af007716e60257e7 Mon Sep 17 00:00:00 2001 From: Anton Ivanov Date: Mon, 7 Dec 2020 17:19:38 +0000 Subject: [PATCH 26/32] um: Monitor error events in IRQ controller Ensure that file closes, connection closes, etc are propagated as interrupts in the interrupt controller. Fixes: ff6a17989c08 ("Epoll based IRQ controller") Signed-off-by: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/os-Linux/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/um/os-Linux/irq.c b/arch/um/os-Linux/irq.c index aa90a05b3d78..98ea910ef87c 100644 --- a/arch/um/os-Linux/irq.c +++ b/arch/um/os-Linux/irq.c @@ -48,7 +48,7 @@ int os_epoll_triggered(int index, int events) int os_event_mask(enum um_irq_type irq_type) { if (irq_type == IRQ_READ) - return EPOLLIN | EPOLLPRI; + return EPOLLIN | EPOLLPRI | EPOLLERR | EPOLLHUP | EPOLLRDHUP; if (irq_type == IRQ_WRITE) return EPOLLOUT; return 0; From 9b1c0c0e25dcccafd30e7d4c150c249cc65550eb Mon Sep 17 00:00:00 2001 From: Anton Ivanov Date: Mon, 7 Dec 2020 17:19:39 +0000 Subject: [PATCH 27/32] um: tty: Fix handling of close in tty lines Fix a logical error in tty reading. We get 0 and errno == EAGAIN on the first attempt to read from a closed file descriptor. Compared to that a true EAGAIN is EAGAIN and -1. If we check errno for EAGAIN first, before checking the return value we miss the fact that the descriptor is closed. This bug is as old as the driver. It was not showing up with the original POLL based IRQ controller, because it was producing multiple events. Switching to EPOLL unmasked it. Fixes: ff6a17989c08 ("Epoll based IRQ controller") Signed-off-by: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/drivers/chan_user.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/um/drivers/chan_user.c b/arch/um/drivers/chan_user.c index 4d80526a4236..d8845d4aac6a 100644 --- a/arch/um/drivers/chan_user.c +++ b/arch/um/drivers/chan_user.c @@ -26,10 +26,10 @@ int generic_read(int fd, char *c_out, void *unused) n = read(fd, c_out, sizeof(*c_out)); if (n > 0) return n; - else if (errno == EAGAIN) - return 0; else if (n == 0) return -EIO; + else if (errno == EAGAIN) + return 0; return -errno; } From 9431f7c199ab0d02da1482d62255e0b4621cb1b5 Mon Sep 17 00:00:00 2001 From: Anton Ivanov Date: Mon, 7 Dec 2020 17:19:40 +0000 Subject: [PATCH 28/32] um: chan_xterm: Fix fd leak xterm serial channel was leaking a fd used in setting up the port helper This bug is prehistoric - it predates switching to git. The "fixes" header here is really just to mark all the versions we would like this to apply to which is "Anything from the Cretaceous period onwards". No dinosaurs were harmed in fixing this bug. Fixes: b40997b872cd ("um: drivers/xterm.c: fix a file descriptor leak") Signed-off-by: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/drivers/xterm.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/um/drivers/xterm.c b/arch/um/drivers/xterm.c index fc7f1e746703..87ca4a47cd66 100644 --- a/arch/um/drivers/xterm.c +++ b/arch/um/drivers/xterm.c @@ -18,6 +18,7 @@ struct xterm_chan { int pid; int helper_pid; + int chan_fd; char *title; int device; int raw; @@ -33,6 +34,7 @@ static void *xterm_init(char *str, int device, const struct chan_opts *opts) return NULL; *data = ((struct xterm_chan) { .pid = -1, .helper_pid = -1, + .chan_fd = -1, .device = device, .title = opts->xterm_title, .raw = opts->raw } ); @@ -149,6 +151,7 @@ static int xterm_open(int input, int output, int primary, void *d, goto out_kill; } + data->chan_fd = fd; new = xterm_fd(fd, &data->helper_pid); if (new < 0) { err = new; @@ -206,6 +209,8 @@ static void xterm_close(int fd, void *d) os_kill_process(data->helper_pid, 0); data->helper_pid = -1; + if (data->chan_fd != -1) + os_close_file(data->chan_fd); os_close_file(fd); } From 452f94cecff692a76eaaa9330fca03fe0f204f6f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 11 Dec 2020 09:01:14 +0100 Subject: [PATCH 29/32] um: time-travel: Actually apply "free-until" optimisation Due a bug - we never checked the time_travel_ext_free_until value - we were always requesting time for every single scheduling. This adds up since we make reading time cost 256ns, and it's a fairly common call. Fix this. While at it, also make reading time only cost something when we're not currently waiting for our scheduling turn - otherwise things get mixed up in a very confusing way. We should never get here, since we're not actually running, but it's possible if you stick printk() or such into the virtio code that must handle the external interrupts. Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/kernel/time.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index 204ddb141b01..e48282308126 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -183,6 +183,14 @@ static void time_travel_ext_update_request(unsigned long long time) time == time_travel_ext_prev_request) return; + /* + * if we're running and are allowed to run past the request + * then we don't need to update it either + */ + if (!time_travel_ext_waiting && time_travel_ext_free_until_valid && + time < time_travel_ext_free_until) + return; + time_travel_ext_prev_request = time; time_travel_ext_prev_request_valid = true; time_travel_ext_req(UM_TIMETRAVEL_REQUEST, time); @@ -223,6 +231,7 @@ static void time_travel_ext_wait(bool idle) }; time_travel_ext_prev_request_valid = false; + time_travel_ext_free_until_valid = false; time_travel_ext_waiting++; time_travel_ext_req(UM_TIMETRAVEL_WAIT, -1); @@ -492,6 +501,7 @@ invalid_number: #define time_travel_start_set 0 #define time_travel_start 0 #define time_travel_time 0 +#define time_travel_ext_waiting 0 static inline void time_travel_update_time(unsigned long long ns, bool retearly) { @@ -637,7 +647,8 @@ static u64 timer_read(struct clocksource *cs) * "what do I do next" and onstack event we use to know when * to return from time_travel_update_time(). */ - if (!irqs_disabled() && !in_interrupt() && !in_softirq()) + if (!irqs_disabled() && !in_interrupt() && !in_softirq() && + !time_travel_ext_waiting) time_travel_update_time(time_travel_time + TIMER_MULTIPLIER, false); From cae20ba0a16cdb2c6d218ea3519bb0942f287b69 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 11 Dec 2020 10:56:07 +0100 Subject: [PATCH 30/32] um: irq/sigio: Support suspend/resume handling of workaround IRQs If the sigio workaround needed to be applied to a file descriptor, set_irq_wake() wouldn't work for it since it would get polled by the thread instead of causing SIGIO, and thus could never really cause a wakeup, since the thread notification FD wasn't marked as being able to wake up the system. Fix this by marking the thread's notification FD explicitly as a wake source FD, i.e. not suppressing SIGIO for it in suspend. In order to not cause spurious wakeups, we then need to remove all FDs that shouldn't wake up the system from the polling thread. In order to do this, add unlocked versions of ignore_sigio_fd() and add_sigio_fd() (nothing else is happening in suspend, so this is fine), and also modify ignore_sigio_fd() to return -ENOENT if the FD wasn't originally in there. This doesn't matter because nothing else currently checks the return value, but the irq code needs to know which ones to restore the workaround for. All told, this lets us use a timerfd for the RTC clock in the next patch, which doesn't send SIGIO. Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/include/shared/os.h | 8 +++++++ arch/um/kernel/irq.c | 17 +++++++++++++- arch/um/os-Linux/sigio.c | 47 +++++++++++++++++++++++++------------ 3 files changed, 56 insertions(+), 16 deletions(-) diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index cd750d4edfb5..bc25c960f7e0 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -316,6 +316,14 @@ extern int add_sigio_fd(int fd); extern int ignore_sigio_fd(int fd); extern void maybe_sigio_broken(int fd); extern void sigio_broken(int fd); +/* + * unlocked versions for IRQ controller code. + * + * This is safe because it's used at suspend/resume and nothing + * else is running. + */ +extern int __add_sigio_fd(int fd); +extern int __ignore_sigio_fd(int fd); /* prctl.c */ extern int os_arch_prctl(int pid, int option, unsigned long *arg2); diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index ea43312cbfd3..3741d2380060 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -45,6 +45,7 @@ struct irq_entry { int fd; struct irq_reg reg[NUM_IRQ_TYPES]; bool suspended; + bool sigio_workaround; }; static DEFINE_SPINLOCK(irq_lock); @@ -392,7 +393,14 @@ void um_irqs_suspend(void) if (!entry->reg[t].events) continue; - if (entry->reg[t].wakeup) { + /* + * For the SIGIO_WRITE_IRQ, which is used to handle the + * SIGIO workaround thread, we need special handling: + * enable wake for it itself, but below we tell it about + * any FDs that should be suspended. + */ + if (entry->reg[t].wakeup || + entry->reg[t].irq == SIGIO_WRITE_IRQ) { wake = true; break; } @@ -401,6 +409,8 @@ void um_irqs_suspend(void) if (!wake) { entry->suspended = true; os_clear_fd_async(entry->fd); + entry->sigio_workaround = + !__ignore_sigio_fd(entry->fd); } } spin_unlock_irqrestore(&irq_lock, flags); @@ -418,6 +428,11 @@ void um_irqs_resume(void) WARN(err < 0, "os_set_fd_async returned %d\n", err); entry->suspended = false; + + if (entry->sigio_workaround) { + err = __add_sigio_fd(entry->fd); + WARN(err < 0, "add_sigio_returned %d\n", err); + } } } spin_unlock_irqrestore(&irq_lock, flags); diff --git a/arch/um/os-Linux/sigio.c b/arch/um/os-Linux/sigio.c index 79cd6d6d6211..6597ea1986ff 100644 --- a/arch/um/os-Linux/sigio.c +++ b/arch/um/os-Linux/sigio.c @@ -164,47 +164,55 @@ static void update_thread(void) set_signals_trace(flags); } -int add_sigio_fd(int fd) +int __add_sigio_fd(int fd) { struct pollfd *p; int err, i, n; - sigio_lock(); for (i = 0; i < all_sigio_fds.used; i++) { if (all_sigio_fds.poll[i].fd == fd) break; } - if (i == all_sigio_fds.used) { - err = -ENOSPC; - goto out; - } + if (i == all_sigio_fds.used) + return -ENOSPC; p = &all_sigio_fds.poll[i]; for (i = 0; i < current_poll.used; i++) { if (current_poll.poll[i].fd == fd) - goto out; + return 0; } n = current_poll.used; err = need_poll(&next_poll, n + 1); if (err) - goto out; + return err; memcpy(next_poll.poll, current_poll.poll, current_poll.used * sizeof(struct pollfd)); next_poll.poll[n] = *p; next_poll.used = n + 1; update_thread(); - out: + + return 0; +} + + +int add_sigio_fd(int fd) +{ + int err; + + sigio_lock(); + err = __add_sigio_fd(fd); sigio_unlock(); + return err; } -int ignore_sigio_fd(int fd) +int __ignore_sigio_fd(int fd) { struct pollfd *p; - int err = 0, i, n = 0; + int err, i, n = 0; /* * This is called from exitcalls elsewhere in UML - if @@ -214,17 +222,16 @@ int ignore_sigio_fd(int fd) if (write_sigio_pid == -1) return -EIO; - sigio_lock(); for (i = 0; i < current_poll.used; i++) { if (current_poll.poll[i].fd == fd) break; } if (i == current_poll.used) - goto out; + return -ENOENT; err = need_poll(&next_poll, current_poll.used - 1); if (err) - goto out; + return err; for (i = 0; i < current_poll.used; i++) { p = ¤t_poll.poll[i]; @@ -234,8 +241,18 @@ int ignore_sigio_fd(int fd) next_poll.used = current_poll.used - 1; update_thread(); - out: + + return 0; +} + +int ignore_sigio_fd(int fd) +{ + int err; + + sigio_lock(); + err = __ignore_sigio_fd(fd); sigio_unlock(); + return err; } From 11385539c024b6071dce538123a2043a8f52c9a1 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sun, 13 Dec 2020 22:18:18 +0100 Subject: [PATCH 31/32] um: time-travel: Correct time event IRQ delivery Lockdep (on 5.10-rc) points out that we're delivering IRQs while IRQs are not even enabled, which clearly shouldn't happen. Defer the time event IRQ delivery until they actually are enabled. Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/include/shared/common-offsets.h | 3 ++ arch/um/include/shared/os.h | 3 ++ arch/um/kernel/time.c | 38 +++++++++++++++++++++++++ arch/um/os-Linux/signal.c | 3 ++ 4 files changed, 47 insertions(+) diff --git a/arch/um/include/shared/common-offsets.h b/arch/um/include/shared/common-offsets.h index 4e99fe05576a..16a51a8c800f 100644 --- a/arch/um/include/shared/common-offsets.h +++ b/arch/um/include/shared/common-offsets.h @@ -40,3 +40,6 @@ DEFINE(UML_CONFIG_UML_X86, CONFIG_UML_X86); #ifdef CONFIG_64BIT DEFINE(UML_CONFIG_64BIT, CONFIG_64BIT); #endif +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT +DEFINE(UML_CONFIG_UML_TIME_TRAVEL_SUPPORT, CONFIG_UML_TIME_TRAVEL_SUPPORT); +#endif diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index bc25c960f7e0..13d86f94cf0f 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -342,4 +342,7 @@ extern void unblock_signals_trace(void); extern void um_trace_signals_on(void); extern void um_trace_signals_off(void); +/* time-travel */ +extern void deliver_time_travel_irqs(void); + #endif diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index e48282308126..f4db89b5b5a6 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -31,6 +31,7 @@ static bool time_travel_start_set; static unsigned long long time_travel_start; static unsigned long long time_travel_time; static LIST_HEAD(time_travel_events); +static LIST_HEAD(time_travel_irqs); static unsigned long long time_travel_timer_interval; static unsigned long long time_travel_next_event; static struct time_travel_event time_travel_timer_event; @@ -324,6 +325,35 @@ void time_travel_periodic_timer(struct time_travel_event *e) deliver_alarm(); } +void deliver_time_travel_irqs(void) +{ + struct time_travel_event *e; + unsigned long flags; + + /* + * Don't do anything for most cases. Note that because here we have + * to disable IRQs (and re-enable later) we'll actually recurse at + * the end of the function, so this is strictly necessary. + */ + if (likely(list_empty(&time_travel_irqs))) + return; + + local_irq_save(flags); + irq_enter(); + while ((e = list_first_entry_or_null(&time_travel_irqs, + struct time_travel_event, + list))) { + WARN(e->time != time_travel_time, + "time moved from %lld to %lld before IRQ delivery\n", + time_travel_time, e->time); + list_del(&e->list); + e->pending = false; + e->fn(e); + } + irq_exit(); + local_irq_restore(flags); +} + static void time_travel_deliver_event(struct time_travel_event *e) { if (e == &time_travel_timer_event) { @@ -332,6 +362,14 @@ static void time_travel_deliver_event(struct time_travel_event *e) * by itself, so must handle it specially here */ e->fn(e); + } else if (irqs_disabled()) { + list_add_tail(&e->list, &time_travel_irqs); + /* + * set pending again, it was set to false when the + * event was deleted from the original list, but + * now it's still pending until we deliver the IRQ. + */ + e->pending = true; } else { unsigned long flags; diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c index 510e956b4320..96f511d1aabe 100644 --- a/arch/um/os-Linux/signal.c +++ b/arch/um/os-Linux/signal.c @@ -271,6 +271,9 @@ void unblock_signals(void) return; signals_enabled = 1; +#ifdef UML_CONFIG_UML_TIME_TRAVEL_SUPPORT + deliver_time_travel_irqs(); +#endif /* * We loop because the IRQ handler returns with interrupts off. So, From 1fb1abc83636f5329c26cd29f0f19f3faeb697a5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 14 Dec 2020 20:51:02 +0100 Subject: [PATCH 32/32] um: Fix build w/o CONFIG_PM_SLEEP uml_pm_wake() is unconditionally called from the SIGUSR1 wakeup handler since that's in the userspace portion of UML, and thus a bit tricky to ifdef out. Since pm_system_wakeup() can always be called (but may be an empty inline), also simply always have uml_pm_wake() to fix the build. Reported-by: Randy Dunlap Acked-by: Randy Dunlap # build-tested Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/kernel/um_arch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 9c7e6d7ea1b3..31d356b1ffd8 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -379,12 +379,12 @@ void text_poke_sync(void) { } -#ifdef CONFIG_PM_SLEEP void uml_pm_wake(void) { pm_system_wakeup(); } +#ifdef CONFIG_PM_SLEEP static int um_suspend_valid(suspend_state_t state) { return state == PM_SUSPEND_MEM;