diff --git a/Documentation/cdrom/cdrom-standard.rst b/Documentation/cdrom/cdrom-standard.rst index 52ea7b6b2fe8..7964fe134277 100644 --- a/Documentation/cdrom/cdrom-standard.rst +++ b/Documentation/cdrom/cdrom-standard.rst @@ -218,7 +218,6 @@ current *struct* is:: int (*tray_move)(struct cdrom_device_info *, int); int (*lock_door)(struct cdrom_device_info *, int); int (*select_speed)(struct cdrom_device_info *, int); - int (*select_disc)(struct cdrom_device_info *, int); int (*get_last_session) (struct cdrom_device_info *, struct cdrom_multisession *); int (*get_mcn)(struct cdrom_device_info *, struct cdrom_mcn *); @@ -419,15 +418,6 @@ this `auto-selection` capability, the decision should be made on the current disc loaded and the return value should be positive. A negative return value indicates an error. -:: - - int select_disc(struct cdrom_device_info *cdi, int number) - -If the drive can store multiple discs (a juke-box) this function -will perform disc selection. It should return the number of the -selected disc on success, a negative value on error. Currently, only -the ide-cd driver supports this functionality. - :: int get_last_session(struct cdrom_device_info *cdi, diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 061744c436d9..6a0dd99786f9 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -1183,85 +1183,7 @@ Provides counts of softirq handlers serviced since boot time, for each CPU. HRTIMER: 0 0 0 0 RCU: 1678 1769 2178 2250 - -1.3 IDE devices in /proc/ide ----------------------------- - -The subdirectory /proc/ide contains information about all IDE devices of which -the kernel is aware. There is one subdirectory for each IDE controller, the -file drivers and a link for each IDE device, pointing to the device directory -in the controller specific subtree. - -The file 'drivers' contains general information about the drivers used for the -IDE devices:: - - > cat /proc/ide/drivers - ide-cdrom version 4.53 - ide-disk version 1.08 - -More detailed information can be found in the controller specific -subdirectories. These are named ide0, ide1 and so on. Each of these -directories contains the files shown in table 1-6. - - -.. table:: Table 1-6: IDE controller info in /proc/ide/ide? - - ======= ======================================= - File Content - ======= ======================================= - channel IDE channel (0 or 1) - config Configuration (only for PCI/IDE bridge) - mate Mate name - model Type/Chipset of IDE controller - ======= ======================================= - -Each device connected to a controller has a separate subdirectory in the -controllers directory. The files listed in table 1-7 are contained in these -directories. - - -.. table:: Table 1-7: IDE device information - - ================ ========================================== - File Content - ================ ========================================== - cache The cache - capacity Capacity of the medium (in 512Byte blocks) - driver driver and version - geometry physical and logical geometry - identify device identify block - media media type - model device identifier - settings device setup - smart_thresholds IDE disk management thresholds - smart_values IDE disk management values - ================ ========================================== - -The most interesting file is ``settings``. This file contains a nice -overview of the drive parameters:: - - # cat /proc/ide/ide0/hda/settings - name value min max mode - ---- ----- --- --- ---- - bios_cyl 526 0 65535 rw - bios_head 255 0 255 rw - bios_sect 63 0 63 rw - breada_readahead 4 0 127 rw - bswap 0 0 1 r - file_readahead 72 0 2097151 rw - io_32bit 0 0 3 rw - keepsettings 0 0 1 rw - max_kb_per_request 122 1 127 rw - multcount 0 0 8 rw - nice1 1 0 1 rw - nowerr 0 0 1 rw - pio_mode write-only 0 255 w - slow 0 0 1 rw - unmaskirq 0 0 1 rw - using_dma 0 0 1 rw - - -1.4 Networking info in /proc/net +1.3 Networking info in /proc/net -------------------------------- The subdirectory /proc/net follows the usual pattern. Table 1-8 shows the @@ -1340,7 +1262,7 @@ It will contain information that is specific to that bond, such as the current slaves of the bond, the link status of the slaves, and how many times the slaves link has failed. -1.5 SCSI info +1.4 SCSI info ------------- If you have a SCSI host adapter in your system, you'll find a subdirectory @@ -1403,7 +1325,7 @@ AHA-2940 SCSI adapter:: Total transfers 0 (0 reads and 0 writes) -1.6 Parallel port info in /proc/parport +1.5 Parallel port info in /proc/parport --------------------------------------- The directory /proc/parport contains information about the parallel ports of @@ -1428,7 +1350,7 @@ These directories contain the four files shown in Table 1-10. number or none). ========= ==================================================================== -1.7 TTY info in /proc/tty +1.6 TTY info in /proc/tty ------------------------- Information about the available and actually used tty's can be found in the @@ -1463,7 +1385,7 @@ To see which tty's are currently in use, you can simply look into the file unknown /dev/tty 4 1-63 console -1.8 Miscellaneous kernel statistics in /proc/stat +1.7 Miscellaneous kernel statistics in /proc/stat ------------------------------------------------- Various pieces of information about kernel activity are available in the @@ -1536,7 +1458,7 @@ softirqs serviced; each subsequent column is the total for that particular softirq. -1.9 Ext4 file system parameters +1.8 Ext4 file system parameters ------------------------------- Information about mounted ext4 file systems can be found in @@ -1552,7 +1474,7 @@ in Table 1-12, below. mb_groups details of multiblock allocator buddy cache of free blocks ============== ========================================================== -1.10 /proc/consoles +1.9 /proc/consoles ------------------- Shows registered system console lines. diff --git a/Documentation/userspace-api/ioctl/cdrom.rst b/Documentation/userspace-api/ioctl/cdrom.rst index 682948fc88a3..2ad91dbebd7c 100644 --- a/Documentation/userspace-api/ioctl/cdrom.rst +++ b/Documentation/userspace-api/ioctl/cdrom.rst @@ -718,6 +718,9 @@ CDROMPLAYBLK CDROMGETSPINDOWN + Obsolete, was ide-cd only + + usage:: char spindown; @@ -736,6 +739,9 @@ CDROMGETSPINDOWN CDROMSETSPINDOWN + Obsolete, was ide-cd only + + usage:: char spindown diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index b03269faef71..c4344b67628d 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -483,7 +483,6 @@ static void ubd_handler(void) if ((io_req->error == BLK_STS_NOTSUPP) && (req_op(io_req->req) == REQ_OP_DISCARD)) { blk_queue_max_discard_sectors(io_req->req->q, 0); blk_queue_max_write_zeroes_sectors(io_req->req->q, 0); - blk_queue_flag_clear(QUEUE_FLAG_DISCARD, io_req->req->q); } blk_mq_end_request(io_req->req, io_req->error); kfree(io_req); @@ -800,10 +799,8 @@ static int ubd_open_dev(struct ubd *ubd_dev) } if (ubd_dev->no_trim == 0) { ubd_dev->queue->limits.discard_granularity = SECTOR_SIZE; - ubd_dev->queue->limits.discard_alignment = SECTOR_SIZE; blk_queue_max_discard_sectors(ubd_dev->queue, UBD_MAX_REQUEST); blk_queue_max_write_zeroes_sectors(ubd_dev->queue, UBD_MAX_REQUEST); - blk_queue_flag_set(QUEUE_FLAG_DISCARD, ubd_dev->queue); } blk_queue_flag_set(QUEUE_FLAG_NONROT, ubd_dev->queue); return 0; diff --git a/block/Makefile b/block/Makefile index 3950ecbc5c26..4e01bb71ad6e 100644 --- a/block/Makefile +++ b/block/Makefile @@ -16,6 +16,7 @@ obj-$(CONFIG_BLK_DEV_BSG_COMMON) += bsg.o obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o obj-$(CONFIG_BLK_CGROUP_RWSTAT) += blk-cgroup-rwstat.o +obj-$(CONFIG_BLK_CGROUP_FC_APPID) += blk-cgroup-fc-appid.o obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o obj-$(CONFIG_BLK_CGROUP_IOPRIO) += blk-ioprio.o obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o diff --git a/block/badblocks.c b/block/badblocks.c index d39056630d9c..3afb550c0f7b 100644 --- a/block/badblocks.c +++ b/block/badblocks.c @@ -65,7 +65,6 @@ int badblocks_check(struct badblocks *bb, sector_t s, int sectors, s >>= bb->shift; target += (1<shift) - 1; target >>= bb->shift; - sectors = target - s; } /* 'target' is now the first block after the bad range */ @@ -345,7 +344,6 @@ int badblocks_clear(struct badblocks *bb, sector_t s, int sectors) s += (1<shift) - 1; s >>= bb->shift; target >>= bb->shift; - sectors = target - s; } write_seqlock_irq(&bb->lock); diff --git a/block/bdev.c b/block/bdev.c index 13de871fa816..7bf88e591aaf 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -673,17 +673,17 @@ static int blkdev_get_whole(struct block_device *bdev, fmode_t mode) } } - if (!bdev->bd_openers) + if (!atomic_read(&bdev->bd_openers)) set_init_blocksize(bdev); if (test_bit(GD_NEED_PART_SCAN, &disk->state)) bdev_disk_changed(disk, false); - bdev->bd_openers++; + atomic_inc(&bdev->bd_openers); return 0; } static void blkdev_put_whole(struct block_device *bdev, fmode_t mode) { - if (!--bdev->bd_openers) + if (atomic_dec_and_test(&bdev->bd_openers)) blkdev_flush_mapping(bdev); if (bdev->bd_disk->fops->release) bdev->bd_disk->fops->release(bdev->bd_disk, mode); @@ -694,7 +694,7 @@ static int blkdev_get_part(struct block_device *part, fmode_t mode) struct gendisk *disk = part->bd_disk; int ret; - if (part->bd_openers) + if (atomic_read(&part->bd_openers)) goto done; ret = blkdev_get_whole(bdev_whole(part), mode); @@ -708,7 +708,7 @@ static int blkdev_get_part(struct block_device *part, fmode_t mode) disk->open_partitions++; set_init_blocksize(part); done: - part->bd_openers++; + atomic_inc(&part->bd_openers); return 0; out_blkdev_put: @@ -720,7 +720,7 @@ static void blkdev_put_part(struct block_device *part, fmode_t mode) { struct block_device *whole = bdev_whole(part); - if (--part->bd_openers) + if (!atomic_dec_and_test(&part->bd_openers)) return; blkdev_flush_mapping(part); whole->bd_disk->open_partitions--; @@ -899,7 +899,7 @@ void blkdev_put(struct block_device *bdev, fmode_t mode) * of the world and we want to avoid long (could be several minute) * syncs while holding the mutex. */ - if (bdev->bd_openers == 1) + if (atomic_read(&bdev->bd_openers) == 1) sync_blockdev(bdev); mutex_lock(&disk->open_mutex); @@ -1044,7 +1044,7 @@ void sync_bdevs(bool wait) bdev = I_BDEV(inode); mutex_lock(&bdev->bd_disk->open_mutex); - if (!bdev->bd_openers) { + if (!atomic_read(&bdev->bd_openers)) { ; /* skip */ } else if (wait) { /* diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 420eda2589c0..09574af83566 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -557,6 +557,7 @@ static void bfq_pd_init(struct blkg_policy_data *pd) */ bfqg->bfqd = bfqd; bfqg->active_entities = 0; + bfqg->online = true; bfqg->rq_pos_tree = RB_ROOT; } @@ -585,28 +586,11 @@ static void bfq_group_set_parent(struct bfq_group *bfqg, entity->sched_data = &parent->sched_data; } -static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd, - struct blkcg *blkcg) +static void bfq_link_bfqg(struct bfq_data *bfqd, struct bfq_group *bfqg) { - struct blkcg_gq *blkg; - - blkg = blkg_lookup(blkcg, bfqd->queue); - if (likely(blkg)) - return blkg_to_bfqg(blkg); - return NULL; -} - -struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, - struct blkcg *blkcg) -{ - struct bfq_group *bfqg, *parent; + struct bfq_group *parent; struct bfq_entity *entity; - bfqg = bfq_lookup_bfqg(bfqd, blkcg); - - if (unlikely(!bfqg)) - return NULL; - /* * Update chain of bfq_groups as we might be handling a leaf group * which, along with some of its relatives, has not been hooked yet @@ -623,8 +607,24 @@ struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, bfq_group_set_parent(curr_bfqg, parent); } } +} - return bfqg; +struct bfq_group *bfq_bio_bfqg(struct bfq_data *bfqd, struct bio *bio) +{ + struct blkcg_gq *blkg = bio->bi_blkg; + struct bfq_group *bfqg; + + while (blkg) { + bfqg = blkg_to_bfqg(blkg); + if (bfqg->online) { + bio_associate_blkg_from_css(bio, &blkg->blkcg->css); + return bfqg; + } + blkg = blkg->parent; + } + bio_associate_blkg_from_css(bio, + &bfqg_to_blkg(bfqd->root_group)->blkcg->css); + return bfqd->root_group; } /** @@ -714,25 +714,15 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, * Move bic to blkcg, assuming that bfqd->lock is held; which makes * sure that the reference to cgroup is valid across the call (see * comments in bfq_bic_update_cgroup on this issue) - * - * NOTE: an alternative approach might have been to store the current - * cgroup in bfqq and getting a reference to it, reducing the lookup - * time here, at the price of slightly more complex code. */ -static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, - struct bfq_io_cq *bic, - struct blkcg *blkcg) +static void *__bfq_bic_change_cgroup(struct bfq_data *bfqd, + struct bfq_io_cq *bic, + struct bfq_group *bfqg) { struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0); struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1); - struct bfq_group *bfqg; struct bfq_entity *entity; - bfqg = bfq_find_set_group(bfqd, blkcg); - - if (unlikely(!bfqg)) - bfqg = bfqd->root_group; - if (async_bfqq) { entity = &async_bfqq->entity; @@ -743,9 +733,39 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, } if (sync_bfqq) { - entity = &sync_bfqq->entity; - if (entity->sched_data != &bfqg->sched_data) - bfq_bfqq_move(bfqd, sync_bfqq, bfqg); + if (!sync_bfqq->new_bfqq && !bfq_bfqq_coop(sync_bfqq)) { + /* We are the only user of this bfqq, just move it */ + if (sync_bfqq->entity.sched_data != &bfqg->sched_data) + bfq_bfqq_move(bfqd, sync_bfqq, bfqg); + } else { + struct bfq_queue *bfqq; + + /* + * The queue was merged to a different queue. Check + * that the merge chain still belongs to the same + * cgroup. + */ + for (bfqq = sync_bfqq; bfqq; bfqq = bfqq->new_bfqq) + if (bfqq->entity.sched_data != + &bfqg->sched_data) + break; + if (bfqq) { + /* + * Some queue changed cgroup so the merge is + * not valid anymore. We cannot easily just + * cancel the merge (by clearing new_bfqq) as + * there may be other processes using this + * queue and holding refs to all queues below + * sync_bfqq->new_bfqq. Similarly if the merge + * already happened, we need to detach from + * bfqq now so that we cannot merge bio to a + * request from the old cgroup. + */ + bfq_put_cooperator(sync_bfqq); + bfq_release_process_ref(bfqd, sync_bfqq); + bic_set_bfqq(bic, NULL, 1); + } + } } return bfqg; @@ -754,20 +774,24 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) { struct bfq_data *bfqd = bic_to_bfqd(bic); - struct bfq_group *bfqg = NULL; + struct bfq_group *bfqg = bfq_bio_bfqg(bfqd, bio); uint64_t serial_nr; - rcu_read_lock(); - serial_nr = __bio_blkcg(bio)->css.serial_nr; + serial_nr = bfqg_to_blkg(bfqg)->blkcg->css.serial_nr; /* * Check whether blkcg has changed. The condition may trigger * spuriously on a newly created cic but there's no harm. */ if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) - goto out; + return; - bfqg = __bfq_bic_change_cgroup(bfqd, bic, __bio_blkcg(bio)); + /* + * New cgroup for this process. Make sure it is linked to bfq internal + * cgroup hierarchy. + */ + bfq_link_bfqg(bfqd, bfqg); + __bfq_bic_change_cgroup(bfqd, bic, bfqg); /* * Update blkg_path for bfq_log_* functions. We cache this * path, and update it here, for the following @@ -820,8 +844,6 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) */ blkg_path(bfqg_to_blkg(bfqg), bfqg->blkg_path, sizeof(bfqg->blkg_path)); bic->blkcg_serial_nr = serial_nr; -out: - rcu_read_unlock(); } /** @@ -949,6 +971,7 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) put_async_queues: bfq_put_async_queues(bfqd, bfqg); + bfqg->online = false; spin_unlock_irqrestore(&bfqd->lock, flags); /* @@ -1438,7 +1461,7 @@ void bfq_end_wr_async(struct bfq_data *bfqd) bfq_end_wr_async_queues(bfqd, bfqd->root_group); } -struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, struct blkcg *blkcg) +struct bfq_group *bfq_bio_bfqg(struct bfq_data *bfqd, struct bio *bio) { return bfqd->root_group; } diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 1f62dbdc521f..0d46cb728bbf 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -374,7 +374,7 @@ static const unsigned long bfq_activation_stable_merging = 600; */ static const unsigned long bfq_late_stable_merging = 600; -#define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0]) +#define RQ_BIC(rq) ((struct bfq_io_cq *)((rq)->elv.priv[0])) #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) @@ -456,6 +456,8 @@ static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q) */ void bfq_schedule_dispatch(struct bfq_data *bfqd) { + lockdep_assert_held(&bfqd->lock); + if (bfqd->queued != 0) { bfq_log(bfqd, "schedule dispatch"); blk_mq_run_hw_queues(bfqd->queue, true); @@ -2133,9 +2135,7 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (!bfqd->last_completed_rq_bfqq || bfqd->last_completed_rq_bfqq == bfqq || bfq_bfqq_has_short_ttime(bfqq) || - bfqq->dispatched > 0 || - now_ns - bfqd->last_completion >= 4 * NSEC_PER_MSEC || - bfqd->last_completed_rq_bfqq == bfqq->waker_bfqq) + now_ns - bfqd->last_completion >= 4 * NSEC_PER_MSEC) return; /* @@ -2208,9 +2208,13 @@ static void bfq_add_request(struct request *rq) bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); bfqq->queued[rq_is_sync(rq)]++; - bfqd->queued++; + /* + * Updating of 'bfqd->queued' is protected by 'bfqd->lock', however, it + * may be read without holding the lock in bfq_has_work(). + */ + WRITE_ONCE(bfqd->queued, bfqd->queued + 1); - if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_sync(bfqq)) { + if (bfq_bfqq_sync(bfqq) && RQ_BIC(rq)->requests <= 1) { bfq_check_waker(bfqd, bfqq, now_ns); /* @@ -2400,7 +2404,11 @@ static void bfq_remove_request(struct request_queue *q, if (rq->queuelist.prev != &rq->queuelist) list_del_init(&rq->queuelist); bfqq->queued[sync]--; - bfqd->queued--; + /* + * Updating of 'bfqd->queued' is protected by 'bfqd->lock', however, it + * may be read without holding the lock in bfq_has_work(). + */ + WRITE_ONCE(bfqd->queued, bfqd->queued - 1); elv_rb_del(&bfqq->sort_list, rq); elv_rqhash_del(q, rq); @@ -2463,10 +2471,17 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, spin_lock_irq(&bfqd->lock); - if (bic) + if (bic) { + /* + * Make sure cgroup info is uptodate for current process before + * considering the merge. + */ + bfq_bic_update_cgroup(bic, bio); + bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); - else + } else { bfqd->bio_bfqq = NULL; + } bfqd->bio_bic = bic; ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); @@ -2496,8 +2511,6 @@ static int bfq_request_merge(struct request_queue *q, struct request **req, return ELEVATOR_NO_MERGE; } -static struct bfq_queue *bfq_init_rq(struct request *rq); - static void bfq_request_merged(struct request_queue *q, struct request *req, enum elv_merge type) { @@ -2506,7 +2519,7 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, blk_rq_pos(req) < blk_rq_pos(container_of(rb_prev(&req->rb_node), struct request, rb_node))) { - struct bfq_queue *bfqq = bfq_init_rq(req); + struct bfq_queue *bfqq = RQ_BFQQ(req); struct bfq_data *bfqd; struct request *prev, *next_rq; @@ -2558,8 +2571,8 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, static void bfq_requests_merged(struct request_queue *q, struct request *rq, struct request *next) { - struct bfq_queue *bfqq = bfq_init_rq(rq), - *next_bfqq = bfq_init_rq(next); + struct bfq_queue *bfqq = RQ_BFQQ(rq), + *next_bfqq = RQ_BFQQ(next); if (!bfqq) goto remove; @@ -2764,6 +2777,14 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) if (process_refs == 0 || new_process_refs == 0) return NULL; + /* + * Make sure merged queues belong to the same parent. Parents could + * have changed since the time we decided the two queues are suitable + * for merging. + */ + if (new_bfqq->entity.parent != bfqq->entity.parent) + return NULL; + bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", new_bfqq->pid); @@ -2901,9 +2922,12 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_queue *new_bfqq = bfq_setup_merge(bfqq, stable_merge_bfqq); - bic->stably_merged = true; - if (new_bfqq && new_bfqq->bic) - new_bfqq->bic->stably_merged = true; + if (new_bfqq) { + bic->stably_merged = true; + if (new_bfqq->bic) + new_bfqq->bic->stably_merged = + true; + } return new_bfqq; } else return NULL; @@ -5045,11 +5069,11 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx) struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; /* - * Avoiding lock: a race on bfqd->busy_queues should cause at + * Avoiding lock: a race on bfqd->queued should cause at * most a call to dispatch for nothing */ return !list_empty_careful(&bfqd->dispatch) || - bfq_tot_busy_queues(bfqd) > 0; + READ_ONCE(bfqd->queued); } static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) @@ -5310,7 +5334,7 @@ static void bfq_put_stable_ref(struct bfq_queue *bfqq) bfq_put_queue(bfqq); } -static void bfq_put_cooperator(struct bfq_queue *bfqq) +void bfq_put_cooperator(struct bfq_queue *bfqq) { struct bfq_queue *__bfqq, *next; @@ -5716,14 +5740,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, struct bfq_queue *bfqq; struct bfq_group *bfqg; - rcu_read_lock(); - - bfqg = bfq_find_set_group(bfqd, __bio_blkcg(bio)); - if (!bfqg) { - bfqq = &bfqd->oom_bfqq; - goto out; - } - + bfqg = bfq_bio_bfqg(bfqd, bio); if (!is_sync) { async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, ioprio); @@ -5769,8 +5786,6 @@ out: if (bfqq != &bfqd->oom_bfqq && is_sync && !respawn) bfqq = bfq_do_or_sched_stable_merge(bfqd, bfqq, bic); - - rcu_read_unlock(); return bfqq; } @@ -6117,6 +6132,8 @@ static inline void bfq_update_insert_stats(struct request_queue *q, unsigned int cmd_flags) {} #endif /* CONFIG_BFQ_CGROUP_DEBUG */ +static struct bfq_queue *bfq_init_rq(struct request *rq); + static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, bool at_head) { @@ -6132,18 +6149,15 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, bfqg_stats_update_legacy_io(q, rq); #endif spin_lock_irq(&bfqd->lock); + bfqq = bfq_init_rq(rq); if (blk_mq_sched_try_insert_merge(q, rq, &free)) { spin_unlock_irq(&bfqd->lock); blk_mq_free_requests(&free); return; } - spin_unlock_irq(&bfqd->lock); - trace_block_rq_insert(rq); - spin_lock_irq(&bfqd->lock); - bfqq = bfq_init_rq(rq); if (!bfqq || at_head) { if (at_head) list_add(&rq->queuelist, &bfqd->dispatch); @@ -6360,12 +6374,6 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) bfq_schedule_dispatch(bfqd); } -static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq) -{ - bfqq_request_freed(bfqq); - bfq_put_queue(bfqq); -} - /* * The processes associated with bfqq may happen to generate their * cumulative I/O at a lower rate than the rate at which the device @@ -6562,7 +6570,9 @@ static void bfq_finish_requeue_request(struct request *rq) bfq_completed_request(bfqq, bfqd); } - bfq_finish_requeue_request_body(bfqq); + bfqq_request_freed(bfqq); + bfq_put_queue(bfqq); + RQ_BIC(rq)->requests--; spin_unlock_irqrestore(&bfqd->lock, flags); /* @@ -6796,6 +6806,7 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) bfqq_request_allocated(bfqq); bfqq->ref++; + bic->requests++; bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d", rq, bfqq, bfqq->ref); @@ -6892,8 +6903,8 @@ bfq_idle_slice_timer_body(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_bfqq_expire(bfqd, bfqq, true, reason); schedule_dispatch: - spin_unlock_irqrestore(&bfqd->lock, flags); bfq_schedule_dispatch(bfqd); + spin_unlock_irqrestore(&bfqd->lock, flags); } /* diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 3b83e3d1c2e5..ca8177d7bf7c 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -468,6 +468,7 @@ struct bfq_io_cq { struct bfq_queue *stable_merge_bfqq; bool stably_merged; /* non splittable if true */ + unsigned int requests; /* Number of requests this process has in flight */ }; /** @@ -928,6 +929,8 @@ struct bfq_group { /* reference counter (see comments in bfq_bic_update_cgroup) */ int ref; + /* Is bfq_group still online? */ + bool online; struct bfq_entity entity; struct bfq_sched_data sched_data; @@ -979,6 +982,7 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd, void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, bool compensate, enum bfqq_expiration reason); void bfq_put_queue(struct bfq_queue *bfqq); +void bfq_put_cooperator(struct bfq_queue *bfqq); void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq); void bfq_schedule_dispatch(struct bfq_data *bfqd); @@ -1006,8 +1010,7 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg); void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio); void bfq_end_wr_async(struct bfq_data *bfqd); -struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, - struct blkcg *blkcg); +struct bfq_group *bfq_bio_bfqg(struct bfq_data *bfqd, struct bio *bio); struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); struct bfq_group *bfqq_group(struct bfq_queue *bfqq); struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node); @@ -1100,13 +1103,13 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq); break; \ bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH); \ blk_add_cgroup_trace_msg((bfqd)->queue, \ - bfqg_to_blkg(bfqq_group(bfqq))->blkcg, \ + &bfqg_to_blkg(bfqq_group(bfqq))->blkcg->css, \ "%s " fmt, pid_str, ##args); \ } while (0) #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ blk_add_cgroup_trace_msg((bfqd)->queue, \ - bfqg_to_blkg(bfqg)->blkcg, fmt, ##args); \ + &bfqg_to_blkg(bfqg)->blkcg->css, fmt, ##args); \ } while (0) #else /* CONFIG_BFQ_GROUP_IOSCHED */ diff --git a/block/bio.c b/block/bio.c index 5561a24784bb..3470d1a57c2e 100644 --- a/block/bio.c +++ b/block/bio.c @@ -224,24 +224,13 @@ EXPORT_SYMBOL(bio_uninit); static void bio_free(struct bio *bio) { struct bio_set *bs = bio->bi_pool; - void *p; + void *p = bio; + + WARN_ON_ONCE(!bs); bio_uninit(bio); - - if (bs) { - bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs); - - /* - * If we have front padding, adjust the bio pointer before freeing - */ - p = bio; - p -= bs->front_pad; - - mempool_free(p, &bs->bio_pool); - } else { - /* Bio was allocated by bio_kmalloc() */ - kfree(bio); - } + bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs); + mempool_free(p - bs->front_pad, &bs->bio_pool); } /* @@ -422,6 +411,28 @@ static void punt_bios_to_rescuer(struct bio_set *bs) queue_work(bs->rescue_workqueue, &bs->rescue_work); } +static struct bio *bio_alloc_percpu_cache(struct block_device *bdev, + unsigned short nr_vecs, unsigned int opf, gfp_t gfp, + struct bio_set *bs) +{ + struct bio_alloc_cache *cache; + struct bio *bio; + + cache = per_cpu_ptr(bs->cache, get_cpu()); + if (!cache->free_list) { + put_cpu(); + return NULL; + } + bio = cache->free_list; + cache->free_list = bio->bi_next; + cache->nr--; + put_cpu(); + + bio_init(bio, bdev, nr_vecs ? bio->bi_inline_vecs : NULL, nr_vecs, opf); + bio->bi_pool = bs; + return bio; +} + /** * bio_alloc_bioset - allocate a bio for I/O * @bdev: block device to allocate the bio for (can be %NULL) @@ -454,6 +465,9 @@ static void punt_bios_to_rescuer(struct bio_set *bs) * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad * for per bio allocations. * + * If REQ_ALLOC_CACHE is set, the final put of the bio MUST be done from process + * context, not hard/soft IRQ. + * * Returns: Pointer to new bio on success, NULL on failure. */ struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, @@ -468,6 +482,21 @@ struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_vecs > 0)) return NULL; + if (opf & REQ_ALLOC_CACHE) { + if (bs->cache && nr_vecs <= BIO_INLINE_VECS) { + bio = bio_alloc_percpu_cache(bdev, nr_vecs, opf, + gfp_mask, bs); + if (bio) + return bio; + /* + * No cached bio available, bio returned below marked with + * REQ_ALLOC_CACHE to particpate in per-cpu alloc cache. + */ + } else { + opf &= ~REQ_ALLOC_CACHE; + } + } + /* * submit_bio_noacct() converts recursion to iteration; this means if * we're running beneath it, any bios we allocate and submit will not be @@ -531,28 +560,28 @@ err_free: EXPORT_SYMBOL(bio_alloc_bioset); /** - * bio_kmalloc - kmalloc a bio for I/O + * bio_kmalloc - kmalloc a bio + * @nr_vecs: number of bio_vecs to allocate * @gfp_mask: the GFP_* mask given to the slab allocator - * @nr_iovecs: number of iovecs to pre-allocate * - * Use kmalloc to allocate and initialize a bio. + * Use kmalloc to allocate a bio (including bvecs). The bio must be initialized + * using bio_init() before use. To free a bio returned from this function use + * kfree() after calling bio_uninit(). A bio returned from this function can + * be reused by calling bio_uninit() before calling bio_init() again. + * + * Note that unlike bio_alloc() or bio_alloc_bioset() allocations from this + * function are not backed by a mempool can can fail. Do not use this function + * for allocations in the file system I/O path. * * Returns: Pointer to new bio on success, NULL on failure. */ -struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs) +struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask) { struct bio *bio; - if (nr_iovecs > UIO_MAXIOV) + if (nr_vecs > UIO_MAXIOV) return NULL; - - bio = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask); - if (unlikely(!bio)) - return NULL; - bio_init(bio, NULL, nr_iovecs ? bio->bi_inline_vecs : NULL, nr_iovecs, - 0); - bio->bi_pool = NULL; - return bio; + return kmalloc(struct_size(bio, bi_inline_vecs, nr_vecs), gfp_mask); } EXPORT_SYMBOL(bio_kmalloc); @@ -714,7 +743,7 @@ void bio_put(struct bio *bio) return; } - if (bio_flagged(bio, BIO_PERCPU_CACHE)) { + if (bio->bi_opf & REQ_ALLOC_CACHE) { struct bio_alloc_cache *cache; bio_uninit(bio); @@ -735,14 +764,15 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp) bio_set_flag(bio, BIO_CLONED); if (bio_flagged(bio_src, BIO_THROTTLED)) bio_set_flag(bio, BIO_THROTTLED); - if (bio->bi_bdev == bio_src->bi_bdev && - bio_flagged(bio_src, BIO_REMAPPED)) - bio_set_flag(bio, BIO_REMAPPED); bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_iter = bio_src->bi_iter; - bio_clone_blkg_association(bio, bio_src); - blkcg_bio_issue_init(bio); + if (bio->bi_bdev) { + if (bio->bi_bdev == bio_src->bi_bdev && + bio_flagged(bio_src, BIO_REMAPPED)) + bio_set_flag(bio, BIO_REMAPPED); + bio_clone_blkg_association(bio, bio_src); + } if (bio_crypt_clone(bio, bio_src, gfp) < 0) return -ENOMEM; @@ -1730,55 +1760,13 @@ int bioset_init_from_src(struct bio_set *bs, struct bio_set *src) flags |= BIOSET_NEED_BVECS; if (src->rescue_workqueue) flags |= BIOSET_NEED_RESCUER; + if (src->cache) + flags |= BIOSET_PERCPU_CACHE; return bioset_init(bs, src->bio_pool.min_nr, src->front_pad, flags); } EXPORT_SYMBOL(bioset_init_from_src); -/** - * bio_alloc_kiocb - Allocate a bio from bio_set based on kiocb - * @kiocb: kiocb describing the IO - * @bdev: block device to allocate the bio for (can be %NULL) - * @nr_vecs: number of iovecs to pre-allocate - * @opf: operation and flags for bio - * @bs: bio_set to allocate from - * - * Description: - * Like @bio_alloc_bioset, but pass in the kiocb. The kiocb is only - * used to check if we should dip into the per-cpu bio_set allocation - * cache. The allocation uses GFP_KERNEL internally. On return, the - * bio is marked BIO_PERCPU_CACHEABLE, and the final put of the bio - * MUST be done from process context, not hard/soft IRQ. - * - */ -struct bio *bio_alloc_kiocb(struct kiocb *kiocb, struct block_device *bdev, - unsigned short nr_vecs, unsigned int opf, struct bio_set *bs) -{ - struct bio_alloc_cache *cache; - struct bio *bio; - - if (!(kiocb->ki_flags & IOCB_ALLOC_CACHE) || nr_vecs > BIO_INLINE_VECS) - return bio_alloc_bioset(bdev, nr_vecs, opf, GFP_KERNEL, bs); - - cache = per_cpu_ptr(bs->cache, get_cpu()); - if (cache->free_list) { - bio = cache->free_list; - cache->free_list = bio->bi_next; - cache->nr--; - put_cpu(); - bio_init(bio, bdev, nr_vecs ? bio->bi_inline_vecs : NULL, - nr_vecs, opf); - bio->bi_pool = bs; - bio_set_flag(bio, BIO_PERCPU_CACHE); - return bio; - } - put_cpu(); - bio = bio_alloc_bioset(bdev, nr_vecs, opf, GFP_KERNEL, bs); - bio_set_flag(bio, BIO_PERCPU_CACHE); - return bio; -} -EXPORT_SYMBOL_GPL(bio_alloc_kiocb); - static int __init init_bio(void) { int i; diff --git a/block/blk-cgroup-fc-appid.c b/block/blk-cgroup-fc-appid.c new file mode 100644 index 000000000000..760a2e1878dd --- /dev/null +++ b/block/blk-cgroup-fc-appid.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "blk-cgroup.h" + +/** + * blkcg_set_fc_appid - set the fc_app_id field associted to blkcg + * @app_id: application identifier + * @cgrp_id: cgroup id + * @app_id_len: size of application identifier + */ +int blkcg_set_fc_appid(char *app_id, u64 cgrp_id, size_t app_id_len) +{ + struct cgroup *cgrp; + struct cgroup_subsys_state *css; + struct blkcg *blkcg; + int ret = 0; + + if (app_id_len > FC_APPID_LEN) + return -EINVAL; + + cgrp = cgroup_get_from_id(cgrp_id); + if (!cgrp) + return -ENOENT; + css = cgroup_get_e_css(cgrp, &io_cgrp_subsys); + if (!css) { + ret = -ENOENT; + goto out_cgrp_put; + } + blkcg = css_to_blkcg(css); + /* + * There is a slight race condition on setting the appid. + * Worst case an I/O may not find the right id. + * This is no different from the I/O we let pass while obtaining + * the vmid from the fabric. + * Adding the overhead of a lock is not necessary. + */ + strlcpy(blkcg->fc_app_id, app_id, app_id_len); + css_put(css); +out_cgrp_put: + cgroup_put(cgrp); + return ret; +} +EXPORT_SYMBOL_GPL(blkcg_set_fc_appid); + +/** + * blkcg_get_fc_appid - get the fc app identifier associated with a bio + * @bio: target bio + * + * On success return the fc_app_id, on failure return NULL + */ +char *blkcg_get_fc_appid(struct bio *bio) +{ + if (!bio->bi_blkg || bio->bi_blkg->blkcg->fc_app_id[0] == '\0') + return NULL; + return bio->bi_blkg->blkcg->fc_app_id; +} +EXPORT_SYMBOL_GPL(blkcg_get_fc_appid); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 8dfe62786cd5..40161a3f68d0 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -59,6 +59,23 @@ static struct workqueue_struct *blkcg_punt_bio_wq; #define BLKG_DESTROY_BATCH_SIZE 64 +/** + * blkcg_css - find the current css + * + * Find the css associated with either the kthread or the current task. + * This may return a dying css, so it is up to the caller to use tryget logic + * to confirm it is alive and well. + */ +static struct cgroup_subsys_state *blkcg_css(void) +{ + struct cgroup_subsys_state *css; + + css = kthread_blkcg(); + if (css) + return css; + return task_css(current, io_cgrp_id); +} + static bool blkcg_policy_enabled(struct request_queue *q, const struct blkcg_policy *pol) { @@ -155,6 +172,33 @@ static void blkg_async_bio_workfn(struct work_struct *work) blk_finish_plug(&plug); } +/** + * bio_blkcg_css - return the blkcg CSS associated with a bio + * @bio: target bio + * + * This returns the CSS for the blkcg associated with a bio, or %NULL if not + * associated. Callers are expected to either handle %NULL or know association + * has been done prior to calling this. + */ +struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio) +{ + if (!bio || !bio->bi_blkg) + return NULL; + return &bio->bi_blkg->blkcg->css; +} +EXPORT_SYMBOL_GPL(bio_blkcg_css); + +/** + * blkcg_parent - get the parent of a blkcg + * @blkcg: blkcg of interest + * + * Return the parent blkcg of @blkcg. Can be called anytime. + */ +static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) +{ + return css_to_blkcg(blkcg->css.parent); +} + /** * blkg_alloc - allocate a blkg * @blkcg: block cgroup the new blkg is associated with @@ -254,7 +298,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct blkcg_gq *blkg; int i, ret; - WARN_ON_ONCE(!rcu_read_lock_held()); lockdep_assert_held(&q->queue_lock); /* request_queue is dying, do not create/recreate a blkg */ @@ -905,7 +948,6 @@ static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) { struct blkg_iostat_set *bis = &blkg->iostat; u64 rbytes, wbytes, rios, wios, dbytes, dios; - bool has_stats = false; const char *dname; unsigned seq; int i; @@ -931,14 +973,12 @@ static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) } while (u64_stats_fetch_retry(&bis->sync, seq)); if (rbytes || wbytes || rios || wios) { - has_stats = true; seq_printf(s, "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu", rbytes, wbytes, rios, wios, dbytes, dios); } if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) { - has_stats = true; seq_printf(s, " use_delay=%d delay_nsec=%llu", atomic_read(&blkg->use_delay), atomic64_read(&blkg->delay_nsec)); @@ -950,12 +990,10 @@ static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) if (!blkg->pd[i] || !pol->pd_stat_fn) continue; - if (pol->pd_stat_fn(blkg->pd[i], s)) - has_stats = true; + pol->pd_stat_fn(blkg->pd[i], s); } - if (has_stats) - seq_printf(s, "\n"); + seq_puts(s, "\n"); } static int blkcg_print_stat(struct seq_file *sf, void *v) @@ -994,6 +1032,13 @@ static struct cftype blkcg_legacy_files[] = { { } /* terminate */ }; +#ifdef CONFIG_CGROUP_WRITEBACK +struct list_head *blkcg_get_cgwb_list(struct cgroup_subsys_state *css) +{ + return &css_to_blkcg(css)->cgwb_list; +} +#endif + /* * blkcg destruction is a three-stage process. * @@ -1015,25 +1060,6 @@ static struct cftype blkcg_legacy_files[] = { * This finally frees the blkcg. */ -/** - * blkcg_css_offline - cgroup css_offline callback - * @css: css of interest - * - * This function is called when @css is about to go away. Here the cgwbs are - * offlined first and only once writeback associated with the blkcg has - * finished do we start step 2 (see above). - */ -static void blkcg_css_offline(struct cgroup_subsys_state *css) -{ - struct blkcg *blkcg = css_to_blkcg(css); - - /* this prevents anyone from attaching or migrating to this blkcg */ - wb_blkcg_offline(blkcg); - - /* put the base online pin allowing step 2 to be triggered */ - blkcg_unpin_online(blkcg); -} - /** * blkcg_destroy_blkgs - responsible for shooting down blkgs * @blkcg: blkcg of interest @@ -1045,7 +1071,7 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css) * * This is the blkcg counterpart of ioc_release_fn(). */ -void blkcg_destroy_blkgs(struct blkcg *blkcg) +static void blkcg_destroy_blkgs(struct blkcg *blkcg) { might_sleep(); @@ -1075,6 +1101,57 @@ void blkcg_destroy_blkgs(struct blkcg *blkcg) spin_unlock_irq(&blkcg->lock); } +/** + * blkcg_pin_online - pin online state + * @blkcg_css: blkcg of interest + * + * While pinned, a blkcg is kept online. This is primarily used to + * impedance-match blkg and cgwb lifetimes so that blkg doesn't go offline + * while an associated cgwb is still active. + */ +void blkcg_pin_online(struct cgroup_subsys_state *blkcg_css) +{ + refcount_inc(&css_to_blkcg(blkcg_css)->online_pin); +} + +/** + * blkcg_unpin_online - unpin online state + * @blkcg_css: blkcg of interest + * + * This is primarily used to impedance-match blkg and cgwb lifetimes so + * that blkg doesn't go offline while an associated cgwb is still active. + * When this count goes to zero, all active cgwbs have finished so the + * blkcg can continue destruction by calling blkcg_destroy_blkgs(). + */ +void blkcg_unpin_online(struct cgroup_subsys_state *blkcg_css) +{ + struct blkcg *blkcg = css_to_blkcg(blkcg_css); + + do { + if (!refcount_dec_and_test(&blkcg->online_pin)) + break; + blkcg_destroy_blkgs(blkcg); + blkcg = blkcg_parent(blkcg); + } while (blkcg); +} + +/** + * blkcg_css_offline - cgroup css_offline callback + * @css: css of interest + * + * This function is called when @css is about to go away. Here the cgwbs are + * offlined first and only once writeback associated with the blkcg has + * finished do we start step 2 (see above). + */ +static void blkcg_css_offline(struct cgroup_subsys_state *css) +{ + /* this prevents anyone from attaching or migrating to this blkcg */ + wb_blkcg_offline(css); + + /* put the base online pin allowing step 2 to be triggered */ + blkcg_unpin_online(css); +} + static void blkcg_css_free(struct cgroup_subsys_state *css) { struct blkcg *blkcg = css_to_blkcg(css); @@ -1163,8 +1240,7 @@ unlock: static int blkcg_css_online(struct cgroup_subsys_state *css) { - struct blkcg *blkcg = css_to_blkcg(css); - struct blkcg *parent = blkcg_parent(blkcg); + struct blkcg *parent = blkcg_parent(css_to_blkcg(css)); /* * blkcg_pin_online() is used to delay blkcg offline so that blkgs @@ -1172,7 +1248,7 @@ static int blkcg_css_online(struct cgroup_subsys_state *css) * parent so that offline always happens towards the root. */ if (parent) - blkcg_pin_online(parent); + blkcg_pin_online(css); return 0; } @@ -1201,14 +1277,13 @@ int blkcg_init_queue(struct request_queue *q) preloaded = !radix_tree_preload(GFP_KERNEL); /* Make sure the root blkg exists. */ - rcu_read_lock(); + /* spin_lock_irq can serve as RCU read-side critical section. */ spin_lock_irq(&q->queue_lock); blkg = blkg_create(&blkcg_root, q, new_blkg); if (IS_ERR(blkg)) goto err_unlock; q->root_blkg = blkg; spin_unlock_irq(&q->queue_lock); - rcu_read_unlock(); if (preloaded) radix_tree_preload_end(); @@ -1234,7 +1309,6 @@ err_destroy_all: return ret; err_unlock: spin_unlock_irq(&q->queue_lock); - rcu_read_unlock(); if (preloaded) radix_tree_preload_end(); return PTR_ERR(blkg); @@ -1726,7 +1800,6 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) void blkcg_maybe_throttle_current(void) { struct request_queue *q = current->throttle_queue; - struct cgroup_subsys_state *css; struct blkcg *blkcg; struct blkcg_gq *blkg; bool use_memdelay = current->use_memdelay; @@ -1738,12 +1811,7 @@ void blkcg_maybe_throttle_current(void) current->use_memdelay = false; rcu_read_lock(); - css = kthread_blkcg(); - if (css) - blkcg = css_to_blkcg(css); - else - blkcg = css_to_blkcg(task_css(current, io_cgrp_id)); - + blkcg = css_to_blkcg(blkcg_css()); if (!blkcg) goto out; blkg = blkg_lookup(blkcg, q); @@ -1889,7 +1957,7 @@ void bio_associate_blkg(struct bio *bio) rcu_read_lock(); if (bio->bi_blkg) - css = &bio_blkcg(bio)->css; + css = bio_blkcg_css(bio); else css = blkcg_css(); @@ -1950,6 +2018,22 @@ void blk_cgroup_bio_start(struct bio *bio) put_cpu(); } +bool blk_cgroup_congested(void) +{ + struct cgroup_subsys_state *css; + bool ret = false; + + rcu_read_lock(); + for (css = blkcg_css(); css; css = css->parent) { + if (atomic_read(&css->cgroup->congestion_count)) { + ret = true; + break; + } + } + rcu_read_unlock(); + return ret; +} + static int __init blkcg_init(void) { blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio", diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 47e1e38390c9..d4de0a35e066 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -15,13 +15,101 @@ */ #include +#include +#include #include +struct blkcg_gq; +struct blkg_policy_data; + + /* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ #define BLKG_STAT_CPU_BATCH (INT_MAX / 2) #ifdef CONFIG_BLK_CGROUP +enum blkg_iostat_type { + BLKG_IOSTAT_READ, + BLKG_IOSTAT_WRITE, + BLKG_IOSTAT_DISCARD, + + BLKG_IOSTAT_NR, +}; + +struct blkg_iostat { + u64 bytes[BLKG_IOSTAT_NR]; + u64 ios[BLKG_IOSTAT_NR]; +}; + +struct blkg_iostat_set { + struct u64_stats_sync sync; + struct blkg_iostat cur; + struct blkg_iostat last; +}; + +/* association between a blk cgroup and a request queue */ +struct blkcg_gq { + /* Pointer to the associated request_queue */ + struct request_queue *q; + struct list_head q_node; + struct hlist_node blkcg_node; + struct blkcg *blkcg; + + /* all non-root blkcg_gq's are guaranteed to have access to parent */ + struct blkcg_gq *parent; + + /* reference count */ + struct percpu_ref refcnt; + + /* is this blkg online? protected by both blkcg and q locks */ + bool online; + + struct blkg_iostat_set __percpu *iostat_cpu; + struct blkg_iostat_set iostat; + + struct blkg_policy_data *pd[BLKCG_MAX_POLS]; + + spinlock_t async_bio_lock; + struct bio_list async_bios; + union { + struct work_struct async_bio_work; + struct work_struct free_work; + }; + + atomic_t use_delay; + atomic64_t delay_nsec; + atomic64_t delay_start; + u64 last_delay; + int last_use; + + struct rcu_head rcu_head; +}; + +struct blkcg { + struct cgroup_subsys_state css; + spinlock_t lock; + refcount_t online_pin; + + struct radix_tree_root blkg_tree; + struct blkcg_gq __rcu *blkg_hint; + struct hlist_head blkg_list; + + struct blkcg_policy_data *cpd[BLKCG_MAX_POLS]; + + struct list_head all_blkcgs_node; +#ifdef CONFIG_BLK_CGROUP_FC_APPID + char fc_app_id[FC_APPID_LEN]; +#endif +#ifdef CONFIG_CGROUP_WRITEBACK + struct list_head cgwb_list; +#endif +}; + +static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct blkcg, css) : NULL; +} + /* * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a * request_queue (q). This is used by blkcg policies which need to track @@ -63,7 +151,7 @@ typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); -typedef bool (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, +typedef void (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, struct seq_file *s); struct blkcg_policy { @@ -122,53 +210,15 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, char *input, struct blkg_conf_ctx *ctx); void blkg_conf_finish(struct blkg_conf_ctx *ctx); -/** - * blkcg_css - find the current css - * - * Find the css associated with either the kthread or the current task. - * This may return a dying css, so it is up to the caller to use tryget logic - * to confirm it is alive and well. - */ -static inline struct cgroup_subsys_state *blkcg_css(void) -{ - struct cgroup_subsys_state *css; - - css = kthread_blkcg(); - if (css) - return css; - return task_css(current, io_cgrp_id); -} - -/** - * __bio_blkcg - internal, inconsistent version to get blkcg - * - * DO NOT USE. - * This function is inconsistent and consequently is dangerous to use. The - * first part of the function returns a blkcg where a reference is owned by the - * bio. This means it does not need to be rcu protected as it cannot go away - * with the bio owning a reference to it. However, the latter potentially gets - * it from task_css(). This can race against task migration and the cgroup - * dying. It is also semantically different as it must be called rcu protected - * and is susceptible to failure when trying to get a reference to it. - * Therefore, it is not ok to assume that *_get() will always succeed on the - * blkcg returned here. - */ -static inline struct blkcg *__bio_blkcg(struct bio *bio) -{ - if (bio && bio->bi_blkg) - return bio->bi_blkg->blkcg; - return css_to_blkcg(blkcg_css()); -} - /** * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg * @return: true if this bio needs to be submitted with the root blkg context. * * In order to avoid priority inversions we sometimes need to issue a bio as if * it were attached to the root blkg, and then backcharge to the actual owning - * blkg. The idea is we do bio_blkcg() to look up the actual context for the - * bio and attach the appropriate blkg to the bio. Then we call this helper and - * if it is true run with the root blkg for that queue and then do any + * blkg. The idea is we do bio_blkcg_css() to look up the actual context for + * the bio and attach the appropriate blkg to the bio. Then we call this helper + * and if it is true run with the root blkg for that queue and then do any * backcharging to the originating cgroup once the io is complete. */ static inline bool bio_issue_as_root_blkg(struct bio *bio) @@ -457,7 +507,8 @@ struct blkcg_policy_data { struct blkcg_policy { }; -#ifdef CONFIG_BLOCK +struct blkcg { +}; static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q) @@ -471,8 +522,6 @@ static inline int blkcg_activate_policy(struct request_queue *q, static inline void blkcg_deactivate_policy(struct request_queue *q, const struct blkcg_policy *pol) { } -static inline struct blkcg *__bio_blkcg(struct bio *bio) { return NULL; } - static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, struct blkcg_policy *pol) { return NULL; } static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } @@ -488,7 +537,6 @@ static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { r #define blk_queue_for_each_rl(rl, q) \ for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) -#endif /* CONFIG_BLOCK */ #endif /* CONFIG_BLK_CGROUP */ #endif /* _BLK_CGROUP_PRIVATE_H */ diff --git a/block/blk-core.c b/block/blk-core.c index bc0506772152..80fa73c419a9 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -588,10 +588,9 @@ static inline int bio_check_eod(struct bio *bio) (nr_sectors > maxsector || bio->bi_iter.bi_sector > maxsector - nr_sectors)) { pr_info_ratelimited("%s: attempt to access beyond end of device\n" - "%pg: rw=%d, want=%llu, limit=%llu\n", - current->comm, - bio->bi_bdev, bio->bi_opf, - bio_end_sector(bio), maxsector); + "%pg: rw=%d, sector=%llu, nr_sectors = %u limit=%llu\n", + current->comm, bio->bi_bdev, bio->bi_opf, + bio->bi_iter.bi_sector, nr_sectors, maxsector); return -EIO; } return 0; @@ -816,11 +815,11 @@ void submit_bio_noacct(struct bio *bio) switch (bio_op(bio)) { case REQ_OP_DISCARD: - if (!blk_queue_discard(q)) + if (!bdev_max_discard_sectors(bdev)) goto not_supported; break; case REQ_OP_SECURE_ERASE: - if (!blk_queue_secure_erase(q)) + if (!bdev_max_secure_erase_sectors(bdev)) goto not_supported; break; case REQ_OP_ZONE_APPEND: @@ -889,19 +888,11 @@ void submit_bio(struct bio *bio) if (blkcg_punt_bio_submit(bio)) return; - /* - * If it's a regular read/write or a barrier with data attached, - * go through the normal accounting stuff before submission. - */ - if (bio_has_data(bio)) { - unsigned int count = bio_sectors(bio); - - if (op_is_write(bio_op(bio))) { - count_vm_events(PGPGOUT, count); - } else { - task_io_account_read(bio->bi_iter.bi_size); - count_vm_events(PGPGIN, count); - } + if (bio_op(bio) == REQ_OP_READ) { + task_io_account_read(bio->bi_iter.bi_size); + count_vm_events(PGPGIN, bio_sectors(bio)); + } else if (bio_op(bio) == REQ_OP_WRITE) { + count_vm_events(PGPGOUT, bio_sectors(bio)); } /* @@ -1018,21 +1009,22 @@ again: } } -static unsigned long __part_start_io_acct(struct block_device *part, - unsigned int sectors, unsigned int op, - unsigned long start_time) +unsigned long bdev_start_io_acct(struct block_device *bdev, + unsigned int sectors, unsigned int op, + unsigned long start_time) { const int sgrp = op_stat_group(op); part_stat_lock(); - update_io_ticks(part, start_time, false); - part_stat_inc(part, ios[sgrp]); - part_stat_add(part, sectors[sgrp], sectors); - part_stat_local_inc(part, in_flight[op_is_write(op)]); + update_io_ticks(bdev, start_time, false); + part_stat_inc(bdev, ios[sgrp]); + part_stat_add(bdev, sectors[sgrp], sectors); + part_stat_local_inc(bdev, in_flight[op_is_write(op)]); part_stat_unlock(); return start_time; } +EXPORT_SYMBOL(bdev_start_io_acct); /** * bio_start_io_acct_time - start I/O accounting for bio based drivers @@ -1041,8 +1033,8 @@ static unsigned long __part_start_io_acct(struct block_device *part, */ void bio_start_io_acct_time(struct bio *bio, unsigned long start_time) { - __part_start_io_acct(bio->bi_bdev, bio_sectors(bio), - bio_op(bio), start_time); + bdev_start_io_acct(bio->bi_bdev, bio_sectors(bio), + bio_op(bio), start_time); } EXPORT_SYMBOL_GPL(bio_start_io_acct_time); @@ -1054,46 +1046,33 @@ EXPORT_SYMBOL_GPL(bio_start_io_acct_time); */ unsigned long bio_start_io_acct(struct bio *bio) { - return __part_start_io_acct(bio->bi_bdev, bio_sectors(bio), - bio_op(bio), jiffies); + return bdev_start_io_acct(bio->bi_bdev, bio_sectors(bio), + bio_op(bio), jiffies); } EXPORT_SYMBOL_GPL(bio_start_io_acct); -unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, - unsigned int op) -{ - return __part_start_io_acct(disk->part0, sectors, op, jiffies); -} -EXPORT_SYMBOL(disk_start_io_acct); - -static void __part_end_io_acct(struct block_device *part, unsigned int op, - unsigned long start_time) +void bdev_end_io_acct(struct block_device *bdev, unsigned int op, + unsigned long start_time) { const int sgrp = op_stat_group(op); unsigned long now = READ_ONCE(jiffies); unsigned long duration = now - start_time; part_stat_lock(); - update_io_ticks(part, now, true); - part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration)); - part_stat_local_dec(part, in_flight[op_is_write(op)]); + update_io_ticks(bdev, now, true); + part_stat_add(bdev, nsecs[sgrp], jiffies_to_nsecs(duration)); + part_stat_local_dec(bdev, in_flight[op_is_write(op)]); part_stat_unlock(); } +EXPORT_SYMBOL(bdev_end_io_acct); void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time, - struct block_device *orig_bdev) + struct block_device *orig_bdev) { - __part_end_io_acct(orig_bdev, bio_op(bio), start_time); + bdev_end_io_acct(orig_bdev, bio_op(bio), start_time); } EXPORT_SYMBOL_GPL(bio_end_io_acct_remapped); -void disk_end_io_acct(struct gendisk *disk, unsigned int op, - unsigned long start_time) -{ - __part_end_io_acct(disk->part0, op, start_time); -} -EXPORT_SYMBOL(disk_end_io_acct); - /** * blk_lld_busy - Check if underlying low-level drivers of a device are busy * @q : the queue of the device being checked diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index 35ff9746552e..f9bfafdcbc55 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -152,23 +152,25 @@ static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio) src_bio->bi_status = enc_bio->bi_status; - bio_put(enc_bio); + bio_uninit(enc_bio); + kfree(enc_bio); bio_endio(src_bio); } static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src) { + unsigned int nr_segs = bio_segments(bio_src); struct bvec_iter iter; struct bio_vec bv; struct bio *bio; - bio = bio_kmalloc(GFP_NOIO, bio_segments(bio_src)); + bio = bio_kmalloc(nr_segs, GFP_NOIO); if (!bio) return NULL; - bio->bi_bdev = bio_src->bi_bdev; + bio_init(bio, bio_src->bi_bdev, bio->bi_inline_vecs, nr_segs, + bio_src->bi_opf); if (bio_flagged(bio_src, BIO_REMAPPED)) bio_set_flag(bio, BIO_REMAPPED); - bio->bi_opf = bio_src->bi_opf; bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; @@ -177,7 +179,6 @@ static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src) bio->bi_io_vec[bio->bi_vcnt++] = bv; bio_clone_blkg_association(bio, bio_src); - blkcg_bio_issue_init(bio); bio_clone_skip_dm_default_key(bio, bio_src); @@ -365,8 +366,8 @@ out_release_keyslot: blk_crypto_put_keyslot(slot); out_put_enc_bio: if (enc_bio) - bio_put(enc_bio); - + bio_uninit(enc_bio); + kfree(enc_bio); return ret; } diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 9bd670999d0a..33a11ba971ea 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -533,8 +533,7 @@ struct ioc_gq { /* statistics */ struct iocg_pcpu_stat __percpu *pcpu_stat; - struct iocg_stat local_stat; - struct iocg_stat desc_stat; + struct iocg_stat stat; struct iocg_stat last_stat; u64 last_stat_abs_vusage; u64 usage_delta_us; @@ -1371,7 +1370,7 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) return true; } else { if (iocg->indelay_since) { - iocg->local_stat.indelay_us += now->now - iocg->indelay_since; + iocg->stat.indelay_us += now->now - iocg->indelay_since; iocg->indelay_since = 0; } iocg->delay = 0; @@ -1419,7 +1418,7 @@ static void iocg_pay_debt(struct ioc_gq *iocg, u64 abs_vpay, /* if debt is paid in full, restore inuse */ if (!iocg->abs_vdebt) { - iocg->local_stat.indebt_us += now->now - iocg->indebt_since; + iocg->stat.indebt_us += now->now - iocg->indebt_since; iocg->indebt_since = 0; propagate_weights(iocg, iocg->active, iocg->last_inuse, @@ -1513,7 +1512,7 @@ static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt, if (!waitqueue_active(&iocg->waitq)) { if (iocg->wait_since) { - iocg->local_stat.wait_us += now->now - iocg->wait_since; + iocg->stat.wait_us += now->now - iocg->wait_since; iocg->wait_since = 0; } return; @@ -1641,11 +1640,30 @@ static void iocg_build_inner_walk(struct ioc_gq *iocg, } } +/* propagate the deltas to the parent */ +static void iocg_flush_stat_upward(struct ioc_gq *iocg) +{ + if (iocg->level > 0) { + struct iocg_stat *parent_stat = + &iocg->ancestors[iocg->level - 1]->stat; + + parent_stat->usage_us += + iocg->stat.usage_us - iocg->last_stat.usage_us; + parent_stat->wait_us += + iocg->stat.wait_us - iocg->last_stat.wait_us; + parent_stat->indebt_us += + iocg->stat.indebt_us - iocg->last_stat.indebt_us; + parent_stat->indelay_us += + iocg->stat.indelay_us - iocg->last_stat.indelay_us; + } + + iocg->last_stat = iocg->stat; +} + /* collect per-cpu counters and propagate the deltas to the parent */ -static void iocg_flush_stat_one(struct ioc_gq *iocg, struct ioc_now *now) +static void iocg_flush_stat_leaf(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; - struct iocg_stat new_stat; u64 abs_vusage = 0; u64 vusage_delta; int cpu; @@ -1661,34 +1679,9 @@ static void iocg_flush_stat_one(struct ioc_gq *iocg, struct ioc_now *now) iocg->last_stat_abs_vusage = abs_vusage; iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate); - iocg->local_stat.usage_us += iocg->usage_delta_us; + iocg->stat.usage_us += iocg->usage_delta_us; - /* propagate upwards */ - new_stat.usage_us = - iocg->local_stat.usage_us + iocg->desc_stat.usage_us; - new_stat.wait_us = - iocg->local_stat.wait_us + iocg->desc_stat.wait_us; - new_stat.indebt_us = - iocg->local_stat.indebt_us + iocg->desc_stat.indebt_us; - new_stat.indelay_us = - iocg->local_stat.indelay_us + iocg->desc_stat.indelay_us; - - /* propagate the deltas to the parent */ - if (iocg->level > 0) { - struct iocg_stat *parent_stat = - &iocg->ancestors[iocg->level - 1]->desc_stat; - - parent_stat->usage_us += - new_stat.usage_us - iocg->last_stat.usage_us; - parent_stat->wait_us += - new_stat.wait_us - iocg->last_stat.wait_us; - parent_stat->indebt_us += - new_stat.indebt_us - iocg->last_stat.indebt_us; - parent_stat->indelay_us += - new_stat.indelay_us - iocg->last_stat.indelay_us; - } - - iocg->last_stat = new_stat; + iocg_flush_stat_upward(iocg); } /* get stat counters ready for reading on all active iocgs */ @@ -1699,13 +1692,13 @@ static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now) /* flush leaves and build inner node walk list */ list_for_each_entry(iocg, target_iocgs, active_list) { - iocg_flush_stat_one(iocg, now); + iocg_flush_stat_leaf(iocg, now); iocg_build_inner_walk(iocg, &inner_walk); } /* keep flushing upwards by walking the inner list backwards */ list_for_each_entry_safe_reverse(iocg, tiocg, &inner_walk, walk_list) { - iocg_flush_stat_one(iocg, now); + iocg_flush_stat_upward(iocg); list_del_init(&iocg->walk_list); } } @@ -2152,16 +2145,16 @@ static int ioc_check_iocgs(struct ioc *ioc, struct ioc_now *now) /* flush wait and indebt stat deltas */ if (iocg->wait_since) { - iocg->local_stat.wait_us += now->now - iocg->wait_since; + iocg->stat.wait_us += now->now - iocg->wait_since; iocg->wait_since = now->now; } if (iocg->indebt_since) { - iocg->local_stat.indebt_us += + iocg->stat.indebt_us += now->now - iocg->indebt_since; iocg->indebt_since = now->now; } if (iocg->indelay_since) { - iocg->local_stat.indelay_us += + iocg->stat.indelay_us += now->now - iocg->indelay_since; iocg->indelay_since = now->now; } @@ -3005,13 +2998,13 @@ static void ioc_pd_free(struct blkg_policy_data *pd) kfree(iocg); } -static bool ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) +static void ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) { struct ioc_gq *iocg = pd_to_iocg(pd); struct ioc *ioc = iocg->ioc; if (!ioc->enabled) - return false; + return; if (iocg->level == 0) { unsigned vp10k = DIV64_U64_ROUND_CLOSEST( @@ -3027,7 +3020,6 @@ static bool ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) iocg->last_stat.wait_us, iocg->last_stat.indebt_us, iocg->last_stat.indelay_us); - return true; } static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd, diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 2f33932e72e3..5b676c7cf2b6 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -891,7 +891,7 @@ static int iolatency_print_limit(struct seq_file *sf, void *v) return 0; } -static bool iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s) +static void iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s) { struct latency_stat stat; int cpu; @@ -914,17 +914,16 @@ static bool iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s) (unsigned long long)stat.ps.missed, (unsigned long long)stat.ps.total, iolat->rq_depth.max_depth); - return true; } -static bool iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) +static void iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) { struct iolatency_grp *iolat = pd_to_lat(pd); unsigned long long avg_lat; unsigned long long cur_win; if (!blkcg_debug_stats) - return false; + return; if (iolat->ssd) return iolatency_ssd_stat(iolat, s); @@ -937,7 +936,6 @@ static bool iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) else seq_printf(s, " depth=%u avg_lat=%llu win=%llu", iolat->rq_depth.max_depth, avg_lat, cur_win); - return true; } static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, diff --git a/block/blk-lib.c b/block/blk-lib.c index 237d60d8b585..09b7e1200c0f 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -10,30 +10,44 @@ #include "blk.h" -int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, int flags, - struct bio **biop) +static sector_t bio_discard_limit(struct block_device *bdev, sector_t sector) +{ + unsigned int discard_granularity = bdev_discard_granularity(bdev); + sector_t granularity_aligned_sector; + + if (bdev_is_partition(bdev)) + sector += bdev->bd_start_sect; + + granularity_aligned_sector = + round_up(sector, discard_granularity >> SECTOR_SHIFT); + + /* + * Make sure subsequent bios start aligned to the discard granularity if + * it needs to be split. + */ + if (granularity_aligned_sector != sector) + return granularity_aligned_sector - sector; + + /* + * Align the bio size to the discard granularity to make splitting the bio + * at discard granularity boundaries easier in the driver if needed. + */ + return round_down(UINT_MAX, discard_granularity) >> SECTOR_SHIFT; +} + +int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, struct bio **biop) { - struct request_queue *q = bdev_get_queue(bdev); struct bio *bio = *biop; - unsigned int op; - sector_t bs_mask, part_offset = 0; + sector_t bs_mask; if (bdev_read_only(bdev)) return -EPERM; - - if (flags & BLKDEV_DISCARD_SECURE) { - if (!blk_queue_secure_erase(q)) - return -EOPNOTSUPP; - op = REQ_OP_SECURE_ERASE; - } else { - if (!blk_queue_discard(q)) - return -EOPNOTSUPP; - op = REQ_OP_DISCARD; - } + if (!bdev_max_discard_sectors(bdev)) + return -EOPNOTSUPP; /* In case the discard granularity isn't set by buggy device driver */ - if (WARN_ON_ONCE(!q->limits.discard_granularity)) { + if (WARN_ON_ONCE(!bdev_discard_granularity(bdev))) { char dev_name[BDEVNAME_SIZE]; bdevname(bdev, dev_name); @@ -48,38 +62,11 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, if (!nr_sects) return -EINVAL; - /* In case the discard request is in a partition */ - if (bdev_is_partition(bdev)) - part_offset = bdev->bd_start_sect; - while (nr_sects) { - sector_t granularity_aligned_lba, req_sects; - sector_t sector_mapped = sector + part_offset; + sector_t req_sects = + min(nr_sects, bio_discard_limit(bdev, sector)); - granularity_aligned_lba = round_up(sector_mapped, - q->limits.discard_granularity >> SECTOR_SHIFT); - - /* - * Check whether the discard bio starts at a discard_granularity - * aligned LBA, - * - If no: set (granularity_aligned_lba - sector_mapped) to - * bi_size of the first split bio, then the second bio will - * start at a discard_granularity aligned LBA on the device. - * - If yes: use bio_aligned_discard_max_sectors() as the max - * possible bi_size of the first split bio. Then when this bio - * is split in device drive, the split ones are very probably - * to be aligned to discard_granularity of the device's queue. - */ - if (granularity_aligned_lba == sector_mapped) - req_sects = min_t(sector_t, nr_sects, - bio_aligned_discard_max_sectors(q)); - else - req_sects = min_t(sector_t, nr_sects, - granularity_aligned_lba - sector_mapped); - - WARN_ON_ONCE((req_sects << 9) > UINT_MAX); - - bio = blk_next_bio(bio, bdev, 0, op, gfp_mask); + bio = blk_next_bio(bio, bdev, 0, REQ_OP_DISCARD, gfp_mask); bio->bi_iter.bi_sector = sector; bio->bi_iter.bi_size = req_sects << 9; sector += req_sects; @@ -105,21 +92,19 @@ EXPORT_SYMBOL(__blkdev_issue_discard); * @sector: start sector * @nr_sects: number of sectors to discard * @gfp_mask: memory allocation flags (for bio_alloc) - * @flags: BLKDEV_DISCARD_* flags to control behaviour * * Description: * Issue a discard request for the sectors in question. */ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) + sector_t nr_sects, gfp_t gfp_mask) { struct bio *bio = NULL; struct blk_plug plug; int ret; blk_start_plug(&plug); - ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, flags, - &bio); + ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, &bio); if (!ret && bio) { ret = submit_bio_wait(bio); if (ret == -EOPNOTSUPP) @@ -316,3 +301,42 @@ retry: return ret; } EXPORT_SYMBOL(blkdev_issue_zeroout); + +int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp) +{ + sector_t bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; + unsigned int max_sectors = bdev_max_secure_erase_sectors(bdev); + struct bio *bio = NULL; + struct blk_plug plug; + int ret = 0; + + if (max_sectors == 0) + return -EOPNOTSUPP; + if ((sector | nr_sects) & bs_mask) + return -EINVAL; + if (bdev_read_only(bdev)) + return -EPERM; + + blk_start_plug(&plug); + for (;;) { + unsigned int len = min_t(sector_t, nr_sects, max_sectors); + + bio = blk_next_bio(bio, bdev, 0, REQ_OP_SECURE_ERASE, gfp); + bio->bi_iter.bi_sector = sector; + bio->bi_iter.bi_size = len; + + sector += len << SECTOR_SHIFT; + nr_sects -= len << SECTOR_SHIFT; + if (!nr_sects) { + ret = submit_bio_wait(bio); + bio_put(bio); + break; + } + cond_resched(); + } + blk_finish_plug(&plug); + + return ret; +} +EXPORT_SYMBOL(blkdev_issue_secure_erase); diff --git a/block/blk-map.c b/block/blk-map.c index c7f71d83eff1..df8b066cd548 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -152,10 +152,10 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data, nr_pages = bio_max_segs(DIV_ROUND_UP(offset + len, PAGE_SIZE)); ret = -ENOMEM; - bio = bio_kmalloc(gfp_mask, nr_pages); + bio = bio_kmalloc(nr_pages, gfp_mask); if (!bio) goto out_bmd; - bio->bi_opf |= req_op(rq); + bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, req_op(rq)); if (map_data) { nr_pages = 1 << map_data->page_order; @@ -224,7 +224,8 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data, cleanup: if (!map_data) bio_free_pages(bio); - bio_put(bio); + bio_uninit(bio); + kfree(bio); out_bmd: kfree(bmd); return ret; @@ -234,6 +235,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, gfp_t gfp_mask) { unsigned int max_sectors = queue_max_hw_sectors(rq->q); + unsigned int nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS); struct bio *bio; int ret; int j; @@ -241,10 +243,10 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, if (!iov_iter_count(iter)) return -EINVAL; - bio = bio_kmalloc(gfp_mask, iov_iter_npages(iter, BIO_MAX_VECS)); + bio = bio_kmalloc(nr_vecs, gfp_mask); if (!bio) return -ENOMEM; - bio->bi_opf |= req_op(rq); + bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, req_op(rq)); while (iov_iter_count(iter)) { struct page **pages; @@ -260,10 +262,9 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, npages = DIV_ROUND_UP(offs + bytes, PAGE_SIZE); - if (unlikely(offs & queue_dma_alignment(rq->q))) { - ret = -EINVAL; + if (unlikely(offs & queue_dma_alignment(rq->q))) j = 0; - } else { + else { for (j = 0; j < npages; j++) { struct page *page = pages[j]; unsigned int n = PAGE_SIZE - offs; @@ -303,7 +304,8 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, out_unmap: bio_release_pages(bio, false); - bio_put(bio); + bio_uninit(bio); + kfree(bio); return ret; } @@ -323,7 +325,8 @@ static void bio_invalidate_vmalloc_pages(struct bio *bio) static void bio_map_kern_endio(struct bio *bio) { bio_invalidate_vmalloc_pages(bio); - bio_put(bio); + bio_uninit(bio); + kfree(bio); } /** @@ -348,9 +351,10 @@ static struct bio *bio_map_kern(struct request_queue *q, void *data, int offset, i; struct bio *bio; - bio = bio_kmalloc(gfp_mask, nr_pages); + bio = bio_kmalloc(nr_pages, gfp_mask); if (!bio) return ERR_PTR(-ENOMEM); + bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, 0); if (is_vmalloc) { flush_kernel_vmap_range(data, len); @@ -374,7 +378,8 @@ static struct bio *bio_map_kern(struct request_queue *q, void *data, if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) { /* we don't support partial mappings */ - bio_put(bio); + bio_uninit(bio); + kfree(bio); return ERR_PTR(-EINVAL); } @@ -390,7 +395,8 @@ static struct bio *bio_map_kern(struct request_queue *q, void *data, static void bio_copy_kern_endio(struct bio *bio) { bio_free_pages(bio); - bio_put(bio); + bio_uninit(bio); + kfree(bio); } static void bio_copy_kern_endio_read(struct bio *bio) @@ -435,9 +441,10 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data, return ERR_PTR(-EINVAL); nr_pages = end - start; - bio = bio_kmalloc(gfp_mask, nr_pages); + bio = bio_kmalloc(nr_pages, gfp_mask); if (!bio) return ERR_PTR(-ENOMEM); + bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, 0); while (len) { struct page *page; @@ -471,7 +478,8 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data, cleanup: bio_free_pages(bio); - bio_put(bio); + bio_uninit(bio); + kfree(bio); return ERR_PTR(-ENOMEM); } @@ -602,7 +610,8 @@ int blk_rq_unmap_user(struct bio *bio) next_bio = bio; bio = bio->bi_next; - bio_put(next_bio); + bio_uninit(next_bio); + kfree(next_bio); } return ret; @@ -648,8 +657,10 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, bio->bi_opf |= req_op(rq); ret = blk_rq_append_bio(rq, bio); - if (unlikely(ret)) - bio_put(bio); + if (unlikely(ret)) { + bio_uninit(bio); + kfree(bio); + } return ret; } EXPORT_SYMBOL(blk_rq_map_kern); diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index aa0349e9f083..7e4136a60e1c 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -113,10 +113,8 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(FAIL_IO), QUEUE_FLAG_NAME(NONROT), QUEUE_FLAG_NAME(IO_STAT), - QUEUE_FLAG_NAME(DISCARD), QUEUE_FLAG_NAME(NOXMERGES), QUEUE_FLAG_NAME(ADD_RANDOM), - QUEUE_FLAG_NAME(SECERASE), QUEUE_FLAG_NAME(SAME_FORCE), QUEUE_FLAG_NAME(DEAD), QUEUE_FLAG_NAME(INIT_DONE), diff --git a/block/blk-mq.c b/block/blk-mq.c index ed1869a305c4..ae116b755648 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1083,7 +1083,7 @@ bool blk_mq_complete_request_remote(struct request *rq) WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); /* - * For a polled request, always complete locallly, it's pointless + * For a polled request, always complete locally, it's pointless * to redirect the completion. */ if (rq->cmd_flags & REQ_POLLED) diff --git a/block/blk-settings.c b/block/blk-settings.c index b83df3d2eebc..6ccceb421ed2 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -46,6 +46,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->max_zone_append_sectors = 0; lim->max_discard_sectors = 0; lim->max_hw_discard_sectors = 0; + lim->max_secure_erase_sectors = 0; lim->discard_granularity = 0; lim->discard_alignment = 0; lim->discard_misaligned = 0; @@ -176,6 +177,18 @@ void blk_queue_max_discard_sectors(struct request_queue *q, } EXPORT_SYMBOL(blk_queue_max_discard_sectors); +/** + * blk_queue_max_secure_erase_sectors - set max sectors for a secure erase + * @q: the request queue for the device + * @max_sectors: maximum number of sectors to secure_erase + **/ +void blk_queue_max_secure_erase_sectors(struct request_queue *q, + unsigned int max_sectors) +{ + q->limits.max_secure_erase_sectors = max_sectors; +} +EXPORT_SYMBOL(blk_queue_max_secure_erase_sectors); + /** * blk_queue_max_write_zeroes_sectors - set max sectors for a single * write zeroes @@ -468,6 +481,40 @@ void blk_queue_io_opt(struct request_queue *q, unsigned int opt) } EXPORT_SYMBOL(blk_queue_io_opt); +static int queue_limit_alignment_offset(struct queue_limits *lim, + sector_t sector) +{ + unsigned int granularity = max(lim->physical_block_size, lim->io_min); + unsigned int alignment = sector_div(sector, granularity >> SECTOR_SHIFT) + << SECTOR_SHIFT; + + return (granularity + lim->alignment_offset - alignment) % granularity; +} + +static unsigned int queue_limit_discard_alignment(struct queue_limits *lim, + sector_t sector) +{ + unsigned int alignment, granularity, offset; + + if (!lim->max_discard_sectors) + return 0; + + /* Why are these in bytes, not sectors? */ + alignment = lim->discard_alignment >> SECTOR_SHIFT; + granularity = lim->discard_granularity >> SECTOR_SHIFT; + if (!granularity) + return 0; + + /* Offset of the partition start in 'granularity' sectors */ + offset = sector_div(sector, granularity); + + /* And why do we do this modulus *again* in blkdev_issue_discard()? */ + offset = (granularity + alignment - offset) % granularity; + + /* Turn it back into bytes, gaah */ + return offset << SECTOR_SHIFT; +} + static unsigned int blk_round_down_sectors(unsigned int sectors, unsigned int lbs) { sectors = round_down(sectors, lbs >> SECTOR_SHIFT); @@ -627,7 +674,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->discard_alignment = lcm_not_zero(t->discard_alignment, alignment) % t->discard_granularity; } - + t->max_secure_erase_sectors = min_not_zero(t->max_secure_erase_sectors, + b->max_secure_erase_sectors); t->zone_write_granularity = max(t->zone_write_granularity, b->zone_write_granularity); t->zoned = max(t->zoned, b->zoned); @@ -901,3 +949,27 @@ void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model) } } EXPORT_SYMBOL_GPL(blk_queue_set_zoned); + +int bdev_alignment_offset(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (q->limits.misaligned) + return -1; + if (bdev_is_partition(bdev)) + return queue_limit_alignment_offset(&q->limits, + bdev->bd_start_sect); + return q->limits.alignment_offset; +} +EXPORT_SYMBOL_GPL(bdev_alignment_offset); + +unsigned int bdev_discard_alignment(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (bdev_is_partition(bdev)) + return queue_limit_discard_alignment(&q->limits, + bdev->bd_start_sect); + return q->limits.discard_alignment; +} +EXPORT_SYMBOL_GPL(bdev_discard_alignment); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 469c483719be..139b2d7a99e2 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -227,7 +227,7 @@ static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw) break; \ if ((__tg)) { \ blk_add_cgroup_trace_msg(__td->queue, \ - tg_to_blkg(__tg)->blkcg, "throtl " fmt, ##args);\ + &tg_to_blkg(__tg)->blkcg->css, "throtl " fmt, ##args);\ } else { \ blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \ } \ @@ -2189,13 +2189,14 @@ again: } out_unlock: - spin_unlock_irq(&q->queue_lock); bio_set_flag(bio, BIO_THROTTLED); #ifdef CONFIG_BLK_DEV_THROTTLING_LOW if (throttled || !td->track_bio_latency) bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY; #endif + spin_unlock_irq(&q->queue_lock); + rcu_read_unlock(); return throttled; } diff --git a/block/blk.h b/block/blk.h index 8ccbc6e07636..434017701403 100644 --- a/block/blk.h +++ b/block/blk.h @@ -346,20 +346,6 @@ static inline unsigned int bio_allowed_max_sectors(struct request_queue *q) return round_down(UINT_MAX, queue_logical_block_size(q)) >> 9; } -/* - * The max bio size which is aligned to q->limits.discard_granularity. This - * is a hint to split large discard bio in generic block layer, then if device - * driver needs to split the discard bio into smaller ones, their bi_size can - * be very probably and easily aligned to discard_granularity of the device's - * queue. - */ -static inline unsigned int bio_aligned_discard_max_sectors( - struct request_queue *q) -{ - return round_down(UINT_MAX, q->limits.discard_granularity) >> - SECTOR_SHIFT; -} - /* * Internal io_context interface */ @@ -450,13 +436,6 @@ extern struct device_attribute dev_attr_events; extern struct device_attribute dev_attr_events_async; extern struct device_attribute dev_attr_events_poll_msecs; -static inline void bio_clear_polled(struct bio *bio) -{ - /* can't support alloc cache if we turn off polling */ - bio_clear_flag(bio, BIO_PERCPU_CACHE); - bio->bi_opf &= ~REQ_POLLED; -} - long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); diff --git a/block/bounce.c b/block/bounce.c index 467be46d0e65..8f7b6fe3b4db 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -191,7 +191,6 @@ static struct bio *bounce_clone_bio(struct bio *bio_src) goto err_put; bio_clone_blkg_association(bio, bio_src); - blkcg_bio_issue_init(bio); return bio; diff --git a/block/fops.c b/block/fops.c index 9f2ecec406b0..b9b83030e0df 100644 --- a/block/fops.c +++ b/block/fops.c @@ -44,14 +44,6 @@ static unsigned int dio_bio_write_op(struct kiocb *iocb) #define DIO_INLINE_BIO_VECS 4 -static void blkdev_bio_end_io_simple(struct bio *bio) -{ - struct task_struct *waiter = bio->bi_private; - - WRITE_ONCE(bio->bi_private, NULL); - blk_wake_io_task(waiter); -} - static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, unsigned int nr_pages) { @@ -83,8 +75,6 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, bio_init(&bio, bdev, vecs, nr_pages, dio_bio_write_op(iocb)); } bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT; - bio.bi_private = current; - bio.bi_end_io = blkdev_bio_end_io_simple; bio.bi_ioprio = iocb->ki_ioprio; ret = bio_iov_iter_get_pages(&bio, iter); @@ -97,18 +87,8 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, if (iocb->ki_flags & IOCB_NOWAIT) bio.bi_opf |= REQ_NOWAIT; - if (iocb->ki_flags & IOCB_HIPRI) - bio_set_polled(&bio, iocb); - submit_bio(&bio); - for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (!READ_ONCE(bio.bi_private)) - break; - if (!(iocb->ki_flags & IOCB_HIPRI) || !bio_poll(&bio, NULL, 0)) - blk_io_schedule(); - } - __set_current_state(TASK_RUNNING); + submit_bio_wait(&bio); bio_release_pages(&bio, should_dirty); if (unlikely(bio.bi_status)) @@ -197,8 +177,10 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, (bdev_logical_block_size(bdev) - 1)) return -EINVAL; - bio = bio_alloc_kiocb(iocb, bdev, nr_pages, opf, &blkdev_dio_pool); - + if (iocb->ki_flags & IOCB_ALLOC_CACHE) + opf |= REQ_ALLOC_CACHE; + bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL, + &blkdev_dio_pool); dio = container_of(bio, struct blkdev_dio, bio); atomic_set(&dio->ref, 1); /* @@ -320,7 +302,10 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, (bdev_logical_block_size(bdev) - 1)) return -EINVAL; - bio = bio_alloc_kiocb(iocb, bdev, nr_pages, opf, &blkdev_dio_pool); + if (iocb->ki_flags & IOCB_ALLOC_CACHE) + opf |= REQ_ALLOC_CACHE; + bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL, + &blkdev_dio_pool); dio = container_of(bio, struct blkdev_dio, bio); dio->flags = 0; dio->iocb = iocb; @@ -672,7 +657,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, break; case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE: error = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT, - len >> SECTOR_SHIFT, GFP_KERNEL, 0); + len >> SECTOR_SHIFT, GFP_KERNEL); break; default: error = -EOPNOTSUPP; diff --git a/block/genhd.c b/block/genhd.c index b8b6759d670f..36532b931841 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1010,7 +1010,7 @@ static ssize_t disk_alignment_offset_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue)); + return sprintf(buf, "%d\n", bdev_alignment_offset(disk->part0)); } static ssize_t disk_discard_alignment_show(struct device *dev, @@ -1019,7 +1019,7 @@ static ssize_t disk_discard_alignment_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue)); + return sprintf(buf, "%d\n", bdev_alignment_offset(disk->part0)); } static ssize_t diskseq_show(struct device *dev, diff --git a/block/ioctl.c b/block/ioctl.c index f8703db99c73..46949f1b0dba 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -83,18 +83,17 @@ static int compat_blkpg_ioctl(struct block_device *bdev, #endif static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, - unsigned long arg, unsigned long flags) + unsigned long arg) { uint64_t range[2]; uint64_t start, len; - struct request_queue *q = bdev_get_queue(bdev); struct inode *inode = bdev->bd_inode; int err; if (!(mode & FMODE_WRITE)) return -EBADF; - if (!blk_queue_discard(q)) + if (!bdev_max_discard_sectors(bdev)) return -EOPNOTSUPP; if (copy_from_user(range, (void __user *)arg, sizeof(range))) @@ -115,15 +114,43 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, err = truncate_bdev_range(bdev, mode, start, start + len - 1); if (err) goto fail; - - err = blkdev_issue_discard(bdev, start >> 9, len >> 9, - GFP_KERNEL, flags); - + err = blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); fail: filemap_invalidate_unlock(inode->i_mapping); return err; } +static int blk_ioctl_secure_erase(struct block_device *bdev, fmode_t mode, + void __user *argp) +{ + uint64_t start, len; + uint64_t range[2]; + int err; + + if (!(mode & FMODE_WRITE)) + return -EBADF; + if (!bdev_max_secure_erase_sectors(bdev)) + return -EOPNOTSUPP; + if (copy_from_user(range, argp, sizeof(range))) + return -EFAULT; + + start = range[0]; + len = range[1]; + if ((start & 511) || (len & 511)) + return -EINVAL; + if (start + len > bdev_nr_bytes(bdev)) + return -EINVAL; + + filemap_invalidate_lock(bdev->bd_inode->i_mapping); + err = truncate_bdev_range(bdev, mode, start, start + len - 1); + if (!err) + err = blkdev_issue_secure_erase(bdev, start >> 9, len >> 9, + GFP_KERNEL); + filemap_invalidate_unlock(bdev->bd_inode->i_mapping); + return err; +} + + static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, unsigned long arg) { @@ -451,10 +478,9 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, case BLKROSET: return blkdev_roset(bdev, mode, cmd, arg); case BLKDISCARD: - return blk_ioctl_discard(bdev, mode, arg, 0); + return blk_ioctl_discard(bdev, mode, arg); case BLKSECDISCARD: - return blk_ioctl_discard(bdev, mode, arg, - BLKDEV_DISCARD_SECURE); + return blk_ioctl_secure_erase(bdev, mode, argp); case BLKZEROOUT: return blk_ioctl_zeroout(bdev, mode, arg); case BLKGETDISKSEQ: @@ -489,7 +515,7 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, queue_max_sectors(bdev_get_queue(bdev))); return put_ushort(argp, max_sectors); case BLKROTATIONAL: - return put_ushort(argp, !blk_queue_nonrot(bdev_get_queue(bdev))); + return put_ushort(argp, !bdev_nonrot(bdev)); case BLKRASET: case BLKFRASET: if(!capable(CAP_SYS_ADMIN)) diff --git a/block/partitions/acorn.c b/block/partitions/acorn.c index 2c381c694c57..d2fc122d7426 100644 --- a/block/partitions/acorn.c +++ b/block/partitions/acorn.c @@ -282,13 +282,13 @@ int adfspart_check_ADFS(struct parsed_partitions *state) #ifdef CONFIG_ACORN_PARTITION_RISCIX case PARTITION_RISCIX_SCSI: case PARTITION_RISCIX_MFM: - slot = riscix_partition(state, start_sect, slot, + riscix_partition(state, start_sect, slot, nr_sects); break; #endif case PARTITION_LINUX: - slot = linux_partition(state, start_sect, slot, + linux_partition(state, start_sect, slot, nr_sects); break; } diff --git a/block/partitions/atari.c b/block/partitions/atari.c index da5994175416..9655c728262a 100644 --- a/block/partitions/atari.c +++ b/block/partitions/atari.c @@ -140,7 +140,6 @@ int atari_partition(struct parsed_partitions *state) /* accept only GEM,BGM,RAW,LNX,SWP partitions */ if (!((pi->flg & 1) && OK_id(pi->id))) continue; - part_fmt = 2; put_partition (state, slot, be32_to_cpu(pi->st), be32_to_cpu(pi->siz)); diff --git a/block/partitions/core.c b/block/partitions/core.c index 2ef8dfa1e5c8..8a0ec929023b 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -200,21 +200,13 @@ static ssize_t part_ro_show(struct device *dev, static ssize_t part_alignment_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct block_device *bdev = dev_to_bdev(dev); - - return sprintf(buf, "%u\n", - queue_limit_alignment_offset(&bdev_get_queue(bdev)->limits, - bdev->bd_start_sect)); + return sprintf(buf, "%u\n", bdev_alignment_offset(dev_to_bdev(dev))); } static ssize_t part_discard_alignment_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct block_device *bdev = dev_to_bdev(dev); - - return sprintf(buf, "%u\n", - queue_limit_discard_alignment(&bdev_get_queue(bdev)->limits, - bdev->bd_start_sect)); + return sprintf(buf, "%u\n", bdev_discard_alignment(dev_to_bdev(dev))); } static DEVICE_ATTR(partition, 0444, part_partition_show, NULL); @@ -486,7 +478,7 @@ int bdev_del_partition(struct gendisk *disk, int partno) goto out_unlock; ret = -EBUSY; - if (part->bd_openers) + if (atomic_read(&part->bd_openers)) goto out_unlock; delete_partition(part); diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c index 27f6c7d9c776..38e58960ae03 100644 --- a/block/partitions/ldm.c +++ b/block/partitions/ldm.c @@ -736,7 +736,6 @@ static bool ldm_parse_cmp3 (const u8 *buffer, int buflen, struct vblk *vb) len = r_cols; } else { r_stripe = 0; - r_cols = 0; len = r_parent; } if (len < 0) @@ -783,11 +782,8 @@ static int ldm_parse_dgr3 (const u8 *buffer, int buflen, struct vblk *vb) r_id1 = ldm_relative (buffer, buflen, 0x24, r_diskid); r_id2 = ldm_relative (buffer, buflen, 0x24, r_id1); len = r_id2; - } else { - r_id1 = 0; - r_id2 = 0; + } else len = r_diskid; - } if (len < 0) return false; @@ -826,11 +822,8 @@ static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb) r_id1 = ldm_relative (buffer, buflen, 0x44, r_name); r_id2 = ldm_relative (buffer, buflen, 0x44, r_id1); len = r_id2; - } else { - r_id1 = 0; - r_id2 = 0; + } else len = r_name; - } if (len < 0) return false; @@ -963,10 +956,8 @@ static bool ldm_parse_prt3(const u8 *buffer, int buflen, struct vblk *vb) return false; } len = r_index; - } else { - r_index = 0; + } else len = r_diskid; - } if (len < 0) { ldm_error("len %d < 0", len); return false; diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h index 84d0fcebd6af..749ae1246f4c 100644 --- a/drivers/block/aoe/aoe.h +++ b/drivers/block/aoe/aoe.h @@ -244,3 +244,5 @@ void aoenet_exit(void); void aoenet_xmit(struct sk_buff_head *); int is_aoe_netif(struct net_device *ifp); int set_aoe_iflist(const char __user *str, size_t size); + +extern struct workqueue_struct *aoe_wq; diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 8a91fcac6f82..348adf335217 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -435,7 +435,7 @@ err_mempool: err: spin_lock_irqsave(&d->lock, flags); d->flags &= ~DEVFL_GD_NOW; - schedule_work(&d->work); + queue_work(aoe_wq, &d->work); spin_unlock_irqrestore(&d->lock, flags); } diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 384073ef2323..d7317425be51 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -968,7 +968,7 @@ ataid_complete(struct aoedev *d, struct aoetgt *t, unsigned char *id) d->flags |= DEVFL_NEWSIZE; else d->flags |= DEVFL_GDALLOC; - schedule_work(&d->work); + queue_work(aoe_wq, &d->work); } static void diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c index c5753c6bfe80..b381d1c3ef32 100644 --- a/drivers/block/aoe/aoedev.c +++ b/drivers/block/aoe/aoedev.c @@ -321,7 +321,7 @@ flush(const char __user *str, size_t cnt, int exiting) specified = 1; } - flush_scheduled_work(); + flush_workqueue(aoe_wq); /* pass one: do aoedev_downdev, which might sleep */ restart1: spin_lock_irqsave(&devlist_lock, flags); @@ -520,7 +520,7 @@ freetgt(struct aoedev *d, struct aoetgt *t) void aoedev_exit(void) { - flush_scheduled_work(); + flush_workqueue(aoe_wq); flush(NULL, 0, EXITING); } diff --git a/drivers/block/aoe/aoemain.c b/drivers/block/aoe/aoemain.c index 1e4e2971171c..6238c4c87cfc 100644 --- a/drivers/block/aoe/aoemain.c +++ b/drivers/block/aoe/aoemain.c @@ -16,6 +16,7 @@ MODULE_DESCRIPTION("AoE block/char driver for 2.6.2 and newer 2.6 kernels"); MODULE_VERSION(VERSION); static struct timer_list timer; +struct workqueue_struct *aoe_wq; static void discover_timer(struct timer_list *t) { @@ -35,6 +36,7 @@ aoe_exit(void) aoechr_exit(); aoedev_exit(); aoeblk_exit(); /* free cache after de-allocating bufs */ + destroy_workqueue(aoe_wq); } static int __init @@ -42,9 +44,13 @@ aoe_init(void) { int ret; + aoe_wq = alloc_workqueue("aoe_wq", 0, 0); + if (!aoe_wq) + return -ENOMEM; + ret = aoedev_init(); if (ret) - return ret; + goto dev_fail; ret = aoechr_init(); if (ret) goto chr_fail; @@ -77,6 +83,8 @@ aoe_init(void) aoechr_exit(); chr_fail: aoedev_exit(); + dev_fail: + destroy_workqueue(aoe_wq); printk(KERN_INFO "aoe: initialisation failure.\n"); return ret; diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index df25eecf80af..9e060e49b3f8 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -683,7 +683,7 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi } } - want = ALIGN(words*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT; + want = PFN_UP(words*sizeof(long)); have = b->bm_number_of_pages; if (want == have) { D_ASSERT(device, b->bm_pages != NULL); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 4b0b25cc916e..2887350ae010 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -903,31 +903,6 @@ void drbd_gen_and_send_sync_uuid(struct drbd_peer_device *peer_device) } } -/* communicated if (agreed_features & DRBD_FF_WSAME) */ -static void -assign_p_sizes_qlim(struct drbd_device *device, struct p_sizes *p, - struct request_queue *q) -{ - if (q) { - p->qlim->physical_block_size = cpu_to_be32(queue_physical_block_size(q)); - p->qlim->logical_block_size = cpu_to_be32(queue_logical_block_size(q)); - p->qlim->alignment_offset = cpu_to_be32(queue_alignment_offset(q)); - p->qlim->io_min = cpu_to_be32(queue_io_min(q)); - p->qlim->io_opt = cpu_to_be32(queue_io_opt(q)); - p->qlim->discard_enabled = blk_queue_discard(q); - p->qlim->write_same_capable = 0; - } else { - q = device->rq_queue; - p->qlim->physical_block_size = cpu_to_be32(queue_physical_block_size(q)); - p->qlim->logical_block_size = cpu_to_be32(queue_logical_block_size(q)); - p->qlim->alignment_offset = 0; - p->qlim->io_min = cpu_to_be32(queue_io_min(q)); - p->qlim->io_opt = cpu_to_be32(queue_io_opt(q)); - p->qlim->discard_enabled = 0; - p->qlim->write_same_capable = 0; - } -} - int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enum dds_flags flags) { struct drbd_device *device = peer_device->device; @@ -949,7 +924,9 @@ int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enu memset(p, 0, packet_size); if (get_ldev_if_state(device, D_NEGOTIATING)) { - struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev); + struct block_device *bdev = device->ldev->backing_bdev; + struct request_queue *q = bdev_get_queue(bdev); + d_size = drbd_get_max_capacity(device->ldev); rcu_read_lock(); u_size = rcu_dereference(device->ldev->disk_conf)->disk_size; @@ -957,14 +934,32 @@ int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enu q_order_type = drbd_queue_order_type(device); max_bio_size = queue_max_hw_sectors(q) << 9; max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE); - assign_p_sizes_qlim(device, p, q); + p->qlim->physical_block_size = + cpu_to_be32(bdev_physical_block_size(bdev)); + p->qlim->logical_block_size = + cpu_to_be32(bdev_logical_block_size(bdev)); + p->qlim->alignment_offset = + cpu_to_be32(bdev_alignment_offset(bdev)); + p->qlim->io_min = cpu_to_be32(bdev_io_min(bdev)); + p->qlim->io_opt = cpu_to_be32(bdev_io_opt(bdev)); + p->qlim->discard_enabled = !!bdev_max_discard_sectors(bdev); put_ldev(device); } else { + struct request_queue *q = device->rq_queue; + + p->qlim->physical_block_size = + cpu_to_be32(queue_physical_block_size(q)); + p->qlim->logical_block_size = + cpu_to_be32(queue_logical_block_size(q)); + p->qlim->alignment_offset = 0; + p->qlim->io_min = cpu_to_be32(queue_io_min(q)); + p->qlim->io_opt = cpu_to_be32(queue_io_opt(q)); + p->qlim->discard_enabled = 0; + d_size = 0; u_size = 0; q_order_type = QUEUE_ORDERED_NONE; max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */ - assign_p_sizes_qlim(device, p, NULL); } if (peer_device->connection->agreed_pro_version <= 94) @@ -3586,9 +3581,8 @@ const char *cmdname(enum drbd_packet cmd) * when we want to support more than * one PRO_VERSION */ static const char *cmdnames[] = { + [P_DATA] = "Data", - [P_WSAME] = "WriteSame", - [P_TRIM] = "Trim", [P_DATA_REPLY] = "DataReply", [P_RS_DATA_REPLY] = "RSDataReply", [P_BARRIER] = "Barrier", @@ -3599,7 +3593,6 @@ const char *cmdname(enum drbd_packet cmd) [P_DATA_REQUEST] = "DataRequest", [P_RS_DATA_REQUEST] = "RSDataRequest", [P_SYNC_PARAM] = "SyncParam", - [P_SYNC_PARAM89] = "SyncParam89", [P_PROTOCOL] = "ReportProtocol", [P_UUIDS] = "ReportUUIDs", [P_SIZES] = "ReportSizes", @@ -3607,6 +3600,7 @@ const char *cmdname(enum drbd_packet cmd) [P_SYNC_UUID] = "ReportSyncUUID", [P_AUTH_CHALLENGE] = "AuthChallenge", [P_AUTH_RESPONSE] = "AuthResponse", + [P_STATE_CHG_REQ] = "StateChgRequest", [P_PING] = "Ping", [P_PING_ACK] = "PingAck", [P_RECV_ACK] = "RecvAck", @@ -3617,23 +3611,25 @@ const char *cmdname(enum drbd_packet cmd) [P_NEG_DREPLY] = "NegDReply", [P_NEG_RS_DREPLY] = "NegRSDReply", [P_BARRIER_ACK] = "BarrierAck", - [P_STATE_CHG_REQ] = "StateChgRequest", [P_STATE_CHG_REPLY] = "StateChgReply", [P_OV_REQUEST] = "OVRequest", [P_OV_REPLY] = "OVReply", [P_OV_RESULT] = "OVResult", [P_CSUM_RS_REQUEST] = "CsumRSRequest", [P_RS_IS_IN_SYNC] = "CsumRSIsInSync", + [P_SYNC_PARAM89] = "SyncParam89", [P_COMPRESSED_BITMAP] = "CBitmap", [P_DELAY_PROBE] = "DelayProbe", [P_OUT_OF_SYNC] = "OutOfSync", - [P_RETRY_WRITE] = "RetryWrite", [P_RS_CANCEL] = "RSCancel", [P_CONN_ST_CHG_REQ] = "conn_st_chg_req", [P_CONN_ST_CHG_REPLY] = "conn_st_chg_reply", [P_PROTOCOL_UPDATE] = "protocol_update", + [P_TRIM] = "Trim", [P_RS_THIN_REQ] = "rs_thin_req", [P_RS_DEALLOCATED] = "rs_deallocated", + [P_WSAME] = "WriteSame", + [P_ZEROES] = "Zeroes", /* enum drbd_packet, but not commands - obsoleted flags: * P_MAY_IGNORE diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index b7216c186ba4..013d355a2033 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -770,6 +770,7 @@ int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info) struct set_role_parms parms; int err; enum drbd_ret_code retcode; + enum drbd_state_rv rv; retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) @@ -790,14 +791,14 @@ int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info) mutex_lock(&adm_ctx.resource->adm_mutex); if (info->genlhdr->cmd == DRBD_ADM_PRIMARY) - retcode = (enum drbd_ret_code)drbd_set_role(adm_ctx.device, - R_PRIMARY, parms.assume_uptodate); + rv = drbd_set_role(adm_ctx.device, R_PRIMARY, parms.assume_uptodate); else - retcode = (enum drbd_ret_code)drbd_set_role(adm_ctx.device, - R_SECONDARY, 0); + rv = drbd_set_role(adm_ctx.device, R_SECONDARY, 0); mutex_unlock(&adm_ctx.resource->adm_mutex); genl_lock(); + drbd_adm_finish(&adm_ctx, info, rv); + return 0; out: drbd_adm_finish(&adm_ctx, info, retcode); return 0; @@ -1204,50 +1205,40 @@ static unsigned int drbd_max_discard_sectors(struct drbd_connection *connection) } static void decide_on_discard_support(struct drbd_device *device, - struct request_queue *q, - struct request_queue *b, - bool discard_zeroes_if_aligned) + struct drbd_backing_dev *bdev) { - /* q = drbd device queue (device->rq_queue) - * b = backing device queue (device->ldev->backing_bdev->bd_disk->queue), - * or NULL if diskless + struct drbd_connection *connection = + first_peer_device(device)->connection; + struct request_queue *q = device->rq_queue; + + if (bdev && !bdev_max_discard_sectors(bdev->backing_bdev)) + goto not_supported; + + if (connection->cstate >= C_CONNECTED && + !(connection->agreed_features & DRBD_FF_TRIM)) { + drbd_info(connection, + "peer DRBD too old, does not support TRIM: disabling discards\n"); + goto not_supported; + } + + /* + * We don't care for the granularity, really. + * + * Stacking limits below should fix it for the local device. Whether or + * not it is a suitable granularity on the remote device is not our + * problem, really. If you care, you need to use devices with similar + * topology on all peers. */ - struct drbd_connection *connection = first_peer_device(device)->connection; - bool can_do = b ? blk_queue_discard(b) : true; + blk_queue_discard_granularity(q, 512); + q->limits.max_discard_sectors = drbd_max_discard_sectors(connection); + q->limits.max_write_zeroes_sectors = + drbd_max_discard_sectors(connection); + return; - if (can_do && connection->cstate >= C_CONNECTED && !(connection->agreed_features & DRBD_FF_TRIM)) { - can_do = false; - drbd_info(connection, "peer DRBD too old, does not support TRIM: disabling discards\n"); - } - if (can_do) { - /* We don't care for the granularity, really. - * Stacking limits below should fix it for the local - * device. Whether or not it is a suitable granularity - * on the remote device is not our problem, really. If - * you care, you need to use devices with similar - * topology on all peers. */ - blk_queue_discard_granularity(q, 512); - q->limits.max_discard_sectors = drbd_max_discard_sectors(connection); - blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); - q->limits.max_write_zeroes_sectors = drbd_max_discard_sectors(connection); - } else { - blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q); - blk_queue_discard_granularity(q, 0); - q->limits.max_discard_sectors = 0; - q->limits.max_write_zeroes_sectors = 0; - } -} - -static void fixup_discard_if_not_supported(struct request_queue *q) -{ - /* To avoid confusion, if this queue does not support discard, clear - * max_discard_sectors, which is what lsblk -D reports to the user. - * Older kernels got this wrong in "stack limits". - * */ - if (!blk_queue_discard(q)) { - blk_queue_max_discard_sectors(q, 0); - blk_queue_discard_granularity(q, 0); - } +not_supported: + blk_queue_discard_granularity(q, 0); + q->limits.max_discard_sectors = 0; + q->limits.max_write_zeroes_sectors = 0; } static void fixup_write_zeroes(struct drbd_device *device, struct request_queue *q) @@ -1273,7 +1264,6 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi unsigned int max_segments = 0; struct request_queue *b = NULL; struct disk_conf *dc; - bool discard_zeroes_if_aligned = true; if (bdev) { b = bdev->backing_bdev->bd_disk->queue; @@ -1282,7 +1272,6 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi rcu_read_lock(); dc = rcu_dereference(device->ldev->disk_conf); max_segments = dc->max_bio_bvecs; - discard_zeroes_if_aligned = dc->discard_zeroes_if_aligned; rcu_read_unlock(); blk_set_stacking_limits(&q->limits); @@ -1292,13 +1281,12 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi /* This is the workaround for "bio would need to, but cannot, be split" */ blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS); blk_queue_segment_boundary(q, PAGE_SIZE-1); - decide_on_discard_support(device, q, b, discard_zeroes_if_aligned); + decide_on_discard_support(device, bdev); if (b) { blk_stack_limits(&q->limits, &b->limits, 0); disk_update_readahead(device->vdisk); } - fixup_discard_if_not_supported(q); fixup_write_zeroes(device, q); } @@ -1437,14 +1425,14 @@ static bool write_ordering_changed(struct disk_conf *a, struct disk_conf *b) static void sanitize_disk_conf(struct drbd_device *device, struct disk_conf *disk_conf, struct drbd_backing_dev *nbc) { - struct request_queue * const q = nbc->backing_bdev->bd_disk->queue; + struct block_device *bdev = nbc->backing_bdev; if (disk_conf->al_extents < DRBD_AL_EXTENTS_MIN) disk_conf->al_extents = DRBD_AL_EXTENTS_MIN; if (disk_conf->al_extents > drbd_al_extents_max(nbc)) disk_conf->al_extents = drbd_al_extents_max(nbc); - if (!blk_queue_discard(q)) { + if (!bdev_max_discard_sectors(bdev)) { if (disk_conf->rs_discard_granularity) { disk_conf->rs_discard_granularity = 0; /* disable feature */ drbd_info(device, "rs_discard_granularity feature disabled\n"); @@ -1453,16 +1441,19 @@ static void sanitize_disk_conf(struct drbd_device *device, struct disk_conf *dis if (disk_conf->rs_discard_granularity) { int orig_value = disk_conf->rs_discard_granularity; + sector_t discard_size = bdev_max_discard_sectors(bdev) << 9; + unsigned int discard_granularity = bdev_discard_granularity(bdev); int remainder; - if (q->limits.discard_granularity > disk_conf->rs_discard_granularity) - disk_conf->rs_discard_granularity = q->limits.discard_granularity; + if (discard_granularity > disk_conf->rs_discard_granularity) + disk_conf->rs_discard_granularity = discard_granularity; - remainder = disk_conf->rs_discard_granularity % q->limits.discard_granularity; + remainder = disk_conf->rs_discard_granularity % + discard_granularity; disk_conf->rs_discard_granularity += remainder; - if (disk_conf->rs_discard_granularity > q->limits.max_discard_sectors << 9) - disk_conf->rs_discard_granularity = q->limits.max_discard_sectors << 9; + if (disk_conf->rs_discard_granularity > discard_size) + disk_conf->rs_discard_granularity = discard_size; if (disk_conf->rs_discard_granularity != orig_value) drbd_info(device, "rs_discard_granularity changed to %d\n", @@ -1611,8 +1602,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) drbd_send_sync_param(peer_device); } - synchronize_rcu(); - kfree(old_disk_conf); + kvfree_rcu(old_disk_conf); kfree(old_plan); mod_timer(&device->request_timer, jiffies + HZ); goto success; @@ -2443,8 +2433,7 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) mutex_unlock(&connection->resource->conf_update); mutex_unlock(&connection->data.mutex); - synchronize_rcu(); - kfree(old_net_conf); + kvfree_rcu(old_net_conf); if (connection->cstate >= C_WF_REPORT_PARAMS) { struct drbd_peer_device *peer_device; @@ -2502,6 +2491,7 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) struct drbd_resource *resource; struct drbd_connection *connection; enum drbd_ret_code retcode; + enum drbd_state_rv rv; int i; int err; @@ -2621,12 +2611,11 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) } rcu_read_unlock(); - retcode = (enum drbd_ret_code)conn_request_state(connection, - NS(conn, C_UNCONNECTED), CS_VERBOSE); + rv = conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE); conn_reconfig_done(connection); mutex_unlock(&adm_ctx.resource->adm_mutex); - drbd_adm_finish(&adm_ctx, info, retcode); + drbd_adm_finish(&adm_ctx, info, rv); return 0; fail: @@ -2734,11 +2723,12 @@ int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info) mutex_lock(&adm_ctx.resource->adm_mutex); rv = conn_try_disconnect(connection, parms.force_disconnect); - if (rv < SS_SUCCESS) - retcode = (enum drbd_ret_code)rv; - else - retcode = NO_ERROR; mutex_unlock(&adm_ctx.resource->adm_mutex); + if (rv < SS_SUCCESS) { + drbd_adm_finish(&adm_ctx, info, rv); + return 0; + } + retcode = NO_ERROR; fail: drbd_adm_finish(&adm_ctx, info, retcode); return 0; @@ -2857,8 +2847,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) new_disk_conf->disk_size = (sector_t)rs.resize_size; rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf); mutex_unlock(&device->resource->conf_update); - synchronize_rcu(); - kfree(old_disk_conf); + kvfree_rcu(old_disk_conf); new_disk_conf = NULL; } diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 08da922f81d1..6762be53f409 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -364,7 +364,7 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto struct drbd_device *device = peer_device->device; struct drbd_peer_request *peer_req; struct page *page = NULL; - unsigned nr_pages = (payload_size + PAGE_SIZE -1) >> PAGE_SHIFT; + unsigned int nr_pages = PFN_UP(payload_size); if (drbd_insert_fault(device, DRBD_FAULT_AL_EE)) return NULL; @@ -1511,7 +1511,6 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, unsigned int nr_sectors, int flags) { struct block_device *bdev = device->ldev->backing_bdev; - struct request_queue *q = bdev_get_queue(bdev); sector_t tmp, nr; unsigned int max_discard_sectors, granularity; int alignment; @@ -1521,10 +1520,10 @@ int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, u goto zero_out; /* Zero-sector (unknown) and one-sector granularities are the same. */ - granularity = max(q->limits.discard_granularity >> 9, 1U); + granularity = max(bdev_discard_granularity(bdev) >> 9, 1U); alignment = (bdev_discard_alignment(bdev) >> 9) % granularity; - max_discard_sectors = min(q->limits.max_discard_sectors, (1U << 22)); + max_discard_sectors = min(bdev_max_discard_sectors(bdev), (1U << 22)); max_discard_sectors -= max_discard_sectors % granularity; if (unlikely(!max_discard_sectors)) goto zero_out; @@ -1548,7 +1547,8 @@ int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, u start = tmp; } while (nr_sectors >= max_discard_sectors) { - err |= blkdev_issue_discard(bdev, start, max_discard_sectors, GFP_NOIO, 0); + err |= blkdev_issue_discard(bdev, start, max_discard_sectors, + GFP_NOIO); nr_sectors -= max_discard_sectors; start += max_discard_sectors; } @@ -1560,7 +1560,7 @@ int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, u nr = nr_sectors; nr -= (unsigned int)nr % granularity; if (nr) { - err |= blkdev_issue_discard(bdev, start, nr, GFP_NOIO, 0); + err |= blkdev_issue_discard(bdev, start, nr, GFP_NOIO); nr_sectors -= nr; start += nr; } @@ -1575,11 +1575,10 @@ int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, u static bool can_do_reliable_discards(struct drbd_device *device) { - struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev); struct disk_conf *dc; bool can_do; - if (!blk_queue_discard(q)) + if (!bdev_max_discard_sectors(device->ldev->backing_bdev)) return false; rcu_read_lock(); @@ -1629,9 +1628,9 @@ int drbd_submit_peer_request(struct drbd_device *device, struct bio *bio; struct page *page = peer_req->pages; sector_t sector = peer_req->i.sector; - unsigned data_size = peer_req->i.size; - unsigned n_bios = 0; - unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT; + unsigned int data_size = peer_req->i.size; + unsigned int n_bios = 0; + unsigned int nr_pages = PFN_UP(data_size); /* TRIM/DISCARD: for now, always use the helper function * blkdev_issue_zeroout(..., discard=true). @@ -3751,8 +3750,7 @@ static int receive_protocol(struct drbd_connection *connection, struct packet_in drbd_info(connection, "peer data-integrity-alg: %s\n", integrity_alg[0] ? integrity_alg : "(none)"); - synchronize_rcu(); - kfree(old_net_conf); + kvfree_rcu(old_net_conf); return 0; disconnect_rcu_unlock: @@ -3903,7 +3901,6 @@ static int receive_SyncParam(struct drbd_connection *connection, struct packet_i drbd_err(device, "verify-alg of wrong size, " "peer wants %u, accepting only up to %u byte\n", data_size, SHARED_SECRET_MAX); - err = -EIO; goto reconnect; } @@ -4121,8 +4118,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf); mutex_unlock(&connection->resource->conf_update); - synchronize_rcu(); - kfree(old_disk_conf); + kvfree_rcu(old_disk_conf); drbd_info(device, "Peer sets u_size to %lu sectors (old: %lu)\n", (unsigned long)p_usize, (unsigned long)my_usize); diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 75be0e16770a..e64bcfba30ef 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -922,7 +922,7 @@ static bool remote_due_to_read_balancing(struct drbd_device *device, sector_t se switch (rbm) { case RB_CONGESTED_REMOTE: - return 0; + return false; case RB_LEAST_PENDING: return atomic_read(&device->local_cnt) > atomic_read(&device->ap_pending_cnt) + atomic_read(&device->rs_pending_cnt); diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index 4ee11aef6672..3f7bf9f2d874 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -2071,8 +2071,7 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused) conn_free_crypto(connection); mutex_unlock(&connection->resource->conf_update); - synchronize_rcu(); - kfree(old_conf); + kvfree_rcu(old_conf); } if (ns_max.susp_fen) { diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 0f9956f4e9c4..af3051dd8912 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1030,7 +1030,7 @@ static void move_to_net_ee_or_free(struct drbd_device *device, struct drbd_peer_ { if (drbd_peer_req_has_active_page(peer_req)) { /* This might happen if sendpage() has not finished */ - int i = (peer_req->i.size + PAGE_SIZE -1) >> PAGE_SHIFT; + int i = PFN_UP(peer_req->i.size); atomic_add(i, &device->pp_in_use_by_net); atomic_sub(i, &device->pp_in_use); spin_lock_irq(&device->resource->req_lock); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index a58595f5ee2c..e2cb51810e89 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1,54 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* - * linux/drivers/block/loop.c - * - * Written by Theodore Ts'o, 3/29/93 - * - * Copyright 1993 by Theodore Ts'o. Redistribution of this file is - * permitted under the GNU General Public License. - * - * DES encryption plus some minor changes by Werner Almesberger, 30-MAY-1993 - * more DES encryption plus IDEA encryption by Nicholas J. Leon, June 20, 1996 - * - * Modularized and updated for 1.1.16 kernel - Mitch Dsouza 28th May 1994 - * Adapted for 1.3.59 kernel - Andries Brouwer, 1 Feb 1996 - * - * Fixed do_loop_request() re-entrancy - Vincent.Renardias@waw.com Mar 20, 1997 - * - * Added devfs support - Richard Gooch 16-Jan-1998 - * - * Handle sparse backing files correctly - Kenn Humborg, Jun 28, 1998 - * - * Loadable modules and other fixes by AK, 1998 - * - * Make real block number available to downstream transfer functions, enables - * CBC (and relatives) mode encryption requiring unique IVs per data block. - * Reed H. Petty, rhp@draper.net - * - * Maximum number of loop devices now dynamic via max_loop module parameter. - * Russell Kroll 19990701 - * - * Maximum number of loop devices when compiled-in now selectable by passing - * max_loop=<1-255> to the kernel on boot. - * Erik I. Bolsø, , Oct 31, 1999 - * - * Completely rewrite request handling to be make_request_fn style and - * non blocking, pushing work to a helper thread. Lots of fixes from - * Al Viro too. - * Jens Axboe , Nov 2000 - * - * Support up to 256 loop devices - * Heinz Mauelshagen , Feb 2002 - * - * Support for falling back on the write file operation when the address space - * operations write_begin is not available on the backing filesystem. - * Anton Altaparmakov, 16 Feb 2005 - * - * Still To Fix: - * - Advisory locking is ignored here. - * - Should use an own CAP_* category instead of CAP_SYS_ADMIN - * + * Copyright 1993 by Theodore Ts'o. */ - #include #include #include @@ -59,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -80,10 +32,62 @@ #include #include #include - -#include "loop.h" - #include +#include +#include +#include + +/* Possible states of device */ +enum { + Lo_unbound, + Lo_bound, + Lo_rundown, + Lo_deleting, +}; + +struct loop_func_table; + +struct loop_device { + int lo_number; + loff_t lo_offset; + loff_t lo_sizelimit; + int lo_flags; + char lo_file_name[LO_NAME_SIZE]; + + struct file * lo_backing_file; + struct block_device *lo_device; + + gfp_t old_gfp_mask; + + spinlock_t lo_lock; + int lo_state; + spinlock_t lo_work_lock; + struct workqueue_struct *workqueue; + struct work_struct rootcg_work; + struct list_head rootcg_cmd_list; + struct list_head idle_worker_list; + struct rb_root worker_tree; + struct timer_list timer; + bool use_dio; + bool sysfs_inited; + + struct request_queue *lo_queue; + struct blk_mq_tag_set tag_set; + struct gendisk *lo_disk; + struct mutex lo_mutex; + bool idr_visible; +}; + +struct loop_cmd { + struct list_head list_entry; + bool use_aio; /* use AIO interface to handle I/O */ + atomic_t ref; /* only for aio */ + long ret; + struct kiocb iocb; + struct bio_vec *bvec; + struct cgroup_subsys_state *blkcg_css; + struct cgroup_subsys_state *memcg_css; +}; #define LOOP_IDLE_WORKER_TIMEOUT (60 * HZ) #define LOOP_DEFAULT_HW_Q_DEPTH (128) @@ -314,15 +318,12 @@ static int lo_fallocate(struct loop_device *lo, struct request *rq, loff_t pos, mode |= FALLOC_FL_KEEP_SIZE; - if (!blk_queue_discard(lo->lo_queue)) { - ret = -EOPNOTSUPP; - goto out; - } + if (!bdev_max_discard_sectors(lo->lo_device)) + return -EOPNOTSUPP; ret = file->f_op->fallocate(file, mode, pos, blk_rq_bytes(rq)); if (unlikely(ret && ret != -EINVAL && ret != -EOPNOTSUPP)) - ret = -EIO; - out: + return -EIO; return ret; } @@ -572,6 +573,10 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, if (!file) return -EBADF; + + /* suppress uevents while reconfiguring the device */ + dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 1); + is_loop = is_loop_device(file); error = loop_global_lock_killable(lo, is_loop); if (error) @@ -626,13 +631,18 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, fput(old_file); if (partscan) loop_reread_partitions(lo); - return 0; + + error = 0; +done: + /* enable and uncork uevent now that we are done */ + dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0); + return error; out_err: loop_global_unlock(lo, is_loop); out_putf: fput(file); - return error; + goto done; } /* loop sysfs attributes */ @@ -762,7 +772,7 @@ static void loop_config_discard(struct loop_device *lo) struct request_queue *backingq = bdev_get_queue(I_BDEV(inode)); max_discard_sectors = backingq->limits.max_write_zeroes_sectors; - granularity = backingq->limits.discard_granularity ?: + granularity = bdev_discard_granularity(I_BDEV(inode)) ?: queue_physical_block_size(backingq); /* @@ -787,14 +797,11 @@ static void loop_config_discard(struct loop_device *lo) q->limits.discard_granularity = granularity; blk_queue_max_discard_sectors(q, max_discard_sectors); blk_queue_max_write_zeroes_sectors(q, max_discard_sectors); - blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); } else { q->limits.discard_granularity = 0; blk_queue_max_discard_sectors(q, 0); blk_queue_max_write_zeroes_sectors(q, 0); - blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q); } - q->limits.discard_alignment = 0; } struct loop_worker { @@ -808,8 +815,6 @@ struct loop_worker { }; static void loop_workfn(struct work_struct *work); -static void loop_rootcg_workfn(struct work_struct *work); -static void loop_free_idle_workers(struct timer_list *timer); #ifdef CONFIG_BLK_CGROUP static inline int queue_on_root_worker(struct cgroup_subsys_state *css) @@ -893,6 +898,39 @@ queue_work: spin_unlock_irq(&lo->lo_work_lock); } +static void loop_set_timer(struct loop_device *lo) +{ + timer_reduce(&lo->timer, jiffies + LOOP_IDLE_WORKER_TIMEOUT); +} + +static void loop_free_idle_workers(struct loop_device *lo, bool delete_all) +{ + struct loop_worker *pos, *worker; + + spin_lock_irq(&lo->lo_work_lock); + list_for_each_entry_safe(worker, pos, &lo->idle_worker_list, + idle_list) { + if (!delete_all && + time_is_after_jiffies(worker->last_ran_at + + LOOP_IDLE_WORKER_TIMEOUT)) + break; + list_del(&worker->idle_list); + rb_erase(&worker->rb_node, &lo->worker_tree); + css_put(worker->blkcg_css); + kfree(worker); + } + if (!list_empty(&lo->idle_worker_list)) + loop_set_timer(lo); + spin_unlock_irq(&lo->lo_work_lock); +} + +static void loop_free_idle_workers_timer(struct timer_list *timer) +{ + struct loop_device *lo = container_of(timer, struct loop_device, timer); + + return loop_free_idle_workers(lo, false); +} + static void loop_update_rotational(struct loop_device *lo) { struct file *file = lo->lo_backing_file; @@ -903,7 +941,7 @@ static void loop_update_rotational(struct loop_device *lo) /* not all filesystems (e.g. tmpfs) have a sb->s_bdev */ if (file_bdev) - nonrot = blk_queue_nonrot(bdev_get_queue(file_bdev)); + nonrot = bdev_nonrot(file_bdev); if (nonrot) blk_queue_flag_set(QUEUE_FLAG_NONROT, q); @@ -967,6 +1005,9 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, /* This is safe, since we have a reference from open(). */ __module_get(THIS_MODULE); + /* suppress uevents while reconfiguring the device */ + dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 1); + /* * If we don't hold exclusive handle for the device, upgrade to it * here to avoid changing device under exclusive owner. @@ -1011,24 +1052,19 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, !file->f_op->write_iter) lo->lo_flags |= LO_FLAGS_READ_ONLY; - lo->workqueue = alloc_workqueue("loop%d", - WQ_UNBOUND | WQ_FREEZABLE, - 0, - lo->lo_number); if (!lo->workqueue) { - error = -ENOMEM; - goto out_unlock; + lo->workqueue = alloc_workqueue("loop%d", + WQ_UNBOUND | WQ_FREEZABLE, + 0, lo->lo_number); + if (!lo->workqueue) { + error = -ENOMEM; + goto out_unlock; + } } disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE); set_disk_ro(lo->lo_disk, (lo->lo_flags & LO_FLAGS_READ_ONLY) != 0); - INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn); - INIT_LIST_HEAD(&lo->rootcg_cmd_list); - INIT_LIST_HEAD(&lo->idle_worker_list); - lo->worker_tree = RB_ROOT; - timer_setup(&lo->timer, loop_free_idle_workers, - TIMER_DEFERRABLE); lo->use_dio = lo->lo_flags & LO_FLAGS_DIRECT_IO; lo->lo_device = bdev; lo->lo_backing_file = file; @@ -1073,7 +1109,12 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, loop_reread_partitions(lo); if (!(mode & FMODE_EXCL)) bd_abort_claiming(bdev, loop_configure); - return 0; + + error = 0; +done: + /* enable and uncork uevent now that we are done */ + dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0); + return error; out_unlock: loop_global_unlock(lo, is_loop); @@ -1084,53 +1125,24 @@ out_putf: fput(file); /* This is safe: open() is still holding a reference. */ module_put(THIS_MODULE); - return error; + goto done; } static void __loop_clr_fd(struct loop_device *lo, bool release) { struct file *filp; gfp_t gfp = lo->old_gfp_mask; - struct loop_worker *pos, *worker; - - /* - * Flush loop_configure() and loop_change_fd(). It is acceptable for - * loop_validate_file() to succeed, for actual clear operation has not - * started yet. - */ - mutex_lock(&loop_validate_mutex); - mutex_unlock(&loop_validate_mutex); - /* - * loop_validate_file() now fails because l->lo_state != Lo_bound - * became visible. - */ - - /* - * Since this function is called upon "ioctl(LOOP_CLR_FD)" xor "close() - * after ioctl(LOOP_CLR_FD)", it is a sign of something going wrong if - * lo->lo_state has changed while waiting for lo->lo_mutex. - */ - mutex_lock(&lo->lo_mutex); - BUG_ON(lo->lo_state != Lo_rundown); - mutex_unlock(&lo->lo_mutex); if (test_bit(QUEUE_FLAG_WC, &lo->lo_queue->queue_flags)) blk_queue_write_cache(lo->lo_queue, false, false); - /* freeze request queue during the transition */ - blk_mq_freeze_queue(lo->lo_queue); - - destroy_workqueue(lo->workqueue); - spin_lock_irq(&lo->lo_work_lock); - list_for_each_entry_safe(worker, pos, &lo->idle_worker_list, - idle_list) { - list_del(&worker->idle_list); - rb_erase(&worker->rb_node, &lo->worker_tree); - css_put(worker->blkcg_css); - kfree(worker); - } - spin_unlock_irq(&lo->lo_work_lock); - del_timer_sync(&lo->timer); + /* + * Freeze the request queue when unbinding on a live file descriptor and + * thus an open device. When called from ->release we are guaranteed + * that there is no I/O in progress already. + */ + if (!release) + blk_mq_freeze_queue(lo->lo_queue); spin_lock_irq(&lo->lo_lock); filp = lo->lo_backing_file; @@ -1151,7 +1163,8 @@ static void __loop_clr_fd(struct loop_device *lo, bool release) mapping_set_gfp_mask(filp->f_mapping, gfp); /* This is safe: open() is still holding a reference. */ module_put(THIS_MODULE); - blk_mq_unfreeze_queue(lo->lo_queue); + if (!release) + blk_mq_unfreeze_queue(lo->lo_queue); disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE); @@ -1202,11 +1215,20 @@ static int loop_clr_fd(struct loop_device *lo) { int err; - err = mutex_lock_killable(&lo->lo_mutex); + /* + * Since lo_ioctl() is called without locks held, it is possible that + * loop_configure()/loop_change_fd() and loop_clr_fd() run in parallel. + * + * Therefore, use global lock when setting Lo_rundown state in order to + * make sure that loop_validate_file() will fail if the "struct file" + * which loop_configure()/loop_change_fd() found via fget() was this + * loop device. + */ + err = loop_global_lock_killable(lo, true); if (err) return err; if (lo->lo_state != Lo_bound) { - mutex_unlock(&lo->lo_mutex); + loop_global_unlock(lo, true); return -ENXIO; } /* @@ -1219,13 +1241,13 @@ static int loop_clr_fd(struct loop_device *lo) * /do something like mkfs/losetup -d causing the losetup -d * command to fail with EBUSY. */ - if (atomic_read(&lo->lo_refcnt) > 1) { + if (disk_openers(lo->lo_disk) > 1) { lo->lo_flags |= LO_FLAGS_AUTOCLEAR; - mutex_unlock(&lo->lo_mutex); + loop_global_unlock(lo, true); return 0; } lo->lo_state = Lo_rundown; - mutex_unlock(&lo->lo_mutex); + loop_global_unlock(lo, true); __loop_clr_fd(lo, false); return 0; @@ -1257,15 +1279,6 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) /* I/O need to be drained during transfer transition */ blk_mq_freeze_queue(lo->lo_queue); - if (size_changed && lo->lo_device->bd_inode->i_mapping->nrpages) { - /* If any pages were dirtied after invalidate_bdev(), try again */ - err = -EAGAIN; - pr_warn("%s: loop%d (%s) still has dirty pages (nrpages=%lu)\n", - __func__, lo->lo_number, lo->lo_file_name, - lo->lo_device->bd_inode->i_mapping->nrpages); - goto out_unfreeze; - } - prev_lo_flags = lo->lo_flags; err = loop_set_status_from_info(lo, info); @@ -1476,21 +1489,10 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg) invalidate_bdev(lo->lo_device); blk_mq_freeze_queue(lo->lo_queue); - - /* invalidate_bdev should have truncated all the pages */ - if (lo->lo_device->bd_inode->i_mapping->nrpages) { - err = -EAGAIN; - pr_warn("%s: loop%d (%s) still has dirty pages (nrpages=%lu)\n", - __func__, lo->lo_number, lo->lo_file_name, - lo->lo_device->bd_inode->i_mapping->nrpages); - goto out_unfreeze; - } - blk_queue_logical_block_size(lo->lo_queue, arg); blk_queue_physical_block_size(lo->lo_queue, arg); blk_queue_io_min(lo->lo_queue, arg); loop_update_dio(lo); -out_unfreeze: blk_mq_unfreeze_queue(lo->lo_queue); return err; @@ -1720,33 +1722,15 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode, } #endif -static int lo_open(struct block_device *bdev, fmode_t mode) -{ - struct loop_device *lo = bdev->bd_disk->private_data; - int err; - - err = mutex_lock_killable(&lo->lo_mutex); - if (err) - return err; - if (lo->lo_state == Lo_deleting) - err = -ENXIO; - else - atomic_inc(&lo->lo_refcnt); - mutex_unlock(&lo->lo_mutex); - return err; -} - static void lo_release(struct gendisk *disk, fmode_t mode) { struct loop_device *lo = disk->private_data; - mutex_lock(&lo->lo_mutex); - if (atomic_dec_return(&lo->lo_refcnt)) - goto out_unlock; + if (disk_openers(disk) > 0) + return; - if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) { - if (lo->lo_state != Lo_bound) - goto out_unlock; + mutex_lock(&lo->lo_mutex); + if (lo->lo_state == Lo_bound && (lo->lo_flags & LO_FLAGS_AUTOCLEAR)) { lo->lo_state = Lo_rundown; mutex_unlock(&lo->lo_mutex); /* @@ -1755,27 +1739,30 @@ static void lo_release(struct gendisk *disk, fmode_t mode) */ __loop_clr_fd(lo, true); return; - } else if (lo->lo_state == Lo_bound) { - /* - * Otherwise keep thread (if running) and config, - * but flush possible ongoing bios in thread. - */ - blk_mq_freeze_queue(lo->lo_queue); - blk_mq_unfreeze_queue(lo->lo_queue); } - -out_unlock: mutex_unlock(&lo->lo_mutex); } +static void lo_free_disk(struct gendisk *disk) +{ + struct loop_device *lo = disk->private_data; + + if (lo->workqueue) + destroy_workqueue(lo->workqueue); + loop_free_idle_workers(lo, true); + del_timer_sync(&lo->timer); + mutex_destroy(&lo->lo_mutex); + kfree(lo); +} + static const struct block_device_operations lo_fops = { .owner = THIS_MODULE, - .open = lo_open, .release = lo_release, .ioctl = lo_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = lo_compat_ioctl, #endif + .free_disk = lo_free_disk, }; /* @@ -1834,12 +1821,14 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, cmd->blkcg_css = NULL; cmd->memcg_css = NULL; #ifdef CONFIG_BLK_CGROUP - if (rq->bio && rq->bio->bi_blkg) { - cmd->blkcg_css = &bio_blkcg(rq->bio)->css; + if (rq->bio) { + cmd->blkcg_css = bio_blkcg_css(rq->bio); #ifdef CONFIG_MEMCG - cmd->memcg_css = - cgroup_get_e_css(cmd->blkcg_css->cgroup, - &memory_cgrp_subsys); + if (cmd->blkcg_css) { + cmd->memcg_css = + cgroup_get_e_css(cmd->blkcg_css->cgroup, + &memory_cgrp_subsys); + } #endif } #endif @@ -1888,11 +1877,6 @@ static void loop_handle_cmd(struct loop_cmd *cmd) } } -static void loop_set_timer(struct loop_device *lo) -{ - timer_reduce(&lo->timer, jiffies + LOOP_IDLE_WORKER_TIMEOUT); -} - static void loop_process_work(struct loop_worker *worker, struct list_head *cmd_list, struct loop_device *lo) { @@ -1941,27 +1925,6 @@ static void loop_rootcg_workfn(struct work_struct *work) loop_process_work(NULL, &lo->rootcg_cmd_list, lo); } -static void loop_free_idle_workers(struct timer_list *timer) -{ - struct loop_device *lo = container_of(timer, struct loop_device, timer); - struct loop_worker *pos, *worker; - - spin_lock_irq(&lo->lo_work_lock); - list_for_each_entry_safe(worker, pos, &lo->idle_worker_list, - idle_list) { - if (time_is_after_jiffies(worker->last_ran_at + - LOOP_IDLE_WORKER_TIMEOUT)) - break; - list_del(&worker->idle_list); - rb_erase(&worker->rb_node, &lo->worker_tree); - css_put(worker->blkcg_css); - kfree(worker); - } - if (!list_empty(&lo->idle_worker_list)) - loop_set_timer(lo); - spin_unlock_irq(&lo->lo_work_lock); -} - static const struct blk_mq_ops loop_mq_ops = { .queue_rq = loop_queue_rq, .complete = lo_complete_rq, @@ -1977,6 +1940,9 @@ static int loop_add(int i) lo = kzalloc(sizeof(*lo), GFP_KERNEL); if (!lo) goto out; + lo->worker_tree = RB_ROOT; + INIT_LIST_HEAD(&lo->idle_worker_list); + timer_setup(&lo->timer, loop_free_idle_workers_timer, TIMER_DEFERRABLE); lo->lo_state = Lo_unbound; err = mutex_lock_killable(&loop_ctl_mutex); @@ -2046,11 +2012,12 @@ static int loop_add(int i) */ if (!part_shift) disk->flags |= GENHD_FL_NO_PART; - atomic_set(&lo->lo_refcnt, 0); mutex_init(&lo->lo_mutex); lo->lo_number = i; spin_lock_init(&lo->lo_lock); spin_lock_init(&lo->lo_work_lock); + INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn); + INIT_LIST_HEAD(&lo->rootcg_cmd_list); disk->major = LOOP_MAJOR; disk->first_minor = i << part_shift; disk->minors = 1 << part_shift; @@ -2090,15 +2057,14 @@ static void loop_remove(struct loop_device *lo) { /* Make this loop device unreachable from pathname. */ del_gendisk(lo->lo_disk); - blk_cleanup_disk(lo->lo_disk); + blk_cleanup_queue(lo->lo_disk->queue); blk_mq_free_tag_set(&lo->tag_set); mutex_lock(&loop_ctl_mutex); idr_remove(&loop_index_idr, lo->lo_number); mutex_unlock(&loop_ctl_mutex); - /* There is no route which can find this loop device. */ - mutex_destroy(&lo->lo_mutex); - kfree(lo); + + put_disk(lo->lo_disk); } static void loop_probe(dev_t dev) @@ -2137,13 +2103,12 @@ static int loop_control_remove(int idx) ret = mutex_lock_killable(&lo->lo_mutex); if (ret) goto mark_visible; - if (lo->lo_state != Lo_unbound || - atomic_read(&lo->lo_refcnt) > 0) { + if (lo->lo_state != Lo_unbound || disk_openers(lo->lo_disk) > 0) { mutex_unlock(&lo->lo_mutex); ret = -EBUSY; goto mark_visible; } - /* Mark this loop device no longer open()-able. */ + /* Mark this loop device as no more bound, but not quite unbound yet */ lo->lo_state = Lo_deleting; mutex_unlock(&lo->lo_mutex); diff --git a/drivers/block/loop.h b/drivers/block/loop.h deleted file mode 100644 index 082d4b6bfc6a..000000000000 --- a/drivers/block/loop.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * loop.h - * - * Written by Theodore Ts'o, 3/29/93. - * - * Copyright 1993 by Theodore Ts'o. Redistribution of this file is - * permitted under the GNU General Public License. - */ -#ifndef _LINUX_LOOP_H -#define _LINUX_LOOP_H - -#include -#include -#include -#include -#include -#include - -/* Possible states of device */ -enum { - Lo_unbound, - Lo_bound, - Lo_rundown, - Lo_deleting, -}; - -struct loop_func_table; - -struct loop_device { - int lo_number; - atomic_t lo_refcnt; - loff_t lo_offset; - loff_t lo_sizelimit; - int lo_flags; - char lo_file_name[LO_NAME_SIZE]; - - struct file * lo_backing_file; - struct block_device *lo_device; - - gfp_t old_gfp_mask; - - spinlock_t lo_lock; - int lo_state; - spinlock_t lo_work_lock; - struct workqueue_struct *workqueue; - struct work_struct rootcg_work; - struct list_head rootcg_cmd_list; - struct list_head idle_worker_list; - struct rb_root worker_tree; - struct timer_list timer; - bool use_dio; - bool sysfs_inited; - - struct request_queue *lo_queue; - struct blk_mq_tag_set tag_set; - struct gendisk *lo_disk; - struct mutex lo_mutex; - bool idr_visible; -}; - -struct loop_cmd { - struct list_head list_entry; - bool use_aio; /* use AIO interface to handle I/O */ - atomic_t ref; /* only for aio */ - long ret; - struct kiocb iocb; - struct bio_vec *bvec; - struct cgroup_subsys_state *blkcg_css; - struct cgroup_subsys_state *memcg_css; -}; - -#endif diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 4fbaf0b4958b..27386a572ba4 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -2729,7 +2729,7 @@ static int mtip_dma_alloc(struct driver_data *dd) { struct mtip_port *port = dd->port; - /* Allocate dma memory for RX Fis, Identify, and Sector Bufffer */ + /* Allocate dma memory for RX Fis, Identify, and Sector Buffer */ port->block1 = dma_alloc_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ, &port->block1_dma, GFP_KERNEL); diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 5a1f98494ddd..ac8b045c777c 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -333,7 +333,6 @@ static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize, if (nbd->config->flags & NBD_FLAG_SEND_TRIM) { nbd->disk->queue->limits.discard_granularity = blksize; - nbd->disk->queue->limits.discard_alignment = blksize; blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX); } blk_queue_logical_block_size(nbd->disk->queue, blksize); @@ -947,11 +946,15 @@ static int wait_for_reconnect(struct nbd_device *nbd) struct nbd_config *config = nbd->config; if (!config->dead_conn_timeout) return 0; - if (test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags)) + + if (!wait_event_timeout(config->conn_wait, + test_bit(NBD_RT_DISCONNECTED, + &config->runtime_flags) || + atomic_read(&config->live_connections) > 0, + config->dead_conn_timeout)) return 0; - return wait_event_timeout(config->conn_wait, - atomic_read(&config->live_connections) > 0, - config->dead_conn_timeout) > 0; + + return !test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags); } static int nbd_handle_cmd(struct nbd_cmd *cmd, int index) @@ -1217,11 +1220,11 @@ static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg) return -ENOSPC; } -static void nbd_bdev_reset(struct block_device *bdev) +static void nbd_bdev_reset(struct nbd_device *nbd) { - if (bdev->bd_openers > 1) + if (disk_openers(nbd->disk) > 1) return; - set_capacity(bdev->bd_disk, 0); + set_capacity(nbd->disk, 0); } static void nbd_parse_flags(struct nbd_device *nbd) @@ -1231,8 +1234,6 @@ static void nbd_parse_flags(struct nbd_device *nbd) set_disk_ro(nbd->disk, true); else set_disk_ro(nbd->disk, false); - if (config->flags & NBD_FLAG_SEND_TRIM) - blk_queue_flag_set(QUEUE_FLAG_DISCARD, nbd->disk->queue); if (config->flags & NBD_FLAG_SEND_FLUSH) { if (config->flags & NBD_FLAG_SEND_FUA) blk_queue_write_cache(nbd->disk->queue, true, true); @@ -1318,9 +1319,7 @@ static void nbd_config_put(struct nbd_device *nbd) nbd->tag_set.timeout = 0; nbd->disk->queue->limits.discard_granularity = 0; - nbd->disk->queue->limits.discard_alignment = 0; - blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX); - blk_queue_flag_clear(QUEUE_FLAG_DISCARD, nbd->disk->queue); + blk_queue_max_discard_sectors(nbd->disk->queue, 0); mutex_unlock(&nbd->config_lock); nbd_put(nbd); @@ -1389,7 +1388,7 @@ static int nbd_start_device(struct nbd_device *nbd) return nbd_set_size(nbd, config->bytesize, nbd_blksize(config)); } -static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *bdev) +static int nbd_start_device_ioctl(struct nbd_device *nbd) { struct nbd_config *config = nbd->config; int ret; @@ -1408,7 +1407,7 @@ static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *b flush_workqueue(nbd->recv_workq); mutex_lock(&nbd->config_lock); - nbd_bdev_reset(bdev); + nbd_bdev_reset(nbd); /* user requested, ignore socket errors */ if (test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags)) ret = 0; @@ -1422,7 +1421,7 @@ static void nbd_clear_sock_ioctl(struct nbd_device *nbd, { sock_shutdown(nbd); __invalidate_device(bdev, true); - nbd_bdev_reset(bdev); + nbd_bdev_reset(nbd); if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF, &nbd->config->runtime_flags)) nbd_config_put(nbd); @@ -1468,7 +1467,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, config->flags = arg; return 0; case NBD_DO_IT: - return nbd_start_device_ioctl(nbd, bdev); + return nbd_start_device_ioctl(nbd); case NBD_CLEAR_QUE: /* * This is for compatibility only. The queue is always cleared @@ -1579,7 +1578,7 @@ static void nbd_release(struct gendisk *disk, fmode_t mode) struct nbd_device *nbd = disk->private_data; if (test_bit(NBD_RT_DISCONNECT_ON_CLOSE, &nbd->config->runtime_flags) && - disk->part0->bd_openers == 0) + disk_openers(disk) == 0) nbd_disconnect_and_put(nbd); nbd_config_put(nbd); @@ -1784,7 +1783,6 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs) blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue); disk->queue->limits.discard_granularity = 0; - disk->queue->limits.discard_alignment = 0; blk_queue_max_discard_sectors(disk->queue, 0); blk_queue_max_segment_size(disk->queue, UINT_MAX); blk_queue_max_segments(disk->queue, USHRT_MAX); @@ -2082,6 +2080,7 @@ static void nbd_disconnect_and_put(struct nbd_device *nbd) mutex_lock(&nbd->config_lock); nbd_disconnect(nbd); sock_shutdown(nbd); + wake_up(&nbd->config->conn_wait); /* * Make sure recv thread has finished, we can safely call nbd_clear_que() * to cancel the inflight I/Os. diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index c441a4972064..539cfeac263d 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -11,6 +11,9 @@ #include #include "null_blk.h" +#undef pr_fmt +#define pr_fmt(fmt) "null_blk: " fmt + #define FREE_BATCH 16 #define TICKS_PER_SEC 50ULL @@ -232,6 +235,7 @@ static struct nullb_device *null_alloc_dev(void); static void null_free_dev(struct nullb_device *dev); static void null_del_dev(struct nullb *nullb); static int null_add_dev(struct nullb_device *dev); +static struct nullb *null_find_dev_by_name(const char *name); static void null_free_device_storage(struct nullb_device *dev, bool is_cache); static inline struct nullb_device *to_nullb_device(struct config_item *item) @@ -560,6 +564,9 @@ config_item *nullb_group_make_item(struct config_group *group, const char *name) { struct nullb_device *dev; + if (null_find_dev_by_name(name)) + return ERR_PTR(-EEXIST); + dev = null_alloc_dev(); if (!dev) return ERR_PTR(-ENOMEM); @@ -1765,9 +1772,7 @@ static void null_config_discard(struct nullb *nullb) } nullb->q->limits.discard_granularity = nullb->dev->blocksize; - nullb->q->limits.discard_alignment = nullb->dev->blocksize; blk_queue_max_discard_sectors(nullb->q, UINT_MAX >> 9); - blk_queue_flag_set(QUEUE_FLAG_DISCARD, nullb->q); } static const struct block_device_operations null_bio_ops = { @@ -2061,7 +2066,13 @@ static int null_add_dev(struct nullb_device *dev) null_config_discard(nullb); - sprintf(nullb->disk_name, "nullb%d", nullb->index); + if (config_item_name(&dev->item)) { + /* Use configfs dir name as the device name */ + snprintf(nullb->disk_name, sizeof(nullb->disk_name), + "%s", config_item_name(&dev->item)); + } else { + sprintf(nullb->disk_name, "nullb%d", nullb->index); + } rv = null_gendisk_register(nullb); if (rv) @@ -2071,6 +2082,8 @@ static int null_add_dev(struct nullb_device *dev) list_add_tail(&nullb->list, &nullb_list); mutex_unlock(&lock); + pr_info("disk %s created\n", nullb->disk_name); + return 0; out_cleanup_zone: null_free_zoned_dev(dev); @@ -2088,12 +2101,53 @@ out: return rv; } +static struct nullb *null_find_dev_by_name(const char *name) +{ + struct nullb *nullb = NULL, *nb; + + mutex_lock(&lock); + list_for_each_entry(nb, &nullb_list, list) { + if (strcmp(nb->disk_name, name) == 0) { + nullb = nb; + break; + } + } + mutex_unlock(&lock); + + return nullb; +} + +static int null_create_dev(void) +{ + struct nullb_device *dev; + int ret; + + dev = null_alloc_dev(); + if (!dev) + return -ENOMEM; + + ret = null_add_dev(dev); + if (ret) { + null_free_dev(dev); + return ret; + } + + return 0; +} + +static void null_destroy_dev(struct nullb *nullb) +{ + struct nullb_device *dev = nullb->dev; + + null_del_dev(nullb); + null_free_dev(dev); +} + static int __init null_init(void) { int ret = 0; unsigned int i; struct nullb *nullb; - struct nullb_device *dev; if (g_bs > PAGE_SIZE) { pr_warn("invalid block size\n"); @@ -2113,19 +2167,21 @@ static int __init null_init(void) } if (g_queue_mode == NULL_Q_RQ) { - pr_err("legacy IO path no longer available\n"); + pr_err("legacy IO path is no longer available\n"); return -EINVAL; } + if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) { if (g_submit_queues != nr_online_nodes) { pr_warn("submit_queues param is set to %u.\n", - nr_online_nodes); + nr_online_nodes); g_submit_queues = nr_online_nodes; } - } else if (g_submit_queues > nr_cpu_ids) + } else if (g_submit_queues > nr_cpu_ids) { g_submit_queues = nr_cpu_ids; - else if (g_submit_queues <= 0) + } else if (g_submit_queues <= 0) { g_submit_queues = 1; + } if (g_queue_mode == NULL_Q_MQ && shared_tags) { ret = null_init_tag_set(NULL, &tag_set); @@ -2149,16 +2205,9 @@ static int __init null_init(void) } for (i = 0; i < nr_devices; i++) { - dev = null_alloc_dev(); - if (!dev) { - ret = -ENOMEM; + ret = null_create_dev(); + if (ret) goto err_dev; - } - ret = null_add_dev(dev); - if (ret) { - null_free_dev(dev); - goto err_dev; - } } pr_info("module loaded\n"); @@ -2167,9 +2216,7 @@ static int __init null_init(void) err_dev: while (!list_empty(&nullb_list)) { nullb = list_entry(nullb_list.next, struct nullb, list); - dev = nullb->dev; - null_del_dev(nullb); - null_free_dev(dev); + null_destroy_dev(nullb); } unregister_blkdev(null_major, "nullb"); err_conf: @@ -2190,12 +2237,8 @@ static void __exit null_exit(void) mutex_lock(&lock); while (!list_empty(&nullb_list)) { - struct nullb_device *dev; - nullb = list_entry(nullb_list.next, struct nullb, list); - dev = nullb->dev; - null_del_dev(nullb); - null_free_dev(dev); + null_destroy_dev(nullb); } mutex_unlock(&lock); diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h index 78eb56b0ca55..4525a65e1b23 100644 --- a/drivers/block/null_blk/null_blk.h +++ b/drivers/block/null_blk/null_blk.h @@ -16,13 +16,15 @@ #include struct nullb_cmd { - struct request *rq; - struct bio *bio; + union { + struct request *rq; + struct bio *bio; + }; unsigned int tag; blk_status_t error; + bool fake_timeout; struct nullb_queue *nq; struct hrtimer timer; - bool fake_timeout; }; struct nullb_queue { diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index dae54dd1aeac..ed158ea4fdd1 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -6,6 +6,9 @@ #define CREATE_TRACE_POINTS #include "trace.h" +#undef pr_fmt +#define pr_fmt(fmt) "null_blk: " fmt + static inline sector_t mb_to_sects(unsigned long mb) { return ((sector_t)mb * SZ_1M) >> SECTOR_SHIFT; @@ -75,8 +78,8 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) dev->zone_capacity = dev->zone_size; if (dev->zone_capacity > dev->zone_size) { - pr_err("null_blk: zone capacity (%lu MB) larger than zone size (%lu MB)\n", - dev->zone_capacity, dev->zone_size); + pr_err("zone capacity (%lu MB) larger than zone size (%lu MB)\n", + dev->zone_capacity, dev->zone_size); return -EINVAL; } diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 86c8794ede41..789093375344 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -12,7 +12,7 @@ * Theory of operation: * * At the lowest level, there is the standard driver for the CD/DVD device, - * typically ide-cd.c or sr.c. This driver can handle read and write requests, + * such as drivers/scsi/sr.c. This driver can handle read and write requests, * but it doesn't know anything about the special restrictions that apply to * packet writing. One restriction is that write requests must be aligned to * packet boundaries on the physical media, and the size of a write request @@ -522,7 +522,7 @@ static struct packet_data *pkt_alloc_packet_data(int frames) goto no_pkt; pkt->frames = frames; - pkt->w_bio = bio_kmalloc(GFP_KERNEL, frames); + pkt->w_bio = bio_kmalloc(frames, GFP_KERNEL); if (!pkt->w_bio) goto no_bio; @@ -536,27 +536,21 @@ static struct packet_data *pkt_alloc_packet_data(int frames) bio_list_init(&pkt->orig_bios); for (i = 0; i < frames; i++) { - struct bio *bio = bio_kmalloc(GFP_KERNEL, 1); - if (!bio) + pkt->r_bios[i] = bio_kmalloc(1, GFP_KERNEL); + if (!pkt->r_bios[i]) goto no_rd_bio; - - pkt->r_bios[i] = bio; } return pkt; no_rd_bio: - for (i = 0; i < frames; i++) { - struct bio *bio = pkt->r_bios[i]; - if (bio) - bio_put(bio); - } - + for (i = 0; i < frames; i++) + kfree(pkt->r_bios[i]); no_page: for (i = 0; i < frames / FRAMES_PER_PAGE; i++) if (pkt->pages[i]) __free_page(pkt->pages[i]); - bio_put(pkt->w_bio); + kfree(pkt->w_bio); no_bio: kfree(pkt); no_pkt: @@ -570,14 +564,11 @@ static void pkt_free_packet_data(struct packet_data *pkt) { int i; - for (i = 0; i < pkt->frames; i++) { - struct bio *bio = pkt->r_bios[i]; - if (bio) - bio_put(bio); - } + for (i = 0; i < pkt->frames; i++) + kfree(pkt->r_bios[i]); for (i = 0; i < pkt->frames / FRAMES_PER_PAGE; i++) __free_page(pkt->pages[i]); - bio_put(pkt->w_bio); + kfree(pkt->w_bio); kfree(pkt); } @@ -951,6 +942,7 @@ static void pkt_end_io_read(struct bio *bio) if (bio->bi_status) atomic_inc(&pkt->io_errors); + bio_uninit(bio); if (atomic_dec_and_test(&pkt->io_wait)) { atomic_inc(&pkt->run_sm); wake_up(&pd->wqueue); @@ -968,6 +960,7 @@ static void pkt_end_io_packet_write(struct bio *bio) pd->stats.pkt_ended++; + bio_uninit(bio); pkt_bio_finished(pd); atomic_dec(&pkt->io_wait); atomic_inc(&pkt->run_sm); @@ -1022,7 +1015,7 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) continue; bio = pkt->r_bios[f]; - bio_reset(bio, pd->bdev, REQ_OP_READ); + bio_init(bio, pd->bdev, bio->bi_inline_vecs, 1, REQ_OP_READ); bio->bi_iter.bi_sector = pkt->sector + f * (CD_FRAMESIZE >> 9); bio->bi_end_io = pkt_end_io_read; bio->bi_private = pkt; @@ -1235,7 +1228,8 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) { int f; - bio_reset(pkt->w_bio, pd->bdev, REQ_OP_WRITE); + bio_init(pkt->w_bio, pd->bdev, pkt->w_bio->bi_inline_vecs, pkt->frames, + REQ_OP_WRITE); pkt->w_bio->bi_iter.bi_sector = pkt->sector; pkt->w_bio->bi_end_io = pkt_end_io_packet_write; pkt->w_bio->bi_private = pkt; diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index b844432bad20..2b21f717cce1 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4942,7 +4942,6 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) blk_queue_io_opt(q, rbd_dev->opts->alloc_size); if (rbd_dev->opts->trim) { - blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); q->limits.discard_granularity = rbd_dev->opts->alloc_size; blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT); blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT); diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index b66e8840b94b..409c76b81aed 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -25,6 +25,7 @@ static int rnbd_client_major; static DEFINE_IDA(index_ida); static DEFINE_MUTEX(sess_lock); static LIST_HEAD(sess_list); +static struct workqueue_struct *rnbd_clt_wq; /* * Maximum number of partitions an instance can have. @@ -1364,11 +1365,9 @@ static void setup_request_queue(struct rnbd_clt_dev *dev) blk_queue_max_discard_sectors(dev->queue, dev->max_discard_sectors); dev->queue->limits.discard_granularity = dev->discard_granularity; dev->queue->limits.discard_alignment = dev->discard_alignment; - if (dev->max_discard_sectors) - blk_queue_flag_set(QUEUE_FLAG_DISCARD, dev->queue); if (dev->secure_discard) - blk_queue_flag_set(QUEUE_FLAG_SECERASE, dev->queue); - + blk_queue_max_secure_erase_sectors(dev->queue, + dev->max_discard_sectors); blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, dev->queue); blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, dev->queue); blk_queue_max_segments(dev->queue, dev->max_segments); @@ -1761,12 +1760,12 @@ static void rnbd_destroy_sessions(void) * procedure takes minutes. */ INIT_WORK(&dev->unmap_on_rmmod_work, unmap_device_work); - queue_work(system_long_wq, &dev->unmap_on_rmmod_work); + queue_work(rnbd_clt_wq, &dev->unmap_on_rmmod_work); } rnbd_clt_put_sess(sess); } /* Wait for all scheduled unmap works */ - flush_workqueue(system_long_wq); + flush_workqueue(rnbd_clt_wq); WARN_ON(!list_empty(&sess_list)); } @@ -1791,6 +1790,14 @@ static int __init rnbd_client_init(void) pr_err("Failed to load module, creating sysfs device files failed, err: %d\n", err); unregister_blkdev(rnbd_client_major, "rnbd"); + return err; + } + rnbd_clt_wq = alloc_workqueue("rnbd_clt_wq", 0, 0); + if (!rnbd_clt_wq) { + pr_err("Failed to load module, alloc_workqueue failed.\n"); + rnbd_clt_destroy_sysfs_files(); + unregister_blkdev(rnbd_client_major, "rnbd"); + err = -ENOMEM; } return err; @@ -1801,6 +1808,7 @@ static void __exit rnbd_client_exit(void) rnbd_destroy_sessions(); unregister_blkdev(rnbd_client_major, "rnbd"); ida_destroy(&index_ida); + destroy_workqueue(rnbd_clt_wq); } module_init(rnbd_client_init); diff --git a/drivers/block/rnbd/rnbd-srv-dev.h b/drivers/block/rnbd/rnbd-srv-dev.h index 2c3df02b5e8e..4309e5252469 100644 --- a/drivers/block/rnbd/rnbd-srv-dev.h +++ b/drivers/block/rnbd/rnbd-srv-dev.h @@ -44,16 +44,12 @@ static inline int rnbd_dev_get_max_hw_sects(const struct rnbd_dev *dev) static inline int rnbd_dev_get_secure_discard(const struct rnbd_dev *dev) { - return blk_queue_secure_erase(bdev_get_queue(dev->bdev)); + return bdev_max_secure_erase_sectors(dev->bdev); } static inline int rnbd_dev_get_max_discard_sects(const struct rnbd_dev *dev) { - if (!blk_queue_discard(bdev_get_queue(dev->bdev))) - return 0; - - return blk_queue_get_max_sectors(bdev_get_queue(dev->bdev), - REQ_OP_DISCARD); + return bdev_max_discard_sectors(dev->bdev); } static inline int rnbd_dev_get_discard_granularity(const struct rnbd_dev *dev) @@ -63,7 +59,7 @@ static inline int rnbd_dev_get_discard_granularity(const struct rnbd_dev *dev) static inline int rnbd_dev_get_discard_alignment(const struct rnbd_dev *dev) { - return bdev_get_queue(dev->bdev)->limits.discard_alignment; + return bdev_discard_alignment(dev->bdev); } #endif /* RNBD_SRV_DEV_H */ diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index f04df6294650..beaef43a67b9 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -533,7 +533,6 @@ static void rnbd_srv_fill_msg_open_rsp(struct rnbd_msg_open_rsp *rsp, struct rnbd_srv_sess_dev *sess_dev) { struct rnbd_dev *rnbd_dev = sess_dev->rnbd_dev; - struct request_queue *q = bdev_get_queue(rnbd_dev->bdev); rsp->hdr.type = cpu_to_le16(RNBD_MSG_OPEN_RSP); rsp->device_id = @@ -558,9 +557,9 @@ static void rnbd_srv_fill_msg_open_rsp(struct rnbd_msg_open_rsp *rsp, rsp->secure_discard = cpu_to_le16(rnbd_dev_get_secure_discard(rnbd_dev)); rsp->cache_policy = 0; - if (test_bit(QUEUE_FLAG_WC, &q->queue_flags)) + if (bdev_write_cache(rnbd_dev->bdev)) rsp->cache_policy |= RNBD_WRITEBACK; - if (blk_queue_fua(q)) + if (bdev_fua(rnbd_dev->bdev)) rsp->cache_policy |= RNBD_FUA; } diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index a8bcf3f664af..d624cc8eddc3 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -867,11 +867,12 @@ static int virtblk_probe(struct virtio_device *vdev) blk_queue_io_opt(q, blk_size * opt_io_size); if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) { - q->limits.discard_granularity = blk_size; - virtio_cread(vdev, struct virtio_blk_config, discard_sector_alignment, &v); - q->limits.discard_alignment = v ? v << SECTOR_SHIFT : 0; + if (v) + q->limits.discard_granularity = v << SECTOR_SHIFT; + else + q->limits.discard_granularity = blk_size; virtio_cread(vdev, struct virtio_blk_config, max_discard_sectors, &v); @@ -888,8 +889,6 @@ static int virtblk_probe(struct virtio_device *vdev) v = sg_elems; blk_queue_max_discard_segments(q, min(v, MAX_DISCARD_SEGMENTS)); - - blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); } if (virtio_has_feature(vdev, VIRTIO_BLK_F_WRITE_ZEROES)) { diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index de42458195bc..a97f2bf5b01b 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -970,7 +970,6 @@ static int dispatch_discard_io(struct xen_blkif_ring *ring, int status = BLKIF_RSP_OKAY; struct xen_blkif *blkif = ring->blkif; struct block_device *bdev = blkif->vbd.bdev; - unsigned long secure; struct phys_req preq; xen_blkif_get(blkif); @@ -987,13 +986,15 @@ static int dispatch_discard_io(struct xen_blkif_ring *ring, } ring->st_ds_req++; - secure = (blkif->vbd.discard_secure && - (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ? - BLKDEV_DISCARD_SECURE : 0; + if (blkif->vbd.discard_secure && + (req->u.discard.flag & BLKIF_DISCARD_SECURE)) + err = blkdev_issue_secure_erase(bdev, + req->u.discard.sector_number, + req->u.discard.nr_sectors, GFP_KERNEL); + else + err = blkdev_issue_discard(bdev, req->u.discard.sector_number, + req->u.discard.nr_sectors, GFP_KERNEL); - err = blkdev_issue_discard(bdev, req->u.discard.sector_number, - req->u.discard.nr_sectors, - GFP_KERNEL, secure); fail_response: if (err == -EOPNOTSUPP) { pr_debug("discard op failed, not supported\n"); diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index f09040435e2e..97de13b14175 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -484,7 +484,6 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle, { struct xen_vbd *vbd; struct block_device *bdev; - struct request_queue *q; vbd = &blkif->vbd; vbd->handle = handle; @@ -516,11 +515,9 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle, if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE) vbd->type |= VDISK_REMOVABLE; - q = bdev_get_queue(bdev); - if (q && test_bit(QUEUE_FLAG_WC, &q->queue_flags)) + if (bdev_write_cache(bdev)) vbd->flush_support = true; - - if (q && blk_queue_secure_erase(q)) + if (bdev_max_secure_erase_sectors(bdev)) vbd->discard_secure = true; vbd->feature_gnt_persistent = feature_persistent; @@ -578,22 +575,21 @@ static void xen_blkbk_discard(struct xenbus_transaction xbt, struct backend_info int err; int state = 0; struct block_device *bdev = be->blkif->vbd.bdev; - struct request_queue *q = bdev_get_queue(bdev); if (!xenbus_read_unsigned(dev->nodename, "discard-enable", 1)) return; - if (blk_queue_discard(q)) { + if (bdev_max_discard_sectors(bdev)) { err = xenbus_printf(xbt, dev->nodename, "discard-granularity", "%u", - q->limits.discard_granularity); + bdev_discard_granularity(bdev)); if (err) { dev_warn(&dev->dev, "writing discard-granularity (%d)", err); return; } err = xenbus_printf(xbt, dev->nodename, "discard-alignment", "%u", - q->limits.discard_alignment); + bdev_discard_alignment(bdev)); if (err) { dev_warn(&dev->dev, "writing discard-alignment (%d)", err); return; diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 003056d4f7f5..0f3f5238f7bc 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -944,13 +944,13 @@ static void blkif_set_queue_limits(struct blkfront_info *info) blk_queue_flag_set(QUEUE_FLAG_VIRT, rq); if (info->feature_discard) { - blk_queue_flag_set(QUEUE_FLAG_DISCARD, rq); blk_queue_max_discard_sectors(rq, get_capacity(gd)); rq->limits.discard_granularity = info->discard_granularity ?: info->physical_sector_size; rq->limits.discard_alignment = info->discard_alignment; if (info->feature_secdiscard) - blk_queue_flag_set(QUEUE_FLAG_SECERASE, rq); + blk_queue_max_secure_erase_sectors(rq, + get_capacity(gd)); } /* Hard sector size and max sectors impersonate the equiv. hardware. */ @@ -1606,8 +1606,8 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) blkif_req(req)->error = BLK_STS_NOTSUPP; info->feature_discard = 0; info->feature_secdiscard = 0; - blk_queue_flag_clear(QUEUE_FLAG_DISCARD, rq); - blk_queue_flag_clear(QUEUE_FLAG_SECERASE, rq); + blk_queue_max_discard_sectors(rq, 0); + blk_queue_max_secure_erase_sectors(rq, 0); } break; case BLKIF_OP_FLUSH_DISKCACHE: diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index e9474b02012d..6853dd3c7d3a 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1675,9 +1675,10 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector, bv.bv_len = PAGE_SIZE; bv.bv_offset = 0; - start_time = disk_start_io_acct(bdev->bd_disk, SECTORS_PER_PAGE, op); + start_time = bdev_start_io_acct(bdev->bd_disk->part0, + SECTORS_PER_PAGE, op, jiffies); ret = zram_bvec_rw(zram, &bv, index, offset, op, NULL); - disk_end_io_acct(bdev->bd_disk, op, start_time); + bdev_end_io_acct(bdev->bd_disk->part0, op, start_time); out: /* * If I/O fails, just return error(ie, non-zero) without @@ -1786,7 +1787,7 @@ static ssize_t reset_store(struct device *dev, int ret; unsigned short do_reset; struct zram *zram; - struct block_device *bdev; + struct gendisk *disk; ret = kstrtou16(buf, 10, &do_reset); if (ret) @@ -1796,26 +1797,26 @@ static ssize_t reset_store(struct device *dev, return -EINVAL; zram = dev_to_zram(dev); - bdev = zram->disk->part0; + disk = zram->disk; - mutex_lock(&bdev->bd_disk->open_mutex); + mutex_lock(&disk->open_mutex); /* Do not reset an active device or claimed device */ - if (bdev->bd_openers || zram->claim) { - mutex_unlock(&bdev->bd_disk->open_mutex); + if (disk_openers(disk) || zram->claim) { + mutex_unlock(&disk->open_mutex); return -EBUSY; } /* From now on, anyone can't open /dev/zram[0-9] */ zram->claim = true; - mutex_unlock(&bdev->bd_disk->open_mutex); + mutex_unlock(&disk->open_mutex); /* Make sure all the pending I/O are finished */ - sync_blockdev(bdev); + sync_blockdev(disk->part0); zram_reset_device(zram); - mutex_lock(&bdev->bd_disk->open_mutex); + mutex_lock(&disk->open_mutex); zram->claim = false; - mutex_unlock(&bdev->bd_disk->open_mutex); + mutex_unlock(&disk->open_mutex); return len; } @@ -1952,7 +1953,6 @@ static int zram_add(void) blk_queue_io_opt(zram->disk->queue, PAGE_SIZE); zram->disk->queue->limits.discard_granularity = PAGE_SIZE; blk_queue_max_discard_sectors(zram->disk->queue, UINT_MAX); - blk_queue_flag_set(QUEUE_FLAG_DISCARD, zram->disk->queue); /* * zram_bio_discard() will clear all logical blocks if logical block @@ -1987,19 +1987,18 @@ out_free_dev: static int zram_remove(struct zram *zram) { - struct block_device *bdev = zram->disk->part0; bool claimed; - mutex_lock(&bdev->bd_disk->open_mutex); - if (bdev->bd_openers) { - mutex_unlock(&bdev->bd_disk->open_mutex); + mutex_lock(&zram->disk->open_mutex); + if (disk_openers(zram->disk)) { + mutex_unlock(&zram->disk->open_mutex); return -EBUSY; } claimed = zram->claim; if (!claimed) zram->claim = true; - mutex_unlock(&bdev->bd_disk->open_mutex); + mutex_unlock(&zram->disk->open_mutex); zram_debugfs_unregister(zram); @@ -2011,7 +2010,7 @@ static int zram_remove(struct zram *zram) ; } else { /* Make sure all the pending I/O are finished */ - sync_blockdev(bdev); + sync_blockdev(zram->disk->part0); zram_reset_device(zram); } diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 2dc9da683a13..416f723a2dbb 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -14,15 +14,6 @@ actually talk to the hardware. Suggestions are welcome. Patches that work are more welcome though. ;-) - To Do List: - ---------------------------------- - - -- Modify sysctl/proc interface. I plan on having one directory per - drive, with entries for outputing general drive information, and sysctl - based tunable parameters such as whether the tray should auto-close for - that drive. Suggestions (or patches) for this welcome! - - Revision History ---------------------------------- 1.00 Date Unknown -- David van Leeuwen @@ -648,6 +639,7 @@ int register_cdrom(struct gendisk *disk, struct cdrom_device_info *cdi) mutex_unlock(&cdrom_mutex); return 0; } +EXPORT_SYMBOL(register_cdrom); #undef ENSURE void unregister_cdrom(struct cdrom_device_info *cdi) @@ -663,6 +655,7 @@ void unregister_cdrom(struct cdrom_device_info *cdi) cd_dbg(CD_REG_UNREG, "drive \"/dev/%s\" unregistered\n", cdi->name); } +EXPORT_SYMBOL(unregister_cdrom); int cdrom_get_media_event(struct cdrom_device_info *cdi, struct media_event_desc *med) @@ -690,6 +683,7 @@ int cdrom_get_media_event(struct cdrom_device_info *cdi, memcpy(med, &buffer[sizeof(*eh)], sizeof(*med)); return 0; } +EXPORT_SYMBOL(cdrom_get_media_event); static int cdrom_get_random_writable(struct cdrom_device_info *cdi, struct rwrt_feature_desc *rfd) @@ -1206,6 +1200,7 @@ err: cdi->use_count--; return ret; } +EXPORT_SYMBOL(cdrom_open); /* This code is similar to that in open_for_data. The routine is called whenever an audio play operation is requested. @@ -1301,6 +1296,7 @@ void cdrom_release(struct cdrom_device_info *cdi, fmode_t mode) cdo->tray_move(cdi, 1); } } +EXPORT_SYMBOL(cdrom_release); static int cdrom_read_mech_status(struct cdrom_device_info *cdi, struct cdrom_changer_info *buf) @@ -1382,6 +1378,7 @@ int cdrom_number_of_slots(struct cdrom_device_info *cdi) kfree(info); return nslots; } +EXPORT_SYMBOL(cdrom_number_of_slots); /* If SLOT < 0, unload the current slot. Otherwise, try to load SLOT. */ @@ -1581,6 +1578,7 @@ void init_cdrom_command(struct packet_command *cgc, void *buf, int len, cgc->data_direction = type; cgc->timeout = CDROM_DEF_TIMEOUT; } +EXPORT_SYMBOL(init_cdrom_command); /* DVD handling */ @@ -1999,6 +1997,7 @@ int cdrom_mode_sense(struct cdrom_device_info *cdi, cgc->data_direction = CGC_DATA_READ; return cdo->generic_packet(cdi, cgc); } +EXPORT_SYMBOL(cdrom_mode_sense); int cdrom_mode_select(struct cdrom_device_info *cdi, struct packet_command *cgc) @@ -2014,6 +2013,7 @@ int cdrom_mode_select(struct cdrom_device_info *cdi, cgc->data_direction = CGC_DATA_WRITE; return cdo->generic_packet(cdi, cgc); } +EXPORT_SYMBOL(cdrom_mode_select); static int cdrom_read_subchannel(struct cdrom_device_info *cdi, struct cdrom_subchnl *subchnl, int mcn) @@ -2443,14 +2443,6 @@ static int cdrom_ioctl_select_disc(struct cdrom_device_info *cdi, return -EINVAL; } - /* - * ->select_disc is a hook to allow a driver-specific way of - * seleting disc. However, since there is no equivalent hook for - * cdrom_slot_status this may not actually be useful... - */ - if (cdi->ops->select_disc) - return cdi->ops->select_disc(cdi, arg); - cd_dbg(CD_CHANGER, "Using generic cdrom_select_disc()\n"); return cdrom_select_disc(cdi, arg); } @@ -2892,6 +2884,7 @@ use_toc: *last_written = toc.cdte_addr.lba; return 0; } +EXPORT_SYMBOL(cdrom_get_last_written); /* return the next writable block. also for udf file system. */ static int cdrom_get_next_writable(struct cdrom_device_info *cdi, @@ -3429,18 +3422,7 @@ int cdrom_ioctl(struct cdrom_device_info *cdi, struct block_device *bdev, return -ENOSYS; } - -EXPORT_SYMBOL(cdrom_get_last_written); -EXPORT_SYMBOL(register_cdrom); -EXPORT_SYMBOL(unregister_cdrom); -EXPORT_SYMBOL(cdrom_open); -EXPORT_SYMBOL(cdrom_release); EXPORT_SYMBOL(cdrom_ioctl); -EXPORT_SYMBOL(cdrom_number_of_slots); -EXPORT_SYMBOL(cdrom_mode_select); -EXPORT_SYMBOL(cdrom_mode_sense); -EXPORT_SYMBOL(init_cdrom_command); -EXPORT_SYMBOL(cdrom_get_media_event); #ifdef CONFIG_SYSCTL diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 097577ae3c47..ce13c272c387 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -336,7 +336,7 @@ static int bch_allocator_thread(void *arg) mutex_unlock(&ca->set->bucket_lock); blkdev_issue_discard(ca->bdev, bucket_to_sector(ca->set, bucket), - ca->sb.bucket_size, GFP_KERNEL, 0); + ca->sb.bucket_size, GFP_KERNEL); mutex_lock(&ca->set->bucket_lock); } diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 6230dfdd9286..7510d1c983a5 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -107,15 +107,16 @@ void bch_btree_verify(struct btree *b) void bch_data_verify(struct cached_dev *dc, struct bio *bio) { + unsigned int nr_segs = bio_segments(bio); struct bio *check; struct bio_vec bv, cbv; struct bvec_iter iter, citer = { 0 }; - check = bio_kmalloc(GFP_NOIO, bio_segments(bio)); + check = bio_kmalloc(nr_segs, GFP_NOIO); if (!check) return; - bio_set_dev(check, bio->bi_bdev); - check->bi_opf = REQ_OP_READ; + bio_init(check, bio->bi_bdev, check->bi_inline_vecs, nr_segs, + REQ_OP_READ); check->bi_iter.bi_sector = bio->bi_iter.bi_sector; check->bi_iter.bi_size = bio->bi_iter.bi_size; @@ -146,7 +147,8 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) bio_free_pages(check); out_put: - bio_put(check); + bio_uninit(check); + kfree(check); } #endif diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 320fcdfef48e..9c5dde73da88 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -1005,7 +1005,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) bio_get(s->iop.bio); if (bio_op(bio) == REQ_OP_DISCARD && - !blk_queue_discard(bdev_get_queue(dc->bdev))) + !bdev_max_discard_sectors(dc->bdev)) goto insert_data; /* I/O request sent to backing device */ @@ -1115,7 +1115,7 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio, bio->bi_private = ddip; if ((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bdev_get_queue(dc->bdev))) + !bdev_max_discard_sectors(dc->bdev)) bio->bi_end_io(bio); else submit_bio_noacct(bio); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index bf3de149d3c9..2f49e31142f6 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -973,7 +973,6 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, blk_queue_flag_set(QUEUE_FLAG_NONROT, d->disk->queue); blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, d->disk->queue); - blk_queue_flag_set(QUEUE_FLAG_DISCARD, d->disk->queue); blk_queue_write_cache(q, true, true); @@ -2350,7 +2349,7 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk, ca->bdev->bd_holder = ca; ca->sb_disk = sb_disk; - if (blk_queue_discard(bdev_get_queue(bdev))) + if (bdev_max_discard_sectors((bdev))) ca->discard = CACHE_DISCARD(&ca->sb); ret = cache_alloc(ca); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index d1029d71ff3b..c6f677059214 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -1151,7 +1151,7 @@ STORE(__bch_cache) if (attr == &sysfs_discard) { bool v = strtoul_or_return(buf); - if (blk_queue_discard(bdev_get_queue(ca->bdev))) + if (bdev_max_discard_sectors(ca->bdev)) ca->discard = v; if (v != CACHE_DISCARD(&ca->sb)) { diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index e9cbc70d5a0e..5ffa1dcf84cf 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -611,7 +611,8 @@ static void bio_complete(struct bio *bio) { struct dm_buffer *b = bio->bi_private; blk_status_t status = bio->bi_status; - bio_put(bio); + bio_uninit(bio); + kfree(bio); b->end_io(b, status); } @@ -626,16 +627,14 @@ static void use_bio(struct dm_buffer *b, int rw, sector_t sector, if (unlikely(b->c->sectors_per_block_bits < PAGE_SHIFT - SECTOR_SHIFT)) vec_size += 2; - bio = bio_kmalloc(GFP_NOWAIT | __GFP_NORETRY | __GFP_NOWARN, vec_size); + bio = bio_kmalloc(vec_size, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOWARN); if (!bio) { dmio: use_dmio(b, rw, sector, n_sectors, offset); return; } - + bio_init(bio, b->c->bdev, bio->bi_inline_vecs, vec_size, rw); bio->bi_iter.bi_sector = sector; - bio_set_dev(bio, b->c->bdev); - bio_set_op_attrs(bio, rw, 0); bio->bi_end_io = bio_complete; bio->bi_private = b; diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 780a61bc6cc0..28c5de8eca4a 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -3329,13 +3329,6 @@ static int cache_iterate_devices(struct dm_target *ti, return r; } -static bool origin_dev_supports_discard(struct block_device *origin_bdev) -{ - struct request_queue *q = bdev_get_queue(origin_bdev); - - return blk_queue_discard(q); -} - /* * If discard_passdown was enabled verify that the origin device * supports discards. Disable discard_passdown if not. @@ -3349,7 +3342,7 @@ static void disable_passdown_if_not_supported(struct cache *cache) if (!cache->features.discard_passdown) return; - if (!origin_dev_supports_discard(origin_bdev)) + if (!bdev_max_discard_sectors(origin_bdev)) reason = "discard unsupported"; else if (origin_limits->max_discard_sectors < cache->sectors_per_block) diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c index 128316a73d01..811b0a5379d0 100644 --- a/drivers/md/dm-clone-target.c +++ b/drivers/md/dm-clone-target.c @@ -2016,13 +2016,6 @@ static void clone_resume(struct dm_target *ti) do_waker(&clone->waker.work); } -static bool bdev_supports_discards(struct block_device *bdev) -{ - struct request_queue *q = bdev_get_queue(bdev); - - return (q && blk_queue_discard(q)); -} - /* * If discard_passdown was enabled verify that the destination device supports * discards. Disable discard_passdown if not. @@ -2036,7 +2029,7 @@ static void disable_passdown_if_not_supported(struct clone *clone) if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) return; - if (!bdev_supports_discards(dest_dev)) + if (!bdev_max_discard_sectors(dest_dev)) reason = "discard unsupported"; else if (dest_limits->max_discard_sectors < clone->region_size) reason = "max discard sectors smaller than a region"; diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 5762366333a2..e4b95eaeec8c 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -311,7 +311,7 @@ static void do_region(int op, int op_flags, unsigned region, * Reject unsupported discard and write same requests. */ if (op == REQ_OP_DISCARD) - special_cmd_max_sectors = q->limits.max_discard_sectors; + special_cmd_max_sectors = bdev_max_discard_sectors(where->bdev); else if (op == REQ_OP_WRITE_ZEROES) special_cmd_max_sectors = q->limits.max_write_zeroes_sectors; if ((op == REQ_OP_DISCARD || op == REQ_OP_WRITE_ZEROES) && diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index c9d036d6bb2e..e194226c89e5 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -866,9 +866,8 @@ static int log_writes_message(struct dm_target *ti, unsigned argc, char **argv, static void log_writes_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct log_writes_c *lc = ti->private; - struct request_queue *q = bdev_get_queue(lc->dev->bdev); - if (!q || !blk_queue_discard(q)) { + if (!bdev_max_discard_sectors(lc->dev->bdev)) { lc->device_supports_discard = false; limits->discard_granularity = lc->sectorsize; limits->max_discard_sectors = (UINT_MAX >> SECTOR_SHIFT); diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 2b26435a6946..9526ccbedafb 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -2963,13 +2963,8 @@ static void configure_discard_support(struct raid_set *rs) raid456 = rs_is_raid456(rs); for (i = 0; i < rs->raid_disks; i++) { - struct request_queue *q; - - if (!rs->dev[i].rdev.bdev) - continue; - - q = bdev_get_queue(rs->dev[i].rdev.bdev); - if (!q || !blk_queue_discard(q)) + if (!rs->dev[i].rdev.bdev || + !bdev_max_discard_sectors(rs->dev[i].rdev.bdev)) return; if (raid456) { diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 70935066fe00..11e48c0806f5 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1886,9 +1886,7 @@ static int device_dax_write_cache_enabled(struct dm_target *ti, static int device_is_rotational(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { - struct request_queue *q = bdev_get_queue(dev->bdev); - - return !blk_queue_nonrot(q); + return !bdev_nonrot(dev->bdev); } static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev, @@ -1956,9 +1954,7 @@ static bool dm_table_supports_nowait(struct dm_table *t) static int device_not_discard_capable(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { - struct request_queue *q = bdev_get_queue(dev->bdev); - - return !blk_queue_discard(q); + return !bdev_max_discard_sectors(dev->bdev); } static bool dm_table_supports_discards(struct dm_table *t) @@ -1990,9 +1986,7 @@ static int device_not_secure_erase_capable(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { - struct request_queue *q = bdev_get_queue(dev->bdev); - - return !blk_queue_secure_erase(q); + return !bdev_max_secure_erase_sectors(dev->bdev); } static bool dm_table_supports_secure_erase(struct dm_table *t) @@ -2018,9 +2012,7 @@ static int device_requires_stable_pages(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { - struct request_queue *q = bdev_get_queue(dev->bdev); - - return blk_queue_stable_writes(q); + return bdev_stable_writes(dev->bdev); } int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, @@ -2040,18 +2032,15 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, q); if (!dm_table_supports_discards(t)) { - blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q); - /* Must also clear discard limits... */ q->limits.max_discard_sectors = 0; q->limits.max_hw_discard_sectors = 0; q->limits.discard_granularity = 0; q->limits.discard_alignment = 0; q->limits.discard_misaligned = 0; - } else - blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); + } - if (dm_table_supports_secure_erase(t)) - blk_queue_flag_set(QUEUE_FLAG_SECERASE, q); + if (!dm_table_supports_secure_erase(t)) + q->limits.max_secure_erase_sectors = 0; if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) { wc = true; diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 4d25d0e27031..84c083f76673 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -398,8 +398,8 @@ static int issue_discard(struct discard_op *op, dm_block_t data_b, dm_block_t da sector_t s = block_to_sectors(tc->pool, data_b); sector_t len = block_to_sectors(tc->pool, data_e - data_b); - return __blkdev_issue_discard(tc->pool_dev->bdev, s, len, - GFP_NOWAIT, 0, &op->bio); + return __blkdev_issue_discard(tc->pool_dev->bdev, s, len, GFP_NOWAIT, + &op->bio); } static void end_discard(struct discard_op *op, int r) @@ -2802,13 +2802,6 @@ static void requeue_bios(struct pool *pool) /*---------------------------------------------------------------- * Binding of control targets to a pool object *--------------------------------------------------------------*/ -static bool data_dev_supports_discard(struct pool_c *pt) -{ - struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); - - return blk_queue_discard(q); -} - static bool is_factor(sector_t block_size, uint32_t n) { return !sector_div(block_size, n); @@ -2828,7 +2821,7 @@ static void disable_passdown_if_not_supported(struct pool_c *pt) if (!pt->adjusted_pf.discard_passdown) return; - if (!data_dev_supports_discard(pt)) + if (!bdev_max_discard_sectors(pt->data_dev->bdev)) reason = "discard unsupported"; else if (data_limits->max_discard_sectors < pool->sectors_per_block) @@ -4057,8 +4050,6 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) /* * Must explicitly disallow stacking discard limits otherwise the * block layer will stack them if pool's data device has support. - * QUEUE_FLAG_DISCARD wouldn't be set but there is no way for the - * user to see that, so make sure to set all discard limits to 0. */ limits->discard_granularity = 0; return; diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index cac295cc8840..0ec5d8b9b1a4 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -1001,7 +1001,7 @@ static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits) blk_limits_io_min(limits, DMZ_BLOCK_SIZE); blk_limits_io_opt(limits, DMZ_BLOCK_SIZE); - limits->discard_alignment = DMZ_BLOCK_SIZE; + limits->discard_alignment = 0; limits->discard_granularity = DMZ_BLOCK_SIZE; limits->max_discard_sectors = chunk_sectors; limits->max_hw_discard_sectors = chunk_sectors; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 82957bd460e8..39081338ca61 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -955,7 +955,6 @@ void disable_discard(struct mapped_device *md) /* device doesn't really support DISCARD, disable it */ limits->max_discard_sectors = 0; - blk_queue_flag_clear(QUEUE_FLAG_DISCARD, md->queue); } void disable_write_zeroes(struct mapped_device *md) @@ -982,7 +981,7 @@ static void clone_endio(struct bio *bio) if (unlikely(error == BLK_STS_TARGET)) { if (bio_op(bio) == REQ_OP_DISCARD && - !q->limits.max_discard_sectors) + !bdev_max_discard_sectors(bio->bi_bdev)) disable_discard(md); else if (bio_op(bio) == REQ_OP_WRITE_ZEROES && !q->limits.max_write_zeroes_sectors) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index bfd6026d7809..d87f674ab762 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -639,14 +639,6 @@ re_read: daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; write_behind = le32_to_cpu(sb->write_behind); sectors_reserved = le32_to_cpu(sb->sectors_reserved); - /* Setup nodes/clustername only if bitmap version is - * cluster-compatible - */ - if (sb->version == cpu_to_le32(BITMAP_MAJOR_CLUSTERED)) { - nodes = le32_to_cpu(sb->nodes); - strlcpy(bitmap->mddev->bitmap_info.cluster_name, - sb->cluster_name, 64); - } /* verify that the bitmap-specific fields are valid */ if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) @@ -668,6 +660,16 @@ re_read: goto out; } + /* + * Setup nodes/clustername only if bitmap version is + * cluster-compatible + */ + if (sb->version == cpu_to_le32(BITMAP_MAJOR_CLUSTERED)) { + nodes = le32_to_cpu(sb->nodes); + strscpy(bitmap->mddev->bitmap_info.cluster_name, + sb->cluster_name, 64); + } + /* keep the array size field of the bitmap superblock up to date */ sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); @@ -695,14 +697,13 @@ re_read: if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) set_bit(BITMAP_HOSTENDIAN, &bitmap->flags); bitmap->events_cleared = le64_to_cpu(sb->events_cleared); - strlcpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64); err = 0; out: kunmap_atomic(sb); - /* Assigning chunksize is required for "re_read" */ - bitmap->mddev->bitmap_info.chunksize = chunksize; if (err == 0 && nodes && (bitmap->cluster_slot < 0)) { + /* Assigning chunksize is required for "re_read" */ + bitmap->mddev->bitmap_info.chunksize = chunksize; err = md_setup_cluster(bitmap->mddev, nodes); if (err) { pr_warn("%s: Could not setup cluster service (%d)\n", @@ -713,18 +714,18 @@ out: goto re_read; } - out_no_sb: - if (test_bit(BITMAP_STALE, &bitmap->flags)) - bitmap->events_cleared = bitmap->mddev->events; - bitmap->mddev->bitmap_info.chunksize = chunksize; - bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; - bitmap->mddev->bitmap_info.max_write_behind = write_behind; - bitmap->mddev->bitmap_info.nodes = nodes; - if (bitmap->mddev->bitmap_info.space == 0 || - bitmap->mddev->bitmap_info.space > sectors_reserved) - bitmap->mddev->bitmap_info.space = sectors_reserved; - if (err) { + if (err == 0) { + if (test_bit(BITMAP_STALE, &bitmap->flags)) + bitmap->events_cleared = bitmap->mddev->events; + bitmap->mddev->bitmap_info.chunksize = chunksize; + bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; + bitmap->mddev->bitmap_info.max_write_behind = write_behind; + bitmap->mddev->bitmap_info.nodes = nodes; + if (bitmap->mddev->bitmap_info.space == 0 || + bitmap->mddev->bitmap_info.space > sectors_reserved) + bitmap->mddev->bitmap_info.space = sectors_reserved; + } else { md_bitmap_print_sb(bitmap); if (bitmap->cluster_slot < 0) md_cluster_stop(bitmap->mddev); diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 1c8a06b77c85..37cbcce3cc66 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -201,7 +201,7 @@ static struct dlm_lock_resource *lockres_init(struct mddev *mddev, pr_err("md-cluster: Unable to allocate resource name for resource %s\n", name); goto out_err; } - strlcpy(res->name, name, namelen + 1); + strscpy(res->name, name, namelen + 1); if (with_lvb) { res->lksb.sb_lvbptr = kzalloc(LVB_SIZE, GFP_KERNEL); if (!res->lksb.sb_lvbptr) { diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 0f55b079371b..138a3b25c5c8 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -64,7 +64,6 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) struct linear_conf *conf; struct md_rdev *rdev; int i, cnt; - bool discard_supported = false; conf = kzalloc(struct_size(conf, disks, raid_disks), GFP_KERNEL); if (!conf) @@ -96,9 +95,6 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) conf->array_sectors += rdev->sectors; cnt++; - - if (blk_queue_discard(bdev_get_queue(rdev->bdev))) - discard_supported = true; } if (cnt != raid_disks) { pr_warn("md/linear:%s: not enough drives present. Aborting!\n", @@ -106,11 +102,6 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) goto out; } - if (!discard_supported) - blk_queue_flag_clear(QUEUE_FLAG_DISCARD, mddev->queue); - else - blk_queue_flag_set(QUEUE_FLAG_DISCARD, mddev->queue); - /* * Here we calculate the device offsets. */ @@ -252,7 +243,7 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio) start_sector + data_offset; if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bio->bi_bdev->bd_disk->queue))) { + !bdev_max_discard_sectors(bio->bi_bdev))) { /* Just ignore it */ bio_endio(bio); } else { diff --git a/drivers/md/md.c b/drivers/md/md.c index 309b3af906ad..707e802d0082 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2627,14 +2627,16 @@ static void sync_sbs(struct mddev *mddev, int nospares) static bool does_sb_need_changing(struct mddev *mddev) { - struct md_rdev *rdev; + struct md_rdev *rdev = NULL, *iter; struct mdp_superblock_1 *sb; int role; /* Find a good rdev */ - rdev_for_each(rdev, mddev) - if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags)) + rdev_for_each(iter, mddev) + if ((iter->raid_disk >= 0) && !test_bit(Faulty, &iter->flags)) { + rdev = iter; break; + } /* No good device found. */ if (!rdev) @@ -2645,11 +2647,11 @@ static bool does_sb_need_changing(struct mddev *mddev) rdev_for_each(rdev, mddev) { role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); /* Device activated? */ - if (role == 0xffff && rdev->raid_disk >=0 && + if (role == MD_DISK_ROLE_SPARE && rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags)) return true; /* Device turned faulty? */ - if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd)) + if (test_bit(Faulty, &rdev->flags) && (role < MD_DISK_ROLE_MAX)) return true; } @@ -2984,10 +2986,11 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) if (cmd_match(buf, "faulty") && rdev->mddev->pers) { md_error(rdev->mddev, rdev); - if (test_bit(Faulty, &rdev->flags)) - err = 0; - else + + if (test_bit(MD_BROKEN, &rdev->mddev->flags)) err = -EBUSY; + else + err = 0; } else if (cmd_match(buf, "remove")) { if (rdev->mddev->pers) { clear_bit(Blocked, &rdev->flags); @@ -4028,7 +4031,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) oldpriv = mddev->private; mddev->pers = pers; mddev->private = priv; - strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); mddev->level = mddev->new_level; mddev->layout = mddev->new_layout; mddev->chunk_sectors = mddev->new_chunk_sectors; @@ -4353,10 +4356,9 @@ __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR, * like active, but no writes have been seen for a while (100msec). * * broken - * RAID0/LINEAR-only: same as clean, but array is missing a member. - * It's useful because RAID0/LINEAR mounted-arrays aren't stopped - * when a member is gone, so this state will at least alert the - * user that something is wrong. +* Array is failed. It's useful because mounted-arrays aren't stopped +* when array is failed, so this state will at least alert the user that +* something is wrong. */ enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, write_pending, active_idle, broken, bad_word}; @@ -5763,7 +5765,7 @@ static int add_named_array(const char *val, const struct kernel_param *kp) len--; if (len >= DISK_NAME_LEN) return -E2BIG; - strlcpy(buf, val, len+1); + strscpy(buf, val, len+1); if (strncmp(buf, "md_", 3) == 0) return md_alloc(0, buf); if (strncmp(buf, "md", 2) == 0 && @@ -5896,7 +5898,7 @@ int md_run(struct mddev *mddev) mddev->level = pers->level; mddev->new_level = pers->level; } - strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); if (mddev->reshape_position != MaxSector && pers->start_reshape == NULL) { @@ -5991,8 +5993,7 @@ int md_run(struct mddev *mddev) bool nonrot = true; rdev_for_each(rdev, mddev) { - if (rdev->raid_disk >= 0 && - !blk_queue_nonrot(bdev_get_queue(rdev->bdev))) { + if (rdev->raid_disk >= 0 && !bdev_nonrot(rdev->bdev)) { nonrot = false; break; } @@ -7444,7 +7445,7 @@ static int set_disk_faulty(struct mddev *mddev, dev_t dev) err = -ENODEV; else { md_error(mddev, rdev); - if (!test_bit(Faulty, &rdev->flags)) + if (test_bit(MD_BROKEN, &mddev->flags)) err = -EBUSY; } rcu_read_unlock(); @@ -7985,13 +7986,16 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev) if (!mddev->pers || !mddev->pers->error_handler) return; - mddev->pers->error_handler(mddev,rdev); - if (mddev->degraded) + mddev->pers->error_handler(mddev, rdev); + + if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags)) set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); sysfs_notify_dirent_safe(rdev->sysfs_state); set_bit(MD_RECOVERY_INTR, &mddev->recovery); - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_wakeup_thread(mddev->thread); + if (!test_bit(MD_BROKEN, &mddev->flags)) { + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } if (mddev->event_work.func) queue_work(md_misc_wq, &mddev->event_work); md_new_event(); @@ -8585,7 +8589,7 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, { struct bio *discard_bio = NULL; - if (__blkdev_issue_discard(rdev->bdev, start, size, GFP_NOIO, 0, + if (__blkdev_issue_discard(rdev->bdev, start, size, GFP_NOIO, &discard_bio) || !discard_bio) return; @@ -9671,7 +9675,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]); if (test_bit(Candidate, &rdev2->flags)) { - if (role == 0xfffe) { + if (role == MD_DISK_ROLE_FAULTY) { pr_info("md: Removing Candidate device %s because add failed\n", bdevname(rdev2->bdev,b)); md_kick_rdev_from_array(rdev2); continue; @@ -9684,7 +9688,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) /* * got activated except reshape is happening. */ - if (rdev2->raid_disk == -1 && role != 0xffff && + if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE && !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { rdev2->saved_raid_disk = role; @@ -9701,7 +9705,8 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) * as faulty. The recovery is performed by the * one who initiated the error. */ - if ((role == 0xfffe) || (role == 0xfffd)) { + if (role == MD_DISK_ROLE_FAULTY || + role == MD_DISK_ROLE_JOURNAL) { md_error(mddev, rdev2); clear_bit(Blocked, &rdev2->flags); } @@ -9791,16 +9796,18 @@ static int read_rdev(struct mddev *mddev, struct md_rdev *rdev) void md_reload_sb(struct mddev *mddev, int nr) { - struct md_rdev *rdev; + struct md_rdev *rdev = NULL, *iter; int err; /* Find the rdev */ - rdev_for_each_rcu(rdev, mddev) { - if (rdev->desc_nr == nr) + rdev_for_each_rcu(iter, mddev) { + if (iter->desc_nr == nr) { + rdev = iter; break; + } } - if (!rdev || rdev->desc_nr != nr) { + if (!rdev) { pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr); return; } diff --git a/drivers/md/md.h b/drivers/md/md.h index 6ac283864533..cf2cbb17acbd 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -234,34 +234,42 @@ extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new); struct md_cluster_info; -/* change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag is added */ +/** + * enum mddev_flags - md device flags. + * @MD_ARRAY_FIRST_USE: First use of array, needs initialization. + * @MD_CLOSING: If set, we are closing the array, do not open it then. + * @MD_JOURNAL_CLEAN: A raid with journal is already clean. + * @MD_HAS_JOURNAL: The raid array has journal feature set. + * @MD_CLUSTER_RESYNC_LOCKED: cluster raid only, which means node, already took + * resync lock, need to release the lock. + * @MD_FAILFAST_SUPPORTED: Using MD_FAILFAST on metadata writes is supported as + * calls to md_error() will never cause the array to + * become failed. + * @MD_HAS_PPL: The raid array has PPL feature set. + * @MD_HAS_MULTIPLE_PPLS: The raid array has multiple PPLs feature set. + * @MD_ALLOW_SB_UPDATE: md_check_recovery is allowed to update the metadata + * without taking reconfig_mutex. + * @MD_UPDATING_SB: md_check_recovery is updating the metadata without + * explicitly holding reconfig_mutex. + * @MD_NOT_READY: do_md_run() is active, so 'array_state', ust not report that + * array is ready yet. + * @MD_BROKEN: This is used to stop writes and mark array as failed. + * + * change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag is added + */ enum mddev_flags { - MD_ARRAY_FIRST_USE, /* First use of array, needs initialization */ - MD_CLOSING, /* If set, we are closing the array, do not open - * it then */ - MD_JOURNAL_CLEAN, /* A raid with journal is already clean */ - MD_HAS_JOURNAL, /* The raid array has journal feature set */ - MD_CLUSTER_RESYNC_LOCKED, /* cluster raid only, which means node - * already took resync lock, need to - * release the lock */ - MD_FAILFAST_SUPPORTED, /* Using MD_FAILFAST on metadata writes is - * supported as calls to md_error() will - * never cause the array to become failed. - */ - MD_HAS_PPL, /* The raid array has PPL feature set */ - MD_HAS_MULTIPLE_PPLS, /* The raid array has multiple PPLs feature set */ - MD_ALLOW_SB_UPDATE, /* md_check_recovery is allowed to update - * the metadata without taking reconfig_mutex. - */ - MD_UPDATING_SB, /* md_check_recovery is updating the metadata - * without explicitly holding reconfig_mutex. - */ - MD_NOT_READY, /* do_md_run() is active, so 'array_state' - * must not report that array is ready yet - */ - MD_BROKEN, /* This is used in RAID-0/LINEAR only, to stop - * I/O in case an array member is gone/failed. - */ + MD_ARRAY_FIRST_USE, + MD_CLOSING, + MD_JOURNAL_CLEAN, + MD_HAS_JOURNAL, + MD_CLUSTER_RESYNC_LOCKED, + MD_FAILFAST_SUPPORTED, + MD_HAS_PPL, + MD_HAS_MULTIPLE_PPLS, + MD_ALLOW_SB_UPDATE, + MD_UPDATING_SB, + MD_NOT_READY, + MD_BROKEN, }; enum mddev_sb_flags { diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index b21e101183f4..e11701e394ca 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -128,21 +128,6 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) pr_debug("md/raid0:%s: FINAL %d zones\n", mdname(mddev), conf->nr_strip_zones); - if (conf->nr_strip_zones == 1) { - conf->layout = RAID0_ORIG_LAYOUT; - } else if (mddev->layout == RAID0_ORIG_LAYOUT || - mddev->layout == RAID0_ALT_MULTIZONE_LAYOUT) { - conf->layout = mddev->layout; - } else if (default_layout == RAID0_ORIG_LAYOUT || - default_layout == RAID0_ALT_MULTIZONE_LAYOUT) { - conf->layout = default_layout; - } else { - pr_err("md/raid0:%s: cannot assemble multi-zone RAID0 with default_layout setting\n", - mdname(mddev)); - pr_err("md/raid0: please set raid0.default_layout to 1 or 2\n"); - err = -ENOTSUPP; - goto abort; - } /* * now since we have the hard sector sizes, we can make sure * chunk size is a multiple of that sector size @@ -273,6 +258,22 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) (unsigned long long)smallest->sectors); } + if (conf->nr_strip_zones == 1 || conf->strip_zone[1].nb_dev == 1) { + conf->layout = RAID0_ORIG_LAYOUT; + } else if (mddev->layout == RAID0_ORIG_LAYOUT || + mddev->layout == RAID0_ALT_MULTIZONE_LAYOUT) { + conf->layout = mddev->layout; + } else if (default_layout == RAID0_ORIG_LAYOUT || + default_layout == RAID0_ALT_MULTIZONE_LAYOUT) { + conf->layout = default_layout; + } else { + pr_err("md/raid0:%s: cannot assemble multi-zone RAID0 with default_layout setting\n", + mdname(mddev)); + pr_err("md/raid0: please set raid0.default_layout to 1 or 2\n"); + err = -EOPNOTSUPP; + goto abort; + } + pr_debug("md/raid0:%s: done.\n", mdname(mddev)); *private_conf = conf; @@ -399,7 +400,6 @@ static int raid0_run(struct mddev *mddev) conf = mddev->private; if (mddev->queue) { struct md_rdev *rdev; - bool discard_supported = false; blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors); @@ -412,13 +412,7 @@ static int raid0_run(struct mddev *mddev) rdev_for_each(rdev, mddev) { disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); - if (blk_queue_discard(bdev_get_queue(rdev->bdev))) - discard_supported = true; } - if (!discard_supported) - blk_queue_flag_clear(QUEUE_FLAG_DISCARD, mddev->queue); - else - blk_queue_flag_set(QUEUE_FLAG_DISCARD, mddev->queue); } /* calculate array device size */ diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 99d5464a51f8..99d5af1362d7 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -165,9 +165,10 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) * Allocate bios : 1 for reading, n-1 for writing */ for (j = pi->raid_disks ; j-- ; ) { - bio = bio_kmalloc(gfp_flags, RESYNC_PAGES); + bio = bio_kmalloc(RESYNC_PAGES, gfp_flags); if (!bio) goto out_free_bio; + bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0); r1_bio->bios[j] = bio; } /* @@ -206,8 +207,10 @@ out_free_pages: resync_free_pages(&rps[j]); out_free_bio: - while (++j < pi->raid_disks) - bio_put(r1_bio->bios[j]); + while (++j < pi->raid_disks) { + bio_uninit(r1_bio->bios[j]); + kfree(r1_bio->bios[j]); + } kfree(rps); out_free_r1bio: @@ -225,7 +228,8 @@ static void r1buf_pool_free(void *__r1_bio, void *data) for (i = pi->raid_disks; i--; ) { rp = get_resync_pages(r1bio->bios[i]); resync_free_pages(rp); - bio_put(r1bio->bios[i]); + bio_uninit(r1bio->bios[i]); + kfree(r1bio->bios[i]); } /* resync pages array stored in the 1st bio's .bi_private */ @@ -704,7 +708,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect /* At least two disks to choose from so failfast is OK */ set_bit(R1BIO_FailFast, &r1_bio->state); - nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev)); + nonrot = bdev_nonrot(rdev->bdev); has_nonrot_disk |= nonrot; pending = atomic_read(&rdev->nr_pending); dist = abs(this_sector - conf->mirrors[disk].head_position); @@ -802,7 +806,7 @@ static void flush_bio_list(struct r1conf *conf, struct bio *bio) if (test_bit(Faulty, &rdev->flags)) { bio_io_error(bio); } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bio->bi_bdev->bd_disk->queue))) + !bdev_max_discard_sectors(bio->bi_bdev))) /* Just ignore it */ bio_endio(bio); else @@ -1637,30 +1641,39 @@ static void raid1_status(struct seq_file *seq, struct mddev *mddev) seq_printf(seq, "]"); } +/** + * raid1_error() - RAID1 error handler. + * @mddev: affected md device. + * @rdev: member device to fail. + * + * The routine acknowledges &rdev failure and determines new @mddev state. + * If it failed, then: + * - &MD_BROKEN flag is set in &mddev->flags. + * - recovery is disabled. + * Otherwise, it must be degraded: + * - recovery is interrupted. + * - &mddev->degraded is bumped. + * + * @rdev is marked as &Faulty excluding case when array is failed and + * &mddev->fail_last_dev is off. + */ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev) { char b[BDEVNAME_SIZE]; struct r1conf *conf = mddev->private; unsigned long flags; - /* - * If it is not operational, then we have already marked it as dead - * else if it is the last working disks with "fail_last_dev == false", - * ignore the error, let the next level up know. - * else mark the drive as failed - */ spin_lock_irqsave(&conf->device_lock, flags); - if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev - && (conf->raid_disks - mddev->degraded) == 1) { - /* - * Don't fail the drive, act as though we were just a - * normal single drive. - * However don't try a recovery from this drive as - * it is very likely to fail. - */ - conf->recovery_disabled = mddev->recovery_disabled; - spin_unlock_irqrestore(&conf->device_lock, flags); - return; + + if (test_bit(In_sync, &rdev->flags) && + (conf->raid_disks - mddev->degraded) == 1) { + set_bit(MD_BROKEN, &mddev->flags); + + if (!mddev->fail_last_dev) { + conf->recovery_disabled = mddev->recovery_disabled; + spin_unlock_irqrestore(&conf->device_lock, flags); + return; + } } set_bit(Blocked, &rdev->flags); if (test_and_clear_bit(In_sync, &rdev->flags)) @@ -1826,8 +1839,6 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) break; } } - if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev))) - blk_queue_flag_set(QUEUE_FLAG_DISCARD, mddev->queue); print_conf(conf); return err; } @@ -3106,7 +3117,6 @@ static int raid1_run(struct mddev *mddev) int i; struct md_rdev *rdev; int ret; - bool discard_supported = false; if (mddev->level != 1) { pr_warn("md/raid1:%s: raid level not set to mirroring (%d)\n", @@ -3141,8 +3151,6 @@ static int raid1_run(struct mddev *mddev) continue; disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); - if (blk_queue_discard(bdev_get_queue(rdev->bdev))) - discard_supported = true; } mddev->degraded = 0; @@ -3179,15 +3187,6 @@ static int raid1_run(struct mddev *mddev) md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); - if (mddev->queue) { - if (discard_supported) - blk_queue_flag_set(QUEUE_FLAG_DISCARD, - mddev->queue); - else - blk_queue_flag_clear(QUEUE_FLAG_DISCARD, - mddev->queue); - } - ret = md_integrity_register(mddev); if (ret) { md_unregister_thread(&mddev->thread); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index dfe7d62d3fbd..dfa576cdf11c 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -145,15 +145,17 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) * Allocate bios. */ for (j = nalloc ; j-- ; ) { - bio = bio_kmalloc(gfp_flags, RESYNC_PAGES); + bio = bio_kmalloc(RESYNC_PAGES, gfp_flags); if (!bio) goto out_free_bio; + bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0); r10_bio->devs[j].bio = bio; if (!conf->have_replacement) continue; - bio = bio_kmalloc(gfp_flags, RESYNC_PAGES); + bio = bio_kmalloc(RESYNC_PAGES, gfp_flags); if (!bio) goto out_free_bio; + bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0); r10_bio->devs[j].repl_bio = bio; } /* @@ -197,9 +199,11 @@ out_free_pages: out_free_bio: for ( ; j < nalloc; j++) { if (r10_bio->devs[j].bio) - bio_put(r10_bio->devs[j].bio); + bio_uninit(r10_bio->devs[j].bio); + kfree(r10_bio->devs[j].bio); if (r10_bio->devs[j].repl_bio) - bio_put(r10_bio->devs[j].repl_bio); + bio_uninit(r10_bio->devs[j].repl_bio); + kfree(r10_bio->devs[j].repl_bio); } kfree(rps); out_free_r10bio: @@ -220,12 +224,15 @@ static void r10buf_pool_free(void *__r10_bio, void *data) if (bio) { rp = get_resync_pages(bio); resync_free_pages(rp); - bio_put(bio); + bio_uninit(bio); + kfree(bio); } bio = r10bio->devs[j].repl_bio; - if (bio) - bio_put(bio); + if (bio) { + bio_uninit(bio); + kfree(bio); + } } /* resync pages array stored in the 1st bio's .bi_private */ @@ -796,7 +803,7 @@ static struct md_rdev *read_balance(struct r10conf *conf, if (!do_balance) break; - nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev)); + nonrot = bdev_nonrot(rdev->bdev); has_nonrot_disk |= nonrot; pending = atomic_read(&rdev->nr_pending); if (min_pending > pending && nonrot) { @@ -888,7 +895,7 @@ static void flush_pending_writes(struct r10conf *conf) if (test_bit(Faulty, &rdev->flags)) { bio_io_error(bio); } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bio->bi_bdev->bd_disk->queue))) + !bdev_max_discard_sectors(bio->bi_bdev))) /* Just ignore it */ bio_endio(bio); else @@ -1083,7 +1090,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) if (test_bit(Faulty, &rdev->flags)) { bio_io_error(bio); } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bio->bi_bdev->bd_disk->queue))) + !bdev_max_discard_sectors(bio->bi_bdev))) /* Just ignore it */ bio_endio(bio); else @@ -1963,32 +1970,40 @@ static int enough(struct r10conf *conf, int ignore) _enough(conf, 1, ignore); } +/** + * raid10_error() - RAID10 error handler. + * @mddev: affected md device. + * @rdev: member device to fail. + * + * The routine acknowledges &rdev failure and determines new @mddev state. + * If it failed, then: + * - &MD_BROKEN flag is set in &mddev->flags. + * Otherwise, it must be degraded: + * - recovery is interrupted. + * - &mddev->degraded is bumped. + + * @rdev is marked as &Faulty excluding case when array is failed and + * &mddev->fail_last_dev is off. + */ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev) { char b[BDEVNAME_SIZE]; struct r10conf *conf = mddev->private; unsigned long flags; - /* - * If it is not operational, then we have already marked it as dead - * else if it is the last working disks with "fail_last_dev == false", - * ignore the error, let the next level up know. - * else mark the drive as failed - */ spin_lock_irqsave(&conf->device_lock, flags); - if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev - && !enough(conf, rdev->raid_disk)) { - /* - * Don't fail the drive, just return an IO error. - */ - spin_unlock_irqrestore(&conf->device_lock, flags); - return; + + if (test_bit(In_sync, &rdev->flags) && !enough(conf, rdev->raid_disk)) { + set_bit(MD_BROKEN, &mddev->flags); + + if (!mddev->fail_last_dev) { + spin_unlock_irqrestore(&conf->device_lock, flags); + return; + } } if (test_and_clear_bit(In_sync, &rdev->flags)) mddev->degraded++; - /* - * If recovery is running, make sure it aborts. - */ + set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(Blocked, &rdev->flags); set_bit(Faulty, &rdev->flags); @@ -2144,8 +2159,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) rcu_assign_pointer(p->rdev, rdev); break; } - if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev))) - blk_queue_flag_set(QUEUE_FLAG_DISCARD, mddev->queue); print_conf(conf); return err; @@ -4069,7 +4082,6 @@ static int raid10_run(struct mddev *mddev) sector_t size; sector_t min_offset_diff = 0; int first = 1; - bool discard_supported = false; if (mddev_init_writes_pending(mddev) < 0) return -ENOMEM; @@ -4140,20 +4152,9 @@ static int raid10_run(struct mddev *mddev) rdev->data_offset << 9); disk->head_position = 0; - - if (blk_queue_discard(bdev_get_queue(rdev->bdev))) - discard_supported = true; first = 0; } - if (mddev->queue) { - if (discard_supported) - blk_queue_flag_set(QUEUE_FLAG_DISCARD, - mddev->queue); - else - blk_queue_flag_clear(QUEUE_FLAG_DISCARD, - mddev->queue); - } /* need to check that every block has at least one working mirror */ if (!enough(conf, -1)) { pr_err("md/raid10:%s: not enough operational mirrors.\n", diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index a7d50ff9020a..094a4042589e 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -1318,7 +1318,7 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log, r5l_write_super(log, end); - if (!blk_queue_discard(bdev_get_queue(bdev))) + if (!bdev_max_discard_sectors(bdev)) return; mddev = log->rdev->mddev; @@ -1344,14 +1344,14 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log, if (log->last_checkpoint < end) { blkdev_issue_discard(bdev, log->last_checkpoint + log->rdev->data_offset, - end - log->last_checkpoint, GFP_NOIO, 0); + end - log->last_checkpoint, GFP_NOIO); } else { blkdev_issue_discard(bdev, log->last_checkpoint + log->rdev->data_offset, log->device_size - log->last_checkpoint, - GFP_NOIO, 0); + GFP_NOIO); blkdev_issue_discard(bdev, log->rdev->data_offset, end, - GFP_NOIO, 0); + GFP_NOIO); } } diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index d3962d92df18..55d065a87b89 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -883,7 +883,9 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e, (unsigned long long)r_sector, dd_idx, (unsigned long long)sector); - rdev = conf->disks[dd_idx].rdev; + /* Array has not started so rcu dereference is safe */ + rdev = rcu_dereference_protected( + conf->disks[dd_idx].rdev, 1); if (!rdev || (!test_bit(In_sync, &rdev->flags) && sector >= rdev->recovery_offset)) { pr_debug("%s:%*s data member disk %d missing\n", @@ -934,7 +936,10 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e, parity_sector = raid5_compute_sector(conf, r_sector_first + i, 0, &disk, &sh); BUG_ON(sh.pd_idx != le32_to_cpu(e->parity_disk)); - parity_rdev = conf->disks[sh.pd_idx].rdev; + + /* Array has not started so rcu dereference is safe */ + parity_rdev = rcu_dereference_protected( + conf->disks[sh.pd_idx].rdev, 1); BUG_ON(parity_rdev->bdev->bd_dev != log->rdev->bdev->bd_dev); pr_debug("%s:%*s write parity at sector %llu, disk %s\n", @@ -1404,7 +1409,9 @@ int ppl_init_log(struct r5conf *conf) for (i = 0; i < ppl_conf->count; i++) { struct ppl_log *log = &ppl_conf->child_logs[i]; - struct md_rdev *rdev = conf->disks[i].rdev; + /* Array has not started so rcu dereference is safe */ + struct md_rdev *rdev = + rcu_dereference_protected(conf->disks[i].rdev, 1); mutex_init(&log->io_mutex); spin_lock_init(&log->io_list_lock); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 351d341a1ffa..39038fa8b1c8 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -79,18 +79,21 @@ static inline int stripe_hash_locks_hash(struct r5conf *conf, sector_t sect) } static inline void lock_device_hash_lock(struct r5conf *conf, int hash) + __acquires(&conf->device_lock) { spin_lock_irq(conf->hash_locks + hash); spin_lock(&conf->device_lock); } static inline void unlock_device_hash_lock(struct r5conf *conf, int hash) + __releases(&conf->device_lock) { spin_unlock(&conf->device_lock); spin_unlock_irq(conf->hash_locks + hash); } static inline void lock_all_device_hash_locks_irq(struct r5conf *conf) + __acquires(&conf->device_lock) { int i; spin_lock_irq(conf->hash_locks); @@ -100,6 +103,7 @@ static inline void lock_all_device_hash_locks_irq(struct r5conf *conf) } static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf) + __releases(&conf->device_lock) { int i; spin_unlock(&conf->device_lock); @@ -164,6 +168,7 @@ static bool stripe_is_lowprio(struct stripe_head *sh) } static void raid5_wakeup_stripe_thread(struct stripe_head *sh) + __must_hold(&sh->raid_conf->device_lock) { struct r5conf *conf = sh->raid_conf; struct r5worker_group *group; @@ -211,6 +216,7 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh) static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, struct list_head *temp_inactive_list) + __must_hold(&conf->device_lock) { int i; int injournal = 0; /* number of date pages with R5_InJournal */ @@ -296,6 +302,7 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, static void __release_stripe(struct r5conf *conf, struct stripe_head *sh, struct list_head *temp_inactive_list) + __must_hold(&conf->device_lock) { if (atomic_dec_and_test(&sh->count)) do_release_stripe(conf, sh, temp_inactive_list); @@ -350,9 +357,9 @@ static void release_inactive_stripe_list(struct r5conf *conf, } } -/* should hold conf->device_lock already */ static int release_stripe_list(struct r5conf *conf, struct list_head *temp_inactive_list) + __must_hold(&conf->device_lock) { struct stripe_head *sh, *t; int count = 0; @@ -629,6 +636,10 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, * This is because some failed devices may only affect one * of the two sections, and some non-in_sync devices may * be insync in the section most affected by failed devices. + * + * Most calls to this function hold &conf->device_lock. Calls + * in raid5_run() do not require the lock as no other threads + * have been started yet. */ int raid5_calc_degraded(struct r5conf *conf) { @@ -686,17 +697,17 @@ int raid5_calc_degraded(struct r5conf *conf) return degraded; } -static int has_failed(struct r5conf *conf) +static bool has_failed(struct r5conf *conf) { - int degraded; + int degraded = conf->mddev->degraded; - if (conf->mddev->reshape_position == MaxSector) - return conf->mddev->degraded > conf->max_degraded; + if (test_bit(MD_BROKEN, &conf->mddev->flags)) + return true; - degraded = raid5_calc_degraded(conf); - if (degraded > conf->max_degraded) - return 1; - return 0; + if (conf->mddev->reshape_position != MaxSector) + degraded = raid5_calc_degraded(conf); + + return degraded > conf->max_degraded; } struct stripe_head * @@ -2648,6 +2659,28 @@ static void shrink_stripes(struct r5conf *conf) conf->slab_cache = NULL; } +/* + * This helper wraps rcu_dereference_protected() and can be used when + * it is known that the nr_pending of the rdev is elevated. + */ +static struct md_rdev *rdev_pend_deref(struct md_rdev __rcu *rdev) +{ + return rcu_dereference_protected(rdev, + atomic_read(&rcu_access_pointer(rdev)->nr_pending)); +} + +/* + * This helper wraps rcu_dereference_protected() and should be used + * when it is known that the mddev_lock() is held. This is safe + * seeing raid5_remove_disk() has the same lock held. + */ +static struct md_rdev *rdev_mdlock_deref(struct mddev *mddev, + struct md_rdev __rcu *rdev) +{ + return rcu_dereference_protected(rdev, + lockdep_is_held(&mddev->reconfig_mutex)); +} + static void raid5_end_read_request(struct bio * bi) { struct stripe_head *sh = bi->bi_private; @@ -2674,9 +2707,9 @@ static void raid5_end_read_request(struct bio * bi) * In that case it moved down to 'rdev'. * rdev is not removed until all requests are finished. */ - rdev = conf->disks[i].replacement; + rdev = rdev_pend_deref(conf->disks[i].replacement); if (!rdev) - rdev = conf->disks[i].rdev; + rdev = rdev_pend_deref(conf->disks[i].rdev); if (use_new_offset(conf, sh)) s = sh->sector + rdev->new_data_offset; @@ -2790,11 +2823,11 @@ static void raid5_end_write_request(struct bio *bi) for (i = 0 ; i < disks; i++) { if (bi == &sh->dev[i].req) { - rdev = conf->disks[i].rdev; + rdev = rdev_pend_deref(conf->disks[i].rdev); break; } if (bi == &sh->dev[i].rreq) { - rdev = conf->disks[i].replacement; + rdev = rdev_pend_deref(conf->disks[i].replacement); if (rdev) replacement = 1; else @@ -2802,7 +2835,7 @@ static void raid5_end_write_request(struct bio *bi) * replaced it. rdev is not removed * until all requests are finished. */ - rdev = conf->disks[i].rdev; + rdev = rdev_pend_deref(conf->disks[i].rdev); break; } } @@ -2863,34 +2896,31 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) unsigned long flags; pr_debug("raid456: error called\n"); + pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n", + mdname(mddev), bdevname(rdev->bdev, b)); + spin_lock_irqsave(&conf->device_lock, flags); - - if (test_bit(In_sync, &rdev->flags) && - mddev->degraded == conf->max_degraded) { - /* - * Don't allow to achieve failed state - * Don't try to recover this device - */ - conf->recovery_disabled = mddev->recovery_disabled; - spin_unlock_irqrestore(&conf->device_lock, flags); - return; - } - set_bit(Faulty, &rdev->flags); clear_bit(In_sync, &rdev->flags); mddev->degraded = raid5_calc_degraded(conf); + + if (has_failed(conf)) { + set_bit(MD_BROKEN, &conf->mddev->flags); + conf->recovery_disabled = mddev->recovery_disabled; + + pr_crit("md/raid:%s: Cannot continue operation (%d/%d failed).\n", + mdname(mddev), mddev->degraded, conf->raid_disks); + } else { + pr_crit("md/raid:%s: Operation continuing on %d devices.\n", + mdname(mddev), conf->raid_disks - mddev->degraded); + } + spin_unlock_irqrestore(&conf->device_lock, flags); set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(Blocked, &rdev->flags); set_mask_bits(&mddev->sb_flags, 0, BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); - pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n" - "md/raid:%s: Operation continuing on %d devices.\n", - mdname(mddev), - bdevname(rdev->bdev, b), - mdname(mddev), - conf->raid_disks - mddev->degraded); r5c_update_on_rdev_error(mddev, rdev); } @@ -5213,23 +5243,23 @@ finish: struct r5dev *dev = &sh->dev[i]; if (test_and_clear_bit(R5_WriteError, &dev->flags)) { /* We own a safe reference to the rdev */ - rdev = conf->disks[i].rdev; + rdev = rdev_pend_deref(conf->disks[i].rdev); if (!rdev_set_badblocks(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0)) md_error(conf->mddev, rdev); rdev_dec_pending(rdev, conf->mddev); } if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { - rdev = conf->disks[i].rdev; + rdev = rdev_pend_deref(conf->disks[i].rdev); rdev_clear_badblocks(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0); rdev_dec_pending(rdev, conf->mddev); } if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { - rdev = conf->disks[i].replacement; + rdev = rdev_pend_deref(conf->disks[i].replacement); if (!rdev) /* rdev have been moved down */ - rdev = conf->disks[i].rdev; + rdev = rdev_pend_deref(conf->disks[i].rdev); rdev_clear_badblocks(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0); rdev_dec_pending(rdev, conf->mddev); @@ -5256,6 +5286,7 @@ finish: } static void raid5_activate_delayed(struct r5conf *conf) + __must_hold(&conf->device_lock) { if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { while (!list_empty(&conf->delayed_list)) { @@ -5273,9 +5304,9 @@ static void raid5_activate_delayed(struct r5conf *conf) } static void activate_bit_delay(struct r5conf *conf, - struct list_head *temp_inactive_list) + struct list_head *temp_inactive_list) + __must_hold(&conf->device_lock) { - /* device_lock is held */ struct list_head head; list_add(&head, &conf->bitmap_list); list_del_init(&conf->bitmap_list); @@ -5500,6 +5531,7 @@ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) * handle_list. */ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) + __must_hold(&conf->device_lock) { struct stripe_head *sh, *tmp; struct list_head *handle_list = NULL; @@ -6288,7 +6320,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n */ rcu_read_lock(); for (i = 0; i < conf->raid_disks; i++) { - struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev); + struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); if (rdev == NULL || test_bit(Faulty, &rdev->flags)) still_degraded = 1; @@ -6371,8 +6403,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio, static int handle_active_stripes(struct r5conf *conf, int group, struct r5worker *worker, struct list_head *temp_inactive_list) - __releases(&conf->device_lock) - __acquires(&conf->device_lock) + __must_hold(&conf->device_lock) { struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; int i, batch_size = 0, hash; @@ -7166,7 +7197,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) int i; int group_cnt; struct r5worker_group *new_group; - int ret; + int ret = -ENOMEM; if (mddev->new_level != 5 && mddev->new_level != 4 @@ -7225,6 +7256,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) spin_lock_init(&conf->device_lock); seqcount_spinlock_init(&conf->gen_lock, &conf->device_lock); mutex_init(&conf->cache_size_mutex); + init_waitqueue_head(&conf->wait_for_quiescent); init_waitqueue_head(&conf->wait_for_stripe); init_waitqueue_head(&conf->wait_for_overlap); @@ -7242,7 +7274,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) rdev_for_each(rdev, mddev) { if (test_bit(Journal, &rdev->flags)) continue; - if (blk_queue_nonrot(bdev_get_queue(rdev->bdev))) { + if (bdev_nonrot(rdev->bdev)) { conf->batch_bio_dispatch = false; break; } @@ -7302,11 +7334,13 @@ static struct r5conf *setup_conf(struct mddev *mddev) conf->level = mddev->new_level; conf->chunk_sectors = mddev->new_chunk_sectors; - if (raid5_alloc_percpu(conf) != 0) + ret = raid5_alloc_percpu(conf); + if (ret) goto abort; pr_debug("raid456: run(%s) called.\n", mdname(mddev)); + ret = -EIO; rdev_for_each(rdev, mddev) { raid_disk = rdev->raid_disk; if (raid_disk >= max_disks @@ -7317,11 +7351,11 @@ static struct r5conf *setup_conf(struct mddev *mddev) if (test_bit(Replacement, &rdev->flags)) { if (disk->replacement) goto abort; - disk->replacement = rdev; + RCU_INIT_POINTER(disk->replacement, rdev); } else { if (disk->rdev) goto abort; - disk->rdev = rdev; + RCU_INIT_POINTER(disk->rdev, rdev); } if (test_bit(In_sync, &rdev->flags)) { @@ -7370,6 +7404,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) if (grow_stripes(conf, conf->min_nr_stripes)) { pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n", mdname(mddev), memory); + ret = -ENOMEM; goto abort; } else pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory); @@ -7383,7 +7418,8 @@ static struct r5conf *setup_conf(struct mddev *mddev) conf->shrinker.count_objects = raid5_cache_count; conf->shrinker.batch = 128; conf->shrinker.flags = 0; - if (register_shrinker(&conf->shrinker)) { + ret = register_shrinker(&conf->shrinker); + if (ret) { pr_warn("md/raid:%s: couldn't register shrinker.\n", mdname(mddev)); goto abort; @@ -7394,17 +7430,16 @@ static struct r5conf *setup_conf(struct mddev *mddev) if (!conf->thread) { pr_warn("md/raid:%s: couldn't allocate thread.\n", mdname(mddev)); + ret = -ENOMEM; goto abort; } return conf; abort: - if (conf) { + if (conf) free_conf(conf); - return ERR_PTR(-EIO); - } else - return ERR_PTR(-ENOMEM); + return ERR_PTR(ret); } static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded) @@ -7621,17 +7656,18 @@ static int raid5_run(struct mddev *mddev) for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; i++) { - rdev = conf->disks[i].rdev; + rdev = rdev_mdlock_deref(mddev, conf->disks[i].rdev); if (!rdev && conf->disks[i].replacement) { /* The replacement is all we have yet */ - rdev = conf->disks[i].replacement; + rdev = rdev_mdlock_deref(mddev, + conf->disks[i].replacement); conf->disks[i].replacement = NULL; clear_bit(Replacement, &rdev->flags); - conf->disks[i].rdev = rdev; + rcu_assign_pointer(conf->disks[i].rdev, rdev); } if (!rdev) continue; - if (conf->disks[i].replacement && + if (rcu_access_pointer(conf->disks[i].replacement) && conf->reshape_progress != MaxSector) { /* replacements and reshape simply do not mix. */ pr_warn("md: cannot handle concurrent replacement and reshape.\n"); @@ -7749,7 +7785,6 @@ static int raid5_run(struct mddev *mddev) */ stripe = stripe * PAGE_SIZE; stripe = roundup_pow_of_two(stripe); - mddev->queue->limits.discard_alignment = stripe; mddev->queue->limits.discard_granularity = stripe; blk_queue_max_write_zeroes_sectors(mddev->queue, 0); @@ -7776,14 +7811,10 @@ static int raid5_run(struct mddev *mddev) * A better idea might be to turn DISCARD into WRITE_ZEROES * requests, as that is required to be safe. */ - if (devices_handle_discard_safely && - mddev->queue->limits.max_discard_sectors >= (stripe >> 9) && - mddev->queue->limits.discard_granularity >= stripe) - blk_queue_flag_set(QUEUE_FLAG_DISCARD, - mddev->queue); - else - blk_queue_flag_clear(QUEUE_FLAG_DISCARD, - mddev->queue); + if (!devices_handle_discard_safely || + mddev->queue->limits.max_discard_sectors < (stripe >> 9) || + mddev->queue->limits.discard_granularity < stripe) + blk_queue_max_discard_sectors(mddev->queue, 0); blk_queue_max_hw_sectors(mddev->queue, UINT_MAX); } @@ -7832,8 +7863,8 @@ static void raid5_status(struct seq_file *seq, struct mddev *mddev) static void print_raid5_conf (struct r5conf *conf) { + struct md_rdev *rdev; int i; - struct disk_info *tmp; pr_debug("RAID conf printout:\n"); if (!conf) { @@ -7844,50 +7875,54 @@ static void print_raid5_conf (struct r5conf *conf) conf->raid_disks, conf->raid_disks - conf->mddev->degraded); + rcu_read_lock(); for (i = 0; i < conf->raid_disks; i++) { char b[BDEVNAME_SIZE]; - tmp = conf->disks + i; - if (tmp->rdev) + rdev = rcu_dereference(conf->disks[i].rdev); + if (rdev) pr_debug(" disk %d, o:%d, dev:%s\n", - i, !test_bit(Faulty, &tmp->rdev->flags), - bdevname(tmp->rdev->bdev, b)); + i, !test_bit(Faulty, &rdev->flags), + bdevname(rdev->bdev, b)); } + rcu_read_unlock(); } static int raid5_spare_active(struct mddev *mddev) { int i; struct r5conf *conf = mddev->private; - struct disk_info *tmp; + struct md_rdev *rdev, *replacement; int count = 0; unsigned long flags; for (i = 0; i < conf->raid_disks; i++) { - tmp = conf->disks + i; - if (tmp->replacement - && tmp->replacement->recovery_offset == MaxSector - && !test_bit(Faulty, &tmp->replacement->flags) - && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { + rdev = rdev_mdlock_deref(mddev, conf->disks[i].rdev); + replacement = rdev_mdlock_deref(mddev, + conf->disks[i].replacement); + if (replacement + && replacement->recovery_offset == MaxSector + && !test_bit(Faulty, &replacement->flags) + && !test_and_set_bit(In_sync, &replacement->flags)) { /* Replacement has just become active. */ - if (!tmp->rdev - || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) + if (!rdev + || !test_and_clear_bit(In_sync, &rdev->flags)) count++; - if (tmp->rdev) { + if (rdev) { /* Replaced device not technically faulty, * but we need to be sure it gets removed * and never re-added. */ - set_bit(Faulty, &tmp->rdev->flags); + set_bit(Faulty, &rdev->flags); sysfs_notify_dirent_safe( - tmp->rdev->sysfs_state); + rdev->sysfs_state); } - sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); - } else if (tmp->rdev - && tmp->rdev->recovery_offset == MaxSector - && !test_bit(Faulty, &tmp->rdev->flags) - && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { + sysfs_notify_dirent_safe(replacement->sysfs_state); + } else if (rdev + && rdev->recovery_offset == MaxSector + && !test_bit(Faulty, &rdev->flags) + && !test_and_set_bit(In_sync, &rdev->flags)) { count++; - sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); + sysfs_notify_dirent_safe(rdev->sysfs_state); } } spin_lock_irqsave(&conf->device_lock, flags); @@ -7902,8 +7937,9 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) struct r5conf *conf = mddev->private; int err = 0; int number = rdev->raid_disk; - struct md_rdev **rdevp; + struct md_rdev __rcu **rdevp; struct disk_info *p = conf->disks + number; + struct md_rdev *tmp; print_raid5_conf(conf); if (test_bit(Journal, &rdev->flags) && conf->log) { @@ -7921,9 +7957,9 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) log_exit(conf); return 0; } - if (rdev == p->rdev) + if (rdev == rcu_access_pointer(p->rdev)) rdevp = &p->rdev; - else if (rdev == p->replacement) + else if (rdev == rcu_access_pointer(p->replacement)) rdevp = &p->replacement; else return 0; @@ -7943,18 +7979,20 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) if (!test_bit(Faulty, &rdev->flags) && mddev->recovery_disabled != conf->recovery_disabled && !has_failed(conf) && - (!p->replacement || p->replacement == rdev) && + (!rcu_access_pointer(p->replacement) || + rcu_access_pointer(p->replacement) == rdev) && number < conf->raid_disks) { err = -EBUSY; goto abort; } *rdevp = NULL; if (!test_bit(RemoveSynchronized, &rdev->flags)) { + lockdep_assert_held(&mddev->reconfig_mutex); synchronize_rcu(); if (atomic_read(&rdev->nr_pending)) { /* lost the race, try later */ err = -EBUSY; - *rdevp = rdev; + rcu_assign_pointer(*rdevp, rdev); } } if (!err) { @@ -7962,17 +8000,19 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) if (err) goto abort; } - if (p->replacement) { + + tmp = rcu_access_pointer(p->replacement); + if (tmp) { /* We must have just cleared 'rdev' */ - p->rdev = p->replacement; - clear_bit(Replacement, &p->replacement->flags); + rcu_assign_pointer(p->rdev, tmp); + clear_bit(Replacement, &tmp->flags); smp_mb(); /* Make sure other CPUs may see both as identical * but will never see neither - if they are careful */ - p->replacement = NULL; + rcu_assign_pointer(p->replacement, NULL); if (!err) - err = log_modify(conf, p->rdev, true); + err = log_modify(conf, tmp, true); } clear_bit(WantReplacement, &rdev->flags); @@ -7988,6 +8028,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) int ret, err = -EEXIST; int disk; struct disk_info *p; + struct md_rdev *tmp; int first = 0; int last = conf->raid_disks - 1; @@ -8045,7 +8086,8 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) } for (disk = first; disk <= last; disk++) { p = conf->disks + disk; - if (test_bit(WantReplacement, &p->rdev->flags) && + tmp = rdev_mdlock_deref(mddev, p->rdev); + if (test_bit(WantReplacement, &tmp->flags) && p->replacement == NULL) { clear_bit(In_sync, &rdev->flags); set_bit(Replacement, &rdev->flags); @@ -8336,6 +8378,7 @@ static void end_reshape(struct r5conf *conf) static void raid5_finish_reshape(struct mddev *mddev) { struct r5conf *conf = mddev->private; + struct md_rdev *rdev; if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { @@ -8347,10 +8390,12 @@ static void raid5_finish_reshape(struct mddev *mddev) for (d = conf->raid_disks ; d < conf->raid_disks - mddev->delta_disks; d++) { - struct md_rdev *rdev = conf->disks[d].rdev; + rdev = rdev_mdlock_deref(mddev, + conf->disks[d].rdev); if (rdev) clear_bit(In_sync, &rdev->flags); - rdev = conf->disks[d].replacement; + rdev = rdev_mdlock_deref(mddev, + conf->disks[d].replacement); if (rdev) clear_bit(In_sync, &rdev->flags); } diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 9e8486a9e445..638d29863503 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -473,7 +473,8 @@ enum { */ struct disk_info { - struct md_rdev *rdev, *replacement; + struct md_rdev __rcu *rdev; + struct md_rdev __rcu *replacement; struct page *extra_page; /* extra page to use in prexor */ }; @@ -560,6 +561,16 @@ struct r5pending_data { struct bio_list bios; }; +struct raid5_percpu { + struct page *spare_page; /* Used when checking P/Q in raid6 */ + void *scribble; /* space for constructing buffer + * lists and performing address + * conversions + */ + int scribble_obj_size; + local_lock_t lock; +}; + struct r5conf { struct hlist_head *stripe_hashtbl; /* only protect corresponding hash list and inactive_list */ @@ -635,15 +646,7 @@ struct r5conf { */ int recovery_disabled; /* per cpu variables */ - struct raid5_percpu { - struct page *spare_page; /* Used when checking P/Q in raid6 */ - void *scribble; /* space for constructing buffer - * lists and performing address - * conversions - */ - int scribble_obj_size; - local_lock_t lock; - } __percpu *percpu; + struct raid5_percpu __percpu *percpu; int scribble_disks; int scribble_sectors; struct hlist_node node; diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c index c69b2d9df6f1..a3d446005571 100644 --- a/drivers/mmc/core/queue.c +++ b/drivers/mmc/core/queue.c @@ -183,14 +183,13 @@ static void mmc_queue_setup_discard(struct request_queue *q, if (!max_discard) return; - blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); blk_queue_max_discard_sectors(q, max_discard); q->limits.discard_granularity = card->pref_erase << 9; /* granularity must not be greater than max. discard */ if (card->pref_erase > max_discard) q->limits.discard_granularity = SECTOR_SIZE; if (mmc_can_secure_erase_trim(card)) - blk_queue_flag_set(QUEUE_FLAG_SECERASE, q); + blk_queue_max_secure_erase_sectors(q, max_discard); } static unsigned short mmc_get_max_segments(struct mmc_host *host) diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 64d2b093f114..f73172111465 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -377,7 +377,6 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new) blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, new->rq); if (tr->discard) { - blk_queue_flag_set(QUEUE_FLAG_DISCARD, new->rq); blk_queue_max_discard_sectors(new->rq, UINT_MAX); new->rq->limits.discard_granularity = tr->blksize; } diff --git a/drivers/nvme/host/constants.c b/drivers/nvme/host/constants.c index 7d49eb34b348..4910543f00ff 100644 --- a/drivers/nvme/host/constants.c +++ b/drivers/nvme/host/constants.c @@ -4,7 +4,6 @@ * Copyright (c) 2022, Oracle and/or its affiliates */ -#include #include "nvme.h" #ifdef CONFIG_NVME_VERBOSE_ERRORS @@ -92,6 +91,7 @@ static const char * const nvme_statuses[] = { [NVME_SC_NS_WRITE_PROTECTED] = "Namespace is Write Protected", [NVME_SC_CMD_INTERRUPTED] = "Command Interrupted", [NVME_SC_TRANSIENT_TR_ERR] = "Transient Transport Error", + [NVME_SC_ADMIN_COMMAND_MEDIA_NOT_READY] = "Admin Command Media Not Ready", [NVME_SC_INVALID_IO_CMD_SET] = "Invalid IO Command Set", [NVME_SC_LBA_RANGE] = "LBA Out of Range", [NVME_SC_CAP_EXCEEDED] = "Capacity Exceeded", @@ -155,10 +155,13 @@ static const char * const nvme_statuses[] = { [NVME_SC_COMPARE_FAILED] = "Compare Failure", [NVME_SC_ACCESS_DENIED] = "Access Denied", [NVME_SC_UNWRITTEN_BLOCK] = "Deallocated or Unwritten Logical Block", + [NVME_SC_INTERNAL_PATH_ERROR] = "Internal Pathing Error", [NVME_SC_ANA_PERSISTENT_LOSS] = "Asymmetric Access Persistent Loss", [NVME_SC_ANA_INACCESSIBLE] = "Asymmetric Access Inaccessible", [NVME_SC_ANA_TRANSITION] = "Asymmetric Access Transition", + [NVME_SC_CTRL_PATH_ERROR] = "Controller Pathing Error", [NVME_SC_HOST_PATH_ERROR] = "Host Pathing Error", + [NVME_SC_HOST_ABORTED_CMD] = "Host Aborted Command", }; const unsigned char *nvme_get_error_status_str(u16 status) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1a984045e49c..72f7c955c707 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1207,6 +1207,7 @@ static void nvme_keep_alive_work(struct work_struct *work) rq->timeout = ctrl->kato * HZ; rq->end_io_data = ctrl; + rq->rq_flags |= RQF_QUIET; blk_execute_rq_nowait(rq, false, nvme_keep_alive_end_io); } @@ -1426,6 +1427,32 @@ out_free_id: return error; } +static int nvme_identify_ns_cs_indep(struct nvme_ctrl *ctrl, unsigned nsid, + struct nvme_id_ns_cs_indep **id) +{ + struct nvme_command c = { + .identify.opcode = nvme_admin_identify, + .identify.nsid = cpu_to_le32(nsid), + .identify.cns = NVME_ID_CNS_NS_CS_INDEP, + }; + int ret; + + *id = kmalloc(sizeof(**id), GFP_KERNEL); + if (!*id) + return -ENOMEM; + + ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id)); + if (ret) { + dev_warn(ctrl->device, + "Identify namespace (CS independent) failed (%d)\n", + ret); + kfree(*id); + return ret; + } + + return 0; +} + static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid, unsigned int dword11, void *buffer, size_t buflen, u32 *result) { @@ -1621,20 +1648,22 @@ static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns) u32 size = queue_logical_block_size(queue); if (ctrl->max_discard_sectors == 0) { - blk_queue_flag_clear(QUEUE_FLAG_DISCARD, queue); + blk_queue_max_discard_sectors(queue, 0); return; } BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) < NVME_DSM_MAX_RANGES); - queue->limits.discard_alignment = 0; queue->limits.discard_granularity = size; /* If discard is already enabled, don't reset queue limits */ - if (blk_queue_flag_test_and_set(QUEUE_FLAG_DISCARD, queue)) + if (queue->limits.max_discard_sectors) return; + if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns, UINT_MAX)) + ctrl->max_discard_sectors = nvme_lba_to_sect(ns, ctrl->dmrsl); + blk_queue_max_discard_sectors(queue, ctrl->max_discard_sectors); blk_queue_max_discard_segments(queue, ctrl->max_discard_segments); @@ -1771,7 +1800,7 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX)); } blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1); - blk_queue_dma_alignment(q, 7); + blk_queue_dma_alignment(q, 3); blk_queue_write_cache(q, vwc, vwc); } @@ -2100,10 +2129,9 @@ static const struct block_device_operations nvme_bdev_ops = { .pr_ops = &nvme_pr_ops, }; -static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) +static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 timeout, bool enabled) { - unsigned long timeout = - ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; + unsigned long timeout_jiffies = ((timeout + 1) * HZ / 2) + jiffies; u32 csts, bit = enabled ? NVME_CSTS_RDY : 0; int ret; @@ -2116,7 +2144,7 @@ static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) usleep_range(1000, 2000); if (fatal_signal_pending(current)) return -EINTR; - if (time_after(jiffies, timeout)) { + if (time_after(jiffies, timeout_jiffies)) { dev_err(ctrl->device, "Device not ready; aborting %s, CSTS=0x%x\n", enabled ? "initialisation" : "reset", csts); @@ -2147,13 +2175,14 @@ int nvme_disable_ctrl(struct nvme_ctrl *ctrl) if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY) msleep(NVME_QUIRK_DELAY_AMOUNT); - return nvme_wait_ready(ctrl, ctrl->cap, false); + return nvme_wait_ready(ctrl, NVME_CAP_TIMEOUT(ctrl->cap), false); } EXPORT_SYMBOL_GPL(nvme_disable_ctrl); int nvme_enable_ctrl(struct nvme_ctrl *ctrl) { unsigned dev_page_min; + u32 timeout; int ret; ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap); @@ -2174,6 +2203,27 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl) ctrl->ctrl_config = NVME_CC_CSS_CSI; else ctrl->ctrl_config = NVME_CC_CSS_NVM; + + if (ctrl->cap & NVME_CAP_CRMS_CRWMS) { + u32 crto; + + ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CRTO, &crto); + if (ret) { + dev_err(ctrl->device, "Reading CRTO failed (%d)\n", + ret); + return ret; + } + + if (ctrl->cap & NVME_CAP_CRMS_CRIMS) { + ctrl->ctrl_config |= NVME_CC_CRIME; + timeout = NVME_CRTO_CRIMT(crto); + } else { + timeout = NVME_CRTO_CRWMT(crto); + } + } else { + timeout = NVME_CAP_TIMEOUT(ctrl->cap); + } + ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT; ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE; ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; @@ -2182,7 +2232,7 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl) ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); if (ret) return ret; - return nvme_wait_ready(ctrl, ctrl->cap, true); + return nvme_wait_ready(ctrl, timeout, true); } EXPORT_SYMBOL_GPL(nvme_enable_ctrl); @@ -2894,8 +2944,7 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl) if (id->dmrl) ctrl->max_discard_segments = id->dmrl; - if (id->dmrsl) - ctrl->max_discard_sectors = le32_to_cpu(id->dmrsl); + ctrl->dmrsl = le32_to_cpu(id->dmrsl); if (id->wzsl) ctrl->max_zeroes_sectors = nvme_mps_to_sectors(ctrl, id->wzsl); @@ -3080,10 +3129,6 @@ int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl) if (ret) return ret; - ret = nvme_init_non_mdts_limits(ctrl); - if (ret < 0) - return ret; - ret = nvme_configure_apst(ctrl); if (ret < 0) return ret; @@ -4092,11 +4137,26 @@ out: static void nvme_validate_or_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) { struct nvme_ns_ids ids = { }; + struct nvme_id_ns_cs_indep *id; struct nvme_ns *ns; + bool ready = true; if (nvme_identify_ns_descs(ctrl, nsid, &ids)) return; + /* + * Check if the namespace is ready. If not ignore it, we will get an + * AEN once it becomes ready and restart the scan. + */ + if ((ctrl->cap & NVME_CAP_CRMS_CRIMS) && + !nvme_identify_ns_cs_indep(ctrl, nsid, &id)) { + ready = id->nstat & NVME_NSTAT_NRDY; + kfree(id); + } + + if (!ready) + return; + ns = nvme_find_get_ns(ctrl, nsid); if (ns) { nvme_validate_ns(ns, &ids); @@ -4239,11 +4299,26 @@ static void nvme_scan_work(struct work_struct *work) { struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, scan_work); + int ret; /* No tagset on a live ctrl means IO queues could not created */ if (ctrl->state != NVME_CTRL_LIVE || !ctrl->tagset) return; + /* + * Identify controller limits can change at controller reset due to + * new firmware download, even though it is not common we cannot ignore + * such scenario. Controller's non-mdts limits are reported in the unit + * of logical blocks that is dependent on the format of attached + * namespace. Hence re-read the limits at the time of ns allocation. + */ + ret = nvme_init_non_mdts_limits(ctrl); + if (ret < 0) { + dev_warn(ctrl->device, + "reading non-mdts-limits failed: %d\n", ret); + return; + } + if (test_and_clear_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events)) { dev_info(ctrl->device, "rescanning namespaces.\n"); nvme_clear_changed_ns_log(ctrl); @@ -4841,6 +4916,8 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_command) != 64); BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE); BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE); + BUILD_BUG_ON(sizeof(struct nvme_id_ns_cs_indep) != + NVME_IDENTIFY_DATA_SIZE); BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE); BUILD_BUG_ON(sizeof(struct nvme_id_ns_nvm) != NVME_IDENTIFY_DATA_SIZE); BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE); diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 1e3a09cad961..46d6e194ac2b 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -187,6 +187,14 @@ static inline char *nvmf_ctrl_subsysnqn(struct nvme_ctrl *ctrl) return ctrl->subsys->subnqn; } +static inline void nvmf_complete_timed_out_request(struct request *rq) +{ + if (blk_mq_request_started(rq) && !blk_mq_request_completed(rq)) { + nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD; + blk_mq_complete_request(rq); + } +} + int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val); int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val); int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 080f85f4105f..7ae72c7a211b 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -3831,6 +3831,9 @@ process_local_list: return count; } +static DEVICE_ATTR(nvme_discovery, 0200, NULL, nvme_fc_nvme_discovery_store); + +#ifdef CONFIG_BLK_CGROUP_FC_APPID /* Parse the cgroup id from a buf and return the length of cgrpid */ static int fc_parse_cgrpid(const char *buf, u64 *id) { @@ -3854,12 +3857,10 @@ static int fc_parse_cgrpid(const char *buf, u64 *id) } /* - * fc_update_appid: Parse and update the appid in the blkcg associated with - * cgroupid. - * @buf: buf contains both cgrpid and appid info - * @count: size of the buffer + * Parse and update the appid in the blkcg associated with the cgroupid. */ -static int fc_update_appid(const char *buf, size_t count) +static ssize_t fc_appid_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) { u64 cgrp_id; int appid_len = 0; @@ -3887,23 +3888,14 @@ static int fc_update_appid(const char *buf, size_t count) return ret; return count; } - -static ssize_t fc_appid_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t count) -{ - int ret = 0; - - ret = fc_update_appid(buf, count); - if (ret < 0) - return -EINVAL; - return count; -} -static DEVICE_ATTR(nvme_discovery, 0200, NULL, nvme_fc_nvme_discovery_store); static DEVICE_ATTR(appid_store, 0200, NULL, fc_appid_store); +#endif /* CONFIG_BLK_CGROUP_FC_APPID */ static struct attribute *nvme_fc_attrs[] = { &dev_attr_nvme_discovery.attr, +#ifdef CONFIG_BLK_CGROUP_FC_APPID &dev_attr_appid_store.attr, +#endif NULL }; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 26d35c557588..9b72b6ecf33c 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -284,6 +284,7 @@ struct nvme_ctrl { #endif u16 crdt[3]; u16 oncs; + u32 dmrsl; u16 oacs; u16 sqsize; u32 max_namespaces; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 3aacf1c0d5a5..5a98a7de0964 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1439,6 +1439,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) nvme_init_request(abort_req, &cmd); abort_req->end_io_data = NULL; + abort_req->rq_flags |= RQF_QUIET; blk_execute_rq_nowait(abort_req, false, abort_endio); /* @@ -1775,6 +1776,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset); if (IS_ERR(dev->ctrl.admin_q)) { blk_mq_free_tag_set(&dev->admin_tagset); + dev->ctrl.admin_q = NULL; return -ENOMEM; } if (!blk_get_queue(dev->ctrl.admin_q)) { @@ -2486,6 +2488,7 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) req->end_io_data = nvmeq; init_completion(&nvmeq->delete_done); + req->rq_flags |= RQF_QUIET; blk_execute_rq_nowait(req, false, opcode == nvme_admin_delete_cq ? nvme_del_cq_end : nvme_del_queue_end); return 0; @@ -2675,7 +2678,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) struct pci_dev *pdev = to_pci_dev(dev->dev); mutex_lock(&dev->shutdown_lock); - if (pci_is_enabled(pdev)) { + if (pci_device_is_present(pdev) && pci_is_enabled(pdev)) { u32 csts = readl(dev->bar + NVME_REG_CSTS); if (dev->ctrl.state == NVME_CTRL_LIVE || diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index d9f19d901313..b87c8ae41d9b 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -2010,10 +2010,7 @@ static void nvme_rdma_complete_timed_out(struct request *rq) struct nvme_rdma_queue *queue = req->queue; nvme_rdma_stop_queue(queue); - if (blk_mq_request_started(rq) && !blk_mq_request_completed(rq)) { - nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD; - blk_mq_complete_request(rq); - } + nvmf_complete_timed_out_request(rq); } static enum blk_eh_timer_return diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index ad3a2bf2f1e9..bb67538d241b 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2318,10 +2318,7 @@ static void nvme_tcp_complete_timed_out(struct request *rq) struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl; nvme_tcp_stop_queue(ctrl, nvme_tcp_queue_id(req->queue)); - if (blk_mq_request_started(rq) && !blk_mq_request_completed(rq)) { - nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD; - blk_mq_complete_request(rq); - } + nvmf_complete_timed_out_request(rq); } static enum blk_eh_timer_return diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index d886c2c59554..27a72504d31c 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -360,7 +360,7 @@ static u16 nvmet_bdev_discard_range(struct nvmet_req *req, ret = __blkdev_issue_discard(ns->bdev, nvmet_lba_to_sect(ns, range->slba), le32_to_cpu(range->nlb) << (ns->blksize_shift - 9), - GFP_KERNEL, 0, bio); + GFP_KERNEL, bio); if (ret && ret != -EOPNOTSUPP) { req->error_slba = le64_to_cpu(range->slba); return errno_to_nvme_status(req, ret); diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c index e34718b09550..82b61acf7a72 100644 --- a/drivers/nvme/target/zns.c +++ b/drivers/nvme/target/zns.c @@ -34,8 +34,7 @@ static int validate_conv_zones_cb(struct blk_zone *z, bool nvmet_bdev_zns_enable(struct nvmet_ns *ns) { - struct request_queue *q = ns->bdev->bd_disk->queue; - u8 zasl = nvmet_zasl(queue_max_zone_append_sectors(q)); + u8 zasl = nvmet_zasl(bdev_max_zone_append_sectors(ns->bdev)); struct gendisk *bd_disk = ns->bdev->bd_disk; int ret; diff --git a/drivers/s390/block/dasd_fba.c b/drivers/s390/block/dasd_fba.c index e084f4dedddd..60be7f7bf2d1 100644 --- a/drivers/s390/block/dasd_fba.c +++ b/drivers/s390/block/dasd_fba.c @@ -782,7 +782,6 @@ static void dasd_fba_setup_blk_queue(struct dasd_block *block) blk_queue_segment_boundary(q, PAGE_SIZE - 1); q->limits.discard_granularity = logical_block_size; - q->limits.discard_alignment = PAGE_SIZE; /* Calculate max_discard_sectors and make it PAGE aligned */ max_bytes = USHRT_MAX * logical_block_size; @@ -791,7 +790,6 @@ static void dasd_fba_setup_blk_queue(struct dasd_block *block) blk_queue_max_discard_sectors(q, max_discard_sectors); blk_queue_max_write_zeroes_sectors(q, max_discard_sectors); - blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); } static int dasd_fba_pe_handler(struct dasd_device *device, diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c index ba9dbb51b75f..f6b83853f7ee 100644 --- a/drivers/scsi/lpfc/lpfc_scsi.c +++ b/drivers/scsi/lpfc/lpfc_scsi.c @@ -5528,7 +5528,9 @@ static char *lpfc_is_command_vm_io(struct scsi_cmnd *cmd) { struct bio *bio = scsi_cmd_to_rq(cmd)->bio; - return bio ? blkcg_get_fc_appid(bio) : NULL; + if (!IS_ENABLED(CONFIG_BLK_CGROUP_FC_APPID) || !bio) + return NULL; + return blkcg_get_fc_appid(bio); } /** diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index dc6e55761fd1..9694e2cfaf9a 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -797,7 +797,6 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode) case SD_LBP_FULL: case SD_LBP_DISABLE: blk_queue_max_discard_sectors(q, 0); - blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q); return; case SD_LBP_UNMAP: @@ -830,7 +829,6 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode) } blk_queue_max_discard_sectors(q, max_blocks * (logical_block_size >> 9)); - blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); } static blk_status_t sd_setup_unmap_cmnd(struct scsi_cmnd *cmd) diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c index 44bb380e7390..25f33eb25337 100644 --- a/drivers/target/target_core_device.c +++ b/drivers/target/target_core_device.c @@ -829,28 +829,26 @@ struct se_device *target_alloc_device(struct se_hba *hba, const char *name) } /* - * Check if the underlying struct block_device request_queue supports - * the QUEUE_FLAG_DISCARD bit for UNMAP/WRITE_SAME in SCSI + TRIM - * in ATA and we need to set TPE=1 + * Check if the underlying struct block_device supports discard and if yes + * configure the UNMAP parameters. */ bool target_configure_unmap_from_queue(struct se_dev_attrib *attrib, - struct request_queue *q) + struct block_device *bdev) { - int block_size = queue_logical_block_size(q); + int block_size = bdev_logical_block_size(bdev); - if (!blk_queue_discard(q)) + if (!bdev_max_discard_sectors(bdev)) return false; attrib->max_unmap_lba_count = - q->limits.max_discard_sectors >> (ilog2(block_size) - 9); + bdev_max_discard_sectors(bdev) >> (ilog2(block_size) - 9); /* * Currently hardcoded to 1 in Linux/SCSI code.. */ attrib->max_unmap_block_desc_count = 1; - attrib->unmap_granularity = q->limits.discard_granularity / block_size; - attrib->unmap_granularity_alignment = q->limits.discard_alignment / - block_size; - attrib->unmap_zeroes_data = !!(q->limits.max_write_zeroes_sectors); + attrib->unmap_granularity = bdev_discard_granularity(bdev) / block_size; + attrib->unmap_granularity_alignment = + bdev_discard_alignment(bdev) / block_size; return true; } EXPORT_SYMBOL(target_configure_unmap_from_queue); diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c index 8190b840065f..e68f1cc8ef98 100644 --- a/drivers/target/target_core_file.c +++ b/drivers/target/target_core_file.c @@ -134,10 +134,10 @@ static int fd_configure_device(struct se_device *dev) */ inode = file->f_mapping->host; if (S_ISBLK(inode->i_mode)) { - struct request_queue *q = bdev_get_queue(I_BDEV(inode)); + struct block_device *bdev = I_BDEV(inode); unsigned long long dev_size; - fd_dev->fd_block_size = bdev_logical_block_size(I_BDEV(inode)); + fd_dev->fd_block_size = bdev_logical_block_size(bdev); /* * Determine the number of bytes from i_size_read() minus * one (1) logical sector from underlying struct block_device @@ -150,7 +150,7 @@ static int fd_configure_device(struct se_device *dev) dev_size, div_u64(dev_size, fd_dev->fd_block_size), fd_dev->fd_block_size); - if (target_configure_unmap_from_queue(&dev->dev_attrib, q)) + if (target_configure_unmap_from_queue(&dev->dev_attrib, bdev)) pr_debug("IFILE: BLOCK Discard support available," " disabled by default\n"); /* @@ -159,7 +159,7 @@ static int fd_configure_device(struct se_device *dev) */ dev->dev_attrib.max_write_same_len = 0xFFFF; - if (blk_queue_nonrot(q)) + if (bdev_nonrot(bdev)) dev->dev_attrib.is_nonrot = 1; } else { if (!(fd_dev->fbd_flags & FBDF_HAS_SIZE)) { @@ -558,7 +558,7 @@ fd_execute_unmap(struct se_cmd *cmd, sector_t lba, sector_t nolb) ret = blkdev_issue_discard(bdev, target_to_linux_sector(dev, lba), target_to_linux_sector(dev, nolb), - GFP_KERNEL, 0); + GFP_KERNEL); if (ret < 0) { pr_warn("FILEIO: blkdev_issue_discard() failed: %d\n", ret); diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index 87ede165ddba..378c80313a0f 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -119,7 +119,7 @@ static int iblock_configure_device(struct se_device *dev) dev->dev_attrib.hw_max_sectors = queue_max_hw_sectors(q); dev->dev_attrib.hw_queue_depth = q->nr_requests; - if (target_configure_unmap_from_queue(&dev->dev_attrib, q)) + if (target_configure_unmap_from_queue(&dev->dev_attrib, bd)) pr_debug("IBLOCK: BLOCK Discard support available," " disabled by default\n"); @@ -133,7 +133,7 @@ static int iblock_configure_device(struct se_device *dev) else dev->dev_attrib.max_write_same_len = 0xFFFF; - if (blk_queue_nonrot(q)) + if (bdev_nonrot(bd)) dev->dev_attrib.is_nonrot = 1; bi = bdev_get_integrity(bd); @@ -434,7 +434,7 @@ iblock_execute_unmap(struct se_cmd *cmd, sector_t lba, sector_t nolb) ret = blkdev_issue_discard(bdev, target_to_linux_sector(dev, lba), target_to_linux_sector(dev, nolb), - GFP_KERNEL, 0); + GFP_KERNEL); if (ret < 0) { pr_err("blkdev_issue_discard() failed: %d\n", ret); return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; @@ -727,17 +727,16 @@ iblock_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, if (data_direction == DMA_TO_DEVICE) { struct iblock_dev *ib_dev = IBLOCK_DEV(dev); - struct request_queue *q = bdev_get_queue(ib_dev->ibd_bd); /* * Force writethrough using REQ_FUA if a volatile write cache * is not enabled, or if initiator set the Force Unit Access bit. */ opf = REQ_OP_WRITE; miter_dir = SG_MITER_TO_SG; - if (test_bit(QUEUE_FLAG_FUA, &q->queue_flags)) { + if (bdev_fua(ib_dev->ibd_bd)) { if (cmd->se_cmd_flags & SCF_FUA) opf |= REQ_FUA; - else if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) + else if (!bdev_write_cache(ib_dev->ibd_bd)) opf |= REQ_FUA; } } else { @@ -886,11 +885,7 @@ iblock_parse_cdb(struct se_cmd *cmd) static bool iblock_get_write_cache(struct se_device *dev) { - struct iblock_dev *ib_dev = IBLOCK_DEV(dev); - struct block_device *bd = ib_dev->ibd_bd; - struct request_queue *q = bdev_get_queue(bd); - - return test_bit(QUEUE_FLAG_WC, &q->queue_flags); + return bdev_write_cache(IBLOCK_DEV(dev)->ibd_bd); } static const struct target_backend_ops iblock_ops = { diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 60dafe4c581b..bb3fb18b2316 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -818,24 +818,8 @@ static ssize_t pscsi_show_configfs_dev_params(struct se_device *dev, char *b) static void pscsi_bi_endio(struct bio *bio) { - bio_put(bio); -} - -static inline struct bio *pscsi_get_bio(int nr_vecs) -{ - struct bio *bio; - /* - * Use bio_malloc() following the comment in for bio -> struct request - * in block/blk-core.c:blk_make_request() - */ - bio = bio_kmalloc(GFP_KERNEL, nr_vecs); - if (!bio) { - pr_err("PSCSI: bio_kmalloc() failed\n"); - return NULL; - } - bio->bi_end_io = pscsi_bi_endio; - - return bio; + bio_uninit(bio); + kfree(bio); } static sense_reason_t @@ -878,15 +862,12 @@ pscsi_map_sg(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, if (!bio) { new_bio: nr_vecs = bio_max_segs(nr_pages); - /* - * Calls bio_kmalloc() and sets bio->bi_end_io() - */ - bio = pscsi_get_bio(nr_vecs); + bio = bio_kmalloc(nr_vecs, GFP_KERNEL); if (!bio) goto fail; - - if (rw) - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, + rw ? REQ_OP_WRITE : REQ_OP_READ); + bio->bi_end_io = pscsi_bi_endio; pr_debug("PSCSI: Allocated bio: %p," " dir: %s nr_vecs: %d\n", bio, @@ -912,11 +893,6 @@ new_bio: goto fail; } - /* - * Clear the pointer so that another bio will - * be allocated with pscsi_get_bio() above. - */ - bio = NULL; goto new_bio; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 31c3f592e587..84795d831282 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4238,6 +4238,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) */ static void btrfs_end_empty_barrier(struct bio *bio) { + bio_uninit(bio); complete(bio->bi_private); } @@ -4247,7 +4248,7 @@ static void btrfs_end_empty_barrier(struct bio *bio) */ static void write_dev_flush(struct btrfs_device *device) { - struct bio *bio = device->flush_bio; + struct bio *bio = &device->flush_bio; #ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY /* @@ -4260,12 +4261,12 @@ static void write_dev_flush(struct btrfs_device *device) * of simplicity, since this is a debug tool and not meant for use in * non-debug builds. */ - struct request_queue *q = bdev_get_queue(device->bdev); - if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) + if (!bdev_write_cache(device->bdev)) return; #endif - bio_reset(bio, device->bdev, REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH); + bio_init(bio, device->bdev, NULL, 0, + REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH); bio->bi_end_io = btrfs_end_empty_barrier; init_completion(&device->flush_wait); bio->bi_private = &device->flush_wait; @@ -4279,7 +4280,7 @@ static void write_dev_flush(struct btrfs_device *device) */ static blk_status_t wait_dev_flush(struct btrfs_device *device) { - struct bio *bio = device->flush_bio; + struct bio *bio = &device->flush_bio; if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)) return BLK_STS_OK; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6aa92f84f465..6260784e74b5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1239,7 +1239,7 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, if (size) { ret = blkdev_issue_discard(bdev, start >> 9, size >> 9, - GFP_NOFS, 0); + GFP_NOFS); if (!ret) *discarded_bytes += size; else if (ret != -EOPNOTSUPP) @@ -1256,7 +1256,7 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, if (bytes_left) { ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9, - GFP_NOFS, 0); + GFP_NOFS); if (!ret) *discarded_bytes += bytes_left; } @@ -1291,7 +1291,7 @@ static int do_discard_extent(struct btrfs_io_stripe *stripe, u64 *bytes) ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len, &discarded); discarded += src_disc; - } else if (blk_queue_discard(bdev_get_queue(stripe->dev->bdev))) { + } else if (bdev_max_discard_sectors(stripe->dev->bdev)) { ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded); } else { ret = 0; @@ -5987,7 +5987,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) *trimmed = 0; /* Discard not supported = nothing to do. */ - if (!blk_queue_discard(bdev_get_queue(device->bdev))) + if (!bdev_max_discard_sectors(device->bdev)) return 0; /* Not writable = nothing to do. */ diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index be6c24577dbe..4b28aaea2702 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -468,7 +468,6 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_device *device; - struct request_queue *q; struct fstrim_range range; u64 minlen = ULLONG_MAX; u64 num_devices = 0; @@ -498,14 +497,11 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, rcu_read_lock(); list_for_each_entry_rcu(device, &fs_info->fs_devices->devices, dev_list) { - if (!device->bdev) + if (!device->bdev || !bdev_max_discard_sectors(device->bdev)) continue; - q = bdev_get_queue(device->bdev); - if (blk_queue_discard(q)) { - num_devices++; - minlen = min_t(u64, q->limits.discard_granularity, - minlen); - } + num_devices++; + minlen = min_t(u64, bdev_discard_granularity(device->bdev), + minlen); } rcu_read_unlock(); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a8cc736731fd..b6b00338037c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -405,7 +405,6 @@ void btrfs_free_device(struct btrfs_device *device) WARN_ON(!list_empty(&device->post_commit_list)); rcu_string_free(device->name); extent_io_tree_release(&device->alloc_state); - bio_put(device->flush_bio); btrfs_destroy_dev_zone_info(device); kfree(device); } @@ -643,7 +642,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); } - if (!blk_queue_nonrot(bdev_get_queue(bdev))) + if (!bdev_nonrot(bdev)) fs_devices->rotating = true; device->bdev = bdev; @@ -2706,7 +2705,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path atomic64_add(device->total_bytes, &fs_info->free_chunk_space); - if (!blk_queue_nonrot(bdev_get_queue(bdev))) + if (!bdev_nonrot(bdev)) fs_devices->rotating = true; orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); @@ -6949,16 +6948,6 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, if (!dev) return ERR_PTR(-ENOMEM); - /* - * Preallocate a bio that's always going to be used for flushing device - * barriers and matches the device lifespan - */ - dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0); - if (!dev->flush_bio) { - kfree(dev); - return ERR_PTR(-ENOMEM); - } - INIT_LIST_HEAD(&dev->dev_list); INIT_LIST_HEAD(&dev->dev_alloc_list); INIT_LIST_HEAD(&dev->post_commit_list); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index f3e28f11cfb6..b11c563d2025 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -121,8 +121,8 @@ struct btrfs_device { /* bytes used on the current transaction */ u64 commit_bytes_used; - /* for sending down flush barriers */ - struct bio *flush_bio; + /* Bio used for flushing device barriers */ + struct bio flush_bio; struct completion flush_wait; /* per-device scrub information */ diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index d31b0eda210f..29b54fd9c128 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -350,7 +350,6 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_zoned_device_info *zone_info = NULL; struct block_device *bdev = device->bdev; - struct request_queue *queue = bdev_get_queue(bdev); unsigned int max_active_zones; unsigned int nactive; sector_t nr_sectors; @@ -410,7 +409,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) if (!IS_ALIGNED(nr_sectors, zone_sectors)) zone_info->nr_zones++; - max_active_zones = queue_max_active_zones(queue); + max_active_zones = bdev_max_active_zones(bdev); if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) { btrfs_err_in_rcu(fs_info, "zoned: %s: max active zones %u is too small, need at least %u active zones", diff --git a/fs/direct-io.c b/fs/direct-io.c index aef06e607b40..840752006f60 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1115,11 +1115,10 @@ static inline int drop_refcount(struct dio *dio) * individual fields and will generate much worse code. This is important * for the whole file. */ -static inline ssize_t -do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, - struct block_device *bdev, struct iov_iter *iter, - get_block_t get_block, dio_iodone_t end_io, - dio_submit_t submit_io, int flags) +ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, + struct block_device *bdev, struct iov_iter *iter, + get_block_t get_block, dio_iodone_t end_io, + dio_submit_t submit_io, int flags) { unsigned i_blkbits = READ_ONCE(inode->i_blkbits); unsigned blkbits = i_blkbits; @@ -1334,29 +1333,6 @@ fail_dio: kmem_cache_free(dio_cache, dio); return retval; } - -ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, - struct block_device *bdev, struct iov_iter *iter, - get_block_t get_block, - dio_iodone_t end_io, dio_submit_t submit_io, - int flags) -{ - /* - * The block device state is needed in the end to finally - * submit everything. Since it's likely to be cache cold - * prefetch it here as first thing to hide some of the - * latency. - * - * Attempt to prefetch the pieces we likely need later. - */ - prefetch(&bdev->bd_disk->part_tbl); - prefetch(bdev->bd_disk->queue); - prefetch((char *)bdev->bd_disk->queue + SMP_CACHE_BYTES); - - return do_blockdev_direct_IO(iocb, inode, bdev, iter, get_block, - end_io, submit_io, flags); -} - EXPORT_SYMBOL(__blockdev_direct_IO); static __init int dio_init(void) diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 2f5130059236..20d4e47f57ab 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -351,21 +351,20 @@ out: static int exfat_ioctl_fitrim(struct inode *inode, unsigned long arg) { - struct request_queue *q = bdev_get_queue(inode->i_sb->s_bdev); struct fstrim_range range; int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!blk_queue_discard(q)) + if (!bdev_max_discard_sectors(inode->i_sb->s_bdev)) return -EOPNOTSUPP; if (copy_from_user(&range, (struct fstrim_range __user *)arg, sizeof(range))) return -EFAULT; range.minlen = max_t(unsigned int, range.minlen, - q->limits.discard_granularity); + bdev_discard_granularity(inode->i_sb->s_bdev)); ret = exfat_trim_fs(inode, &range); if (ret < 0) diff --git a/fs/exfat/super.c b/fs/exfat/super.c index 8ca21e7917d1..be0788ecaf20 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -627,13 +627,9 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc) if (opts->allow_utime == (unsigned short)-1) opts->allow_utime = ~opts->fs_dmask & 0022; - if (opts->discard) { - struct request_queue *q = bdev_get_queue(sb->s_bdev); - - if (!blk_queue_discard(q)) { - exfat_warn(sb, "mounting with \"discard\" option, but the device does not support discard"); - opts->discard = 0; - } + if (opts->discard && !bdev_max_discard_sectors(sb->s_bdev)) { + exfat_warn(sb, "mounting with \"discard\" option, but the device does not support discard"); + opts->discard = 0; } sb->s_flags |= SB_NODIRATIME; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index ba44fa1be70a..4d1d2326eee9 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1044,7 +1044,6 @@ static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg) __u32 flags = 0; unsigned int flush_flags = 0; struct super_block *sb = file_inode(filp)->i_sb; - struct request_queue *q; if (copy_from_user(&flags, (__u32 __user *)arg, sizeof(__u32))) @@ -1065,10 +1064,8 @@ static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg) if (flags & ~EXT4_IOC_CHECKPOINT_FLAG_VALID) return -EINVAL; - q = bdev_get_queue(EXT4_SB(sb)->s_journal->j_dev); - if (!q) - return -ENXIO; - if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && !blk_queue_discard(q)) + if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && + !bdev_max_discard_sectors(EXT4_SB(sb)->s_journal->j_dev)) return -EOPNOTSUPP; if (flags & EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN) @@ -1393,14 +1390,13 @@ resizefs_out: case FITRIM: { - struct request_queue *q = bdev_get_queue(sb->s_bdev); struct fstrim_range range; int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!blk_queue_discard(q)) + if (!bdev_max_discard_sectors(sb->s_bdev)) return -EOPNOTSUPP; /* diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 252c168454c7..ea653d19f9ec 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3498,7 +3498,7 @@ int ext4_mb_init(struct super_block *sb) spin_lock_init(&lg->lg_prealloc_lock); } - if (blk_queue_nonrot(bdev_get_queue(sb->s_bdev))) + if (bdev_nonrot(sb->s_bdev)) sbi->s_mb_max_linear_groups = 0; else sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT; @@ -3629,7 +3629,7 @@ static inline int ext4_issue_discard(struct super_block *sb, return __blkdev_issue_discard(sb->s_bdev, (sector_t)discard_block << (sb->s_blocksize_bits - 9), (sector_t)count << (sb->s_blocksize_bits - 9), - GFP_NOFS, 0, biop); + GFP_NOFS, biop); } else return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); } @@ -6455,7 +6455,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, */ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) { - struct request_queue *q = bdev_get_queue(sb->s_bdev); + unsigned int discard_granularity = bdev_discard_granularity(sb->s_bdev); struct ext4_group_info *grp; ext4_group_t group, first_group, last_group; ext4_grpblk_t cnt = 0, first_cluster, last_cluster; @@ -6475,9 +6475,9 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) range->len < sb->s_blocksize) return -EINVAL; /* No point to try to trim less than discard granularity */ - if (range->minlen < q->limits.discard_granularity) { + if (range->minlen < discard_granularity) { minlen = EXT4_NUM_B2C(EXT4_SB(sb), - q->limits.discard_granularity >> sb->s_blocksize_bits); + discard_granularity >> sb->s_blocksize_bits); if (minlen > EXT4_CLUSTERS_PER_GROUP(sb)) goto out; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1466fbdbc8e3..6900da973ce2 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5474,13 +5474,9 @@ no_journal: goto failed_mount9; } - if (test_opt(sb, DISCARD)) { - struct request_queue *q = bdev_get_queue(sb->s_bdev); - if (!blk_queue_discard(q)) - ext4_msg(sb, KERN_WARNING, - "mounting with \"discard\" option, but " - "the device does not support discard"); - } + if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev)) + ext4_msg(sb, KERN_WARNING, + "mounting with \"discard\" option, but the device does not support discard"); if (es->s_error_count) mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8c570de21ed5..2b2b3c87e45e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4372,8 +4372,7 @@ static inline bool f2fs_hw_should_discard(struct f2fs_sb_info *sbi) static inline bool f2fs_bdev_support_discard(struct block_device *bdev) { - return blk_queue_discard(bdev_get_queue(bdev)) || - bdev_is_zoned(bdev); + return bdev_max_discard_sectors(bdev) || bdev_is_zoned(bdev); } static inline bool f2fs_hw_support_discard(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5b89af0f27f0..35b6c720c2bc 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2285,7 +2285,6 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; - struct request_queue *q = bdev_get_queue(sb->s_bdev); struct fstrim_range range; int ret; @@ -2304,7 +2303,7 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) return ret; range.minlen = max((unsigned int)range.minlen, - q->limits.discard_granularity); + bdev_discard_granularity(sb->s_bdev)); ret = f2fs_trim_fs(F2FS_SB(sb), &range); mnt_drop_write_file(filp); if (ret < 0) @@ -3686,18 +3685,18 @@ out: static int f2fs_secure_erase(struct block_device *bdev, struct inode *inode, pgoff_t off, block_t block, block_t len, u32 flags) { - struct request_queue *q = bdev_get_queue(bdev); sector_t sector = SECTOR_FROM_BLOCK(block); sector_t nr_sects = SECTOR_FROM_BLOCK(len); int ret = 0; - if (!q) - return -ENXIO; - - if (flags & F2FS_TRIM_FILE_DISCARD) - ret = blkdev_issue_discard(bdev, sector, nr_sects, GFP_NOFS, - blk_queue_secure_erase(q) ? - BLKDEV_DISCARD_SECURE : 0); + if (flags & F2FS_TRIM_FILE_DISCARD) { + if (bdev_max_secure_erase_sectors(bdev)) + ret = blkdev_issue_secure_erase(bdev, sector, nr_sects, + GFP_NOFS); + else + ret = blkdev_issue_discard(bdev, sector, nr_sects, + GFP_NOFS); + } if (!ret && (flags & F2FS_TRIM_FILE_ZEROOUT)) { if (IS_ENCRYPTED(inode)) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index bd9731cdec56..7225ce09f3ab 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1196,9 +1196,8 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, unsigned int *issued) { struct block_device *bdev = dc->bdev; - struct request_queue *q = bdev_get_queue(bdev); unsigned int max_discard_blocks = - SECTOR_TO_BLOCK(q->limits.max_discard_sectors); + SECTOR_TO_BLOCK(bdev_max_discard_sectors(bdev)); struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? &(dcc->fstrim_list) : &(dcc->wait_list); @@ -1245,7 +1244,7 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, err = __blkdev_issue_discard(bdev, SECTOR_FROM_BLOCK(start), SECTOR_FROM_BLOCK(len), - GFP_NOFS, 0, &bio); + GFP_NOFS, &bio); submit: if (err) { spin_lock_irqsave(&dc->lock, flags); @@ -1375,9 +1374,8 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, struct discard_cmd *dc; struct discard_info di = {0}; struct rb_node **insert_p = NULL, *insert_parent = NULL; - struct request_queue *q = bdev_get_queue(bdev); unsigned int max_discard_blocks = - SECTOR_TO_BLOCK(q->limits.max_discard_sectors); + SECTOR_TO_BLOCK(bdev_max_discard_sectors(bdev)); block_t end = lstart + len; dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root, diff --git a/fs/fat/file.c b/fs/fat/file.c index a5a309fcc7fa..bf91f977debe 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -127,13 +127,12 @@ static int fat_ioctl_fitrim(struct inode *inode, unsigned long arg) struct super_block *sb = inode->i_sb; struct fstrim_range __user *user_range; struct fstrim_range range; - struct request_queue *q = bdev_get_queue(sb->s_bdev); int err; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!blk_queue_discard(q)) + if (!bdev_max_discard_sectors(sb->s_bdev)) return -EOPNOTSUPP; user_range = (struct fstrim_range __user *)arg; @@ -141,7 +140,7 @@ static int fat_ioctl_fitrim(struct inode *inode, unsigned long arg) return -EFAULT; range.minlen = max_t(unsigned int, range.minlen, - q->limits.discard_granularity); + bdev_discard_granularity(sb->s_bdev)); err = fat_trim_fs(inode, &range); if (err < 0) diff --git a/fs/fat/inode.c b/fs/fat/inode.c index bf6051bdf1d1..3d1afb95a925 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1872,13 +1872,9 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, goto out_fail; } - if (sbi->options.discard) { - struct request_queue *q = bdev_get_queue(sb->s_bdev); - if (!blk_queue_discard(q)) - fat_msg(sb, KERN_WARNING, - "mounting with \"discard\" option, but " - "the device does not support discard"); - } + if (sbi->options.discard && !bdev_max_discard_sectors(sb->s_bdev)) + fat_msg(sb, KERN_WARNING, + "mounting with \"discard\" option, but the device does not support discard"); fat_set_state(sb, 1, 0); return 0; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 1fae0196292a..a1074a26e784 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1779,11 +1779,12 @@ static long writeback_sb_inodes(struct super_block *sb, }; unsigned long start_time = jiffies; long write_chunk; - long wrote = 0; /* count both pages and inodes */ + long total_wrote = 0; /* count both pages and inodes */ while (!list_empty(&wb->b_io)) { struct inode *inode = wb_inode(wb->b_io.prev); struct bdi_writeback *tmp_wb; + long wrote; if (inode->i_sb != sb) { if (work->sb) { @@ -1859,7 +1860,9 @@ static long writeback_sb_inodes(struct super_block *sb, wbc_detach_inode(&wbc); work->nr_pages -= write_chunk - wbc.nr_to_write; - wrote += write_chunk - wbc.nr_to_write; + wrote = write_chunk - wbc.nr_to_write - wbc.pages_skipped; + wrote = wrote < 0 ? 0 : wrote; + total_wrote += wrote; if (need_resched()) { /* @@ -1881,7 +1884,7 @@ static long writeback_sb_inodes(struct super_block *sb, tmp_wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); if (!(inode->i_state & I_DIRTY_ALL)) - wrote++; + total_wrote++; requeue_inode(inode, tmp_wb, &wbc); inode_sync_complete(inode); spin_unlock(&inode->i_lock); @@ -1895,14 +1898,14 @@ static long writeback_sb_inodes(struct super_block *sb, * bail out to wb_writeback() often enough to check * background threshold and other termination conditions. */ - if (wrote) { + if (total_wrote) { if (time_is_before_jiffies(start_time + HZ / 10UL)) break; if (work->nr_pages <= 0) break; } } - return wrote; + return total_wrote; } static long __writeback_inodes_wb(struct bdi_writeback *wb, diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 801ad9f4f2be..6d26bb525484 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1386,7 +1386,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp) { struct inode *inode = file_inode(filp); struct gfs2_sbd *sdp = GFS2_SB(inode); - struct request_queue *q = bdev_get_queue(sdp->sd_vfs->s_bdev); + struct block_device *bdev = sdp->sd_vfs->s_bdev; struct buffer_head *bh; struct gfs2_rgrpd *rgd; struct gfs2_rgrpd *rgd_end; @@ -1405,7 +1405,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp) if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) return -EROFS; - if (!blk_queue_discard(q)) + if (!bdev_max_discard_sectors(bdev)) return -EOPNOTSUPP; if (copy_from_user(&r, argp, sizeof(r))) @@ -1418,8 +1418,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp) start = r.start >> bs_shift; end = start + (r.len >> bs_shift); minlen = max_t(u64, r.minlen, sdp->sd_sb.sb_bsize); - minlen = max_t(u64, minlen, - q->limits.discard_granularity) >> bs_shift; + minlen = max_t(u64, minlen, bdev_discard_granularity(bdev)) >> bs_shift; if (end <= start || minlen > sdp->sd_max_rg_data) return -EINVAL; diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index b08f5dc31780..80f9b047aa1b 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -56,7 +56,8 @@ static void iomap_dio_submit_bio(const struct iomap_iter *iter, { atomic_inc(&dio->ref); - if (dio->iocb->ki_flags & IOCB_HIPRI) { + /* Sync dio can't be polled reliably */ + if ((dio->iocb->ki_flags & IOCB_HIPRI) && !is_sync_kiocb(dio->iocb)) { bio_set_polled(bio, dio->iocb); dio->submit.poll_bio = bio; } @@ -265,8 +266,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, * cache flushes on IO completion. */ if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) && - (dio->flags & IOMAP_DIO_WRITE_FUA) && - blk_queue_fua(bdev_get_queue(iomap->bdev))) + (dio->flags & IOMAP_DIO_WRITE_FUA) && bdev_fua(iomap->bdev)) use_fua = true; } @@ -654,9 +654,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (!READ_ONCE(dio->submit.waiter)) break; - if (!dio->submit.poll_bio || - !bio_poll(dio->submit.poll_bio, NULL, 0)) - blk_io_schedule(); + blk_io_schedule(); } __set_current_state(TASK_RUNNING); } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index fcacafa4510d..c0cbeeaec2d1 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1762,7 +1762,6 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags) unsigned long block, log_offset; /* logical */ unsigned long long phys_block, block_start, block_stop; /* physical */ loff_t byte_start, byte_stop, byte_count; - struct request_queue *q = bdev_get_queue(journal->j_dev); /* flags must be set to either discard or zeroout */ if ((flags & ~JBD2_JOURNAL_FLUSH_VALID) || !flags || @@ -1770,10 +1769,8 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags) (flags & JBD2_JOURNAL_FLUSH_ZEROOUT))) return -EINVAL; - if (!q) - return -ENXIO; - - if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && !blk_queue_discard(q)) + if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && + !bdev_max_discard_sectors(journal->j_dev)) return -EOPNOTSUPP; /* @@ -1828,7 +1825,7 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags) err = blkdev_issue_discard(journal->j_dev, byte_start >> SECTOR_SHIFT, byte_count >> SECTOR_SHIFT, - GFP_NOFS, 0); + GFP_NOFS); } else if (flags & JBD2_JOURNAL_FLUSH_ZEROOUT) { err = blkdev_issue_zeroout(journal->j_dev, byte_start >> SECTOR_SHIFT, diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index 03a845ab4f00..1e7b177ece60 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -110,14 +110,13 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case FITRIM: { struct super_block *sb = inode->i_sb; - struct request_queue *q = bdev_get_queue(sb->s_bdev); struct fstrim_range range; s64 ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!blk_queue_discard(q)) { + if (!bdev_max_discard_sectors(sb->s_bdev)) { jfs_warn("FITRIM not supported on device"); return -EOPNOTSUPP; } @@ -127,7 +126,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return -EFAULT; range.minlen = max_t(unsigned int, range.minlen, - q->limits.discard_granularity); + bdev_discard_granularity(sb->s_bdev)); ret = jfs_ioc_trim(inode, &range); if (ret < 0) diff --git a/fs/jfs/super.c b/fs/jfs/super.c index f1a13a74cddf..85d4f44f2ac4 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -372,19 +372,16 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, } case Opt_discard: - { - struct request_queue *q = bdev_get_queue(sb->s_bdev); /* if set to 1, even copying files will cause * trimming :O * -> user has more control over the online trimming */ sbi->minblks_trim = 64; - if (blk_queue_discard(q)) + if (bdev_max_discard_sectors(sb->s_bdev)) *flag |= JFS_DISCARD; else pr_err("JFS: discard option not supported on device\n"); break; - } case Opt_nodiscard: *flag &= ~JFS_DISCARD; @@ -392,10 +389,9 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, case Opt_discard_minblk: { - struct request_queue *q = bdev_get_queue(sb->s_bdev); char *minblks_trim = args[0].from; int rc; - if (blk_queue_discard(q)) { + if (bdev_max_discard_sectors(sb->s_bdev)) { *flag |= JFS_DISCARD; rc = kstrtouint(minblks_trim, 0, &sbi->minblks_trim); diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index fec194a666f4..87e1004b606d 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -1052,20 +1052,20 @@ out: static int nilfs_ioctl_trim_fs(struct inode *inode, void __user *argp) { struct the_nilfs *nilfs = inode->i_sb->s_fs_info; - struct request_queue *q = bdev_get_queue(nilfs->ns_bdev); struct fstrim_range range; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!blk_queue_discard(q)) + if (!bdev_max_discard_sectors(nilfs->ns_bdev)) return -EOPNOTSUPP; if (copy_from_user(&range, argp, sizeof(range))) return -EFAULT; - range.minlen = max_t(u64, range.minlen, q->limits.discard_granularity); + range.minlen = max_t(u64, range.minlen, + bdev_discard_granularity(nilfs->ns_bdev)); down_read(&nilfs->ns_segctor_sem); ret = nilfs_sufile_trim_fs(nilfs->ns_sufile, &range); diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index e385cca2004a..77ff8e95421f 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -1100,7 +1100,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range) ret = blkdev_issue_discard(nilfs->ns_bdev, start * sects_per_block, nblocks * sects_per_block, - GFP_NOFS, 0); + GFP_NOFS); if (ret < 0) { put_bh(su_bh); goto out_sem; @@ -1134,7 +1134,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range) ret = blkdev_issue_discard(nilfs->ns_bdev, start * sects_per_block, nblocks * sects_per_block, - GFP_NOFS, 0); + GFP_NOFS); if (!ret) ndiscarded += nblocks; } diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index dd48a8f74d57..3b4a079c9617 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -672,7 +672,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump, ret = blkdev_issue_discard(nilfs->ns_bdev, start * sects_per_block, nblocks * sects_per_block, - GFP_NOFS, 0); + GFP_NOFS); if (ret < 0) return ret; nblocks = 0; @@ -682,7 +682,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump, ret = blkdev_issue_discard(nilfs->ns_bdev, start * sects_per_block, nblocks * sects_per_block, - GFP_NOFS, 0); + GFP_NOFS); return ret; } diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 787b53b984ee..15806eeae217 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -22,20 +22,20 @@ static int ntfs_ioctl_fitrim(struct ntfs_sb_info *sbi, unsigned long arg) { struct fstrim_range __user *user_range; struct fstrim_range range; - struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); int err; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!blk_queue_discard(q)) + if (!bdev_max_discard_sectors(sbi->sb->s_bdev)) return -EOPNOTSUPP; user_range = (struct fstrim_range __user *)arg; if (copy_from_user(&range, user_range, sizeof(range))) return -EFAULT; - range.minlen = max_t(u32, range.minlen, q->limits.discard_granularity); + range.minlen = max_t(u32, range.minlen, + bdev_discard_granularity(sbi->sb->s_bdev)); err = ntfs_trim_fs(sbi, &range); if (err < 0) diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index 278dcf502410..5781b9e8e3d8 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -882,7 +882,6 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) int err; struct ntfs_sb_info *sbi = sb->s_fs_info; struct block_device *bdev = sb->s_bdev; - struct request_queue *rq; struct inode *inode; struct ntfs_inode *ni; size_t i, tt; @@ -912,15 +911,14 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) goto out; } - rq = bdev_get_queue(bdev); - if (blk_queue_discard(rq) && rq->limits.discard_granularity) { - sbi->discard_granularity = rq->limits.discard_granularity; + if (bdev_max_discard_sectors(bdev) && bdev_discard_granularity(bdev)) { + sbi->discard_granularity = bdev_discard_granularity(bdev); sbi->discard_granularity_mask_inv = ~(u64)(sbi->discard_granularity - 1); } /* Parse boot. */ - err = ntfs_init_from_boot(sb, rq ? queue_logical_block_size(rq) : 512, + err = ntfs_init_from_boot(sb, bdev_logical_block_size(bdev), bdev_nr_bytes(bdev)); if (err) goto out; @@ -1335,7 +1333,7 @@ int ntfs_discard(struct ntfs_sb_info *sbi, CLST lcn, CLST len) return 0; err = blkdev_issue_discard(sb->s_bdev, start >> 9, (end - start) >> 9, - GFP_NOFS, 0); + GFP_NOFS); if (err == -EOPNOTSUPP) sbi->flags |= NTFS_FLAGS_NODISCARD; diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index f59461d85da4..afd54ec66103 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -903,20 +903,19 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case FITRIM: { struct super_block *sb = inode->i_sb; - struct request_queue *q = bdev_get_queue(sb->s_bdev); struct fstrim_range range; int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!blk_queue_discard(q)) + if (!bdev_max_discard_sectors(sb->s_bdev)) return -EOPNOTSUPP; if (copy_from_user(&range, argp, sizeof(range))) return -EFAULT; - range.minlen = max_t(u64, q->limits.discard_granularity, + range.minlen = max_t(u64, bdev_discard_granularity(sb->s_bdev), range.minlen); ret = ocfs2_trim_fs(sb, &range); if (ret < 0) diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 622c844f6d11..8879d052f96c 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -86,17 +86,10 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length, int error, i; struct bio *bio; - if (page_count <= BIO_MAX_VECS) { - bio = bio_alloc(sb->s_bdev, page_count, REQ_OP_READ, GFP_NOIO); - } else { - bio = bio_kmalloc(GFP_NOIO, page_count); - bio_set_dev(bio, sb->s_bdev); - bio->bi_opf = REQ_OP_READ; - } - + bio = bio_kmalloc(page_count, GFP_NOIO); if (!bio) return -ENOMEM; - + bio_init(bio, sb->s_bdev, bio->bi_inline_vecs, page_count, REQ_OP_READ); bio->bi_iter.bi_sector = block * (msblk->devblksize >> SECTOR_SHIFT); for (i = 0; i < page_count; ++i) { @@ -126,7 +119,8 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length, out_free_bio: bio_free_pages(bio); - bio_put(bio); + bio_uninit(bio); + kfree(bio); return error; } @@ -190,7 +184,8 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length, length |= data[0] << 8; } bio_free_pages(bio); - bio_put(bio); + bio_uninit(bio); + kfree(bio); compressed = SQUASHFS_COMPRESSED(length); length = SQUASHFS_COMPRESSED_SIZE(length); @@ -224,7 +219,8 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length, out_free_bio: bio_free_pages(bio); - bio_put(bio); + bio_uninit(bio); + kfree(bio); out: if (res < 0) { ERROR("Failed to read block 0x%llx: %d\n", index, res); diff --git a/fs/super.c b/fs/super.c index f1d4a193602d..60f57c7bc0a6 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1204,7 +1204,7 @@ static int set_bdev_super(struct super_block *s, void *data) s->s_dev = s->s_bdev->bd_dev; s->s_bdi = bdi_get(s->s_bdev->bd_disk->bdi); - if (blk_queue_stable_writes(s->s_bdev->bd_disk->queue)) + if (bdev_stable_writes(s->s_bdev)) s->s_iflags |= SB_I_STABLE_WRITES; return 0; } diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 0191de8ce9ce..c6fe3f6ebb6b 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -114,7 +114,7 @@ xfs_trim_extents( } trace_xfs_discard_extent(mp, agno, fbno, flen); - error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS, 0); + error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS); if (error) goto out_del_cursor; *blocks_trimmed += flen; @@ -152,8 +152,8 @@ xfs_ioc_trim( struct xfs_mount *mp, struct fstrim_range __user *urange) { - struct request_queue *q = bdev_get_queue(mp->m_ddev_targp->bt_bdev); - unsigned int granularity = q->limits.discard_granularity; + unsigned int granularity = + bdev_discard_granularity(mp->m_ddev_targp->bt_bdev); struct fstrim_range range; xfs_daddr_t start, end, minlen; xfs_agnumber_t start_agno, end_agno, agno; @@ -162,7 +162,7 @@ xfs_ioc_trim( if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!blk_queue_discard(q)) + if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev)) return -EOPNOTSUPP; /* diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index ba57323bfdce..c9f55e4f0957 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -605,7 +605,7 @@ xlog_discard_busy_extents( error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev, XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno), XFS_FSB_TO_BB(mp, busyp->length), - GFP_NOFS, 0, &bio); + GFP_NOFS, &bio); if (error && error != -EOPNOTSUPP) { xfs_info(mp, "discard failed for extent [0x%llx,%u], error %d", diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 54be9d64093e..a276b8111f63 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1608,14 +1608,10 @@ xfs_fs_fill_super( goto out_filestream_unmount; } - if (xfs_has_discard(mp)) { - struct request_queue *q = bdev_get_queue(sb->s_bdev); - - if (!blk_queue_discard(q)) { - xfs_warn(mp, "mounting with \"discard\" option, but " - "the device does not support discard"); - mp->m_features &= ~XFS_FEAT_DISCARD; - } + if (xfs_has_discard(mp) && !bdev_max_discard_sectors(sb->s_bdev)) { + xfs_warn(mp, + "mounting with \"discard\" option, but the device does not support discard"); + mp->m_features &= ~XFS_FEAT_DISCARD; } if (xfs_has_reflink(mp)) { diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index e20e7c841489..82da583da17b 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -689,13 +689,12 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file_inode(iocb->ki_filp); struct zonefs_inode_info *zi = ZONEFS_I(inode); struct block_device *bdev = inode->i_sb->s_bdev; - unsigned int max; + unsigned int max = bdev_max_zone_append_sectors(bdev); struct bio *bio; ssize_t size; int nr_pages; ssize_t ret; - max = queue_max_zone_append_sectors(bdev_get_queue(bdev)); max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize); iov_iter_truncate(from, max); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 87ce24d238f3..2bd073fa6bb5 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -17,8 +17,6 @@ #include #include -struct blkcg; - static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi) { kref_get(&bdi->refcnt); @@ -154,7 +152,7 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css, gfp_t gfp); void wb_memcg_offline(struct mem_cgroup *memcg); -void wb_blkcg_offline(struct blkcg *blkcg); +void wb_blkcg_offline(struct cgroup_subsys_state *css); /** * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode @@ -378,7 +376,7 @@ static inline void wb_memcg_offline(struct mem_cgroup *memcg) { } -static inline void wb_blkcg_offline(struct blkcg *blkcg) +static inline void wb_blkcg_offline(struct cgroup_subsys_state *css) { } diff --git a/include/linux/bio.h b/include/linux/bio.h index 00450fd86bb4..1cf3738ef1ea 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -408,9 +408,7 @@ extern int bioset_init_from_src(struct bio_set *bs, struct bio_set *src); struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, unsigned int opf, gfp_t gfp_mask, struct bio_set *bs); -struct bio *bio_alloc_kiocb(struct kiocb *kiocb, struct block_device *bdev, - unsigned short nr_vecs, unsigned int opf, struct bio_set *bs); -struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs); +struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask); extern void bio_put(struct bio *); struct bio *bio_alloc_clone(struct block_device *bdev, struct bio *bio_src, @@ -785,6 +783,12 @@ static inline void bio_set_polled(struct bio *bio, struct kiocb *kiocb) bio->bi_opf |= REQ_NOWAIT; } +static inline void bio_clear_polled(struct bio *bio) +{ + /* can't support alloc cache if we turn off polling */ + bio->bi_opf &= ~(REQ_POLLED | REQ_ALLOC_CACHE); +} + struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev, unsigned int nr_pages, unsigned int opf, gfp_t gfp); diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 652cd05b0924..9f40dbc65f82 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -14,265 +14,39 @@ * Nauman Rafique */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include + +struct bio; +struct cgroup_subsys_state; +struct request_queue; #define FC_APPID_LEN 129 #ifdef CONFIG_BLK_CGROUP - -enum blkg_iostat_type { - BLKG_IOSTAT_READ, - BLKG_IOSTAT_WRITE, - BLKG_IOSTAT_DISCARD, - - BLKG_IOSTAT_NR, -}; - -struct blkcg_gq; -struct blkg_policy_data; - -struct blkcg { - struct cgroup_subsys_state css; - spinlock_t lock; - refcount_t online_pin; - - struct radix_tree_root blkg_tree; - struct blkcg_gq __rcu *blkg_hint; - struct hlist_head blkg_list; - - struct blkcg_policy_data *cpd[BLKCG_MAX_POLS]; - - struct list_head all_blkcgs_node; -#ifdef CONFIG_BLK_CGROUP_FC_APPID - char fc_app_id[FC_APPID_LEN]; -#endif -#ifdef CONFIG_CGROUP_WRITEBACK - struct list_head cgwb_list; -#endif -}; - -struct blkg_iostat { - u64 bytes[BLKG_IOSTAT_NR]; - u64 ios[BLKG_IOSTAT_NR]; -}; - -struct blkg_iostat_set { - struct u64_stats_sync sync; - struct blkg_iostat cur; - struct blkg_iostat last; -}; - -/* association between a blk cgroup and a request queue */ -struct blkcg_gq { - /* Pointer to the associated request_queue */ - struct request_queue *q; - struct list_head q_node; - struct hlist_node blkcg_node; - struct blkcg *blkcg; - - /* all non-root blkcg_gq's are guaranteed to have access to parent */ - struct blkcg_gq *parent; - - /* reference count */ - struct percpu_ref refcnt; - - /* is this blkg online? protected by both blkcg and q locks */ - bool online; - - struct blkg_iostat_set __percpu *iostat_cpu; - struct blkg_iostat_set iostat; - - struct blkg_policy_data *pd[BLKCG_MAX_POLS]; - - spinlock_t async_bio_lock; - struct bio_list async_bios; - union { - struct work_struct async_bio_work; - struct work_struct free_work; - }; - - atomic_t use_delay; - atomic64_t delay_nsec; - atomic64_t delay_start; - u64 last_delay; - int last_use; - - struct rcu_head rcu_head; -}; - extern struct cgroup_subsys_state * const blkcg_root_css; -void blkcg_destroy_blkgs(struct blkcg *blkcg); void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay); void blkcg_maybe_throttle_current(void); - -static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) -{ - return css ? container_of(css, struct blkcg, css) : NULL; -} - -/** - * bio_blkcg - grab the blkcg associated with a bio - * @bio: target bio - * - * This returns the blkcg associated with a bio, %NULL if not associated. - * Callers are expected to either handle %NULL or know association has been - * done prior to calling this. - */ -static inline struct blkcg *bio_blkcg(struct bio *bio) -{ - if (bio && bio->bi_blkg) - return bio->bi_blkg->blkcg; - return NULL; -} - -static inline bool blk_cgroup_congested(void) -{ - struct cgroup_subsys_state *css; - bool ret = false; - - rcu_read_lock(); - css = kthread_blkcg(); - if (!css) - css = task_css(current, io_cgrp_id); - while (css) { - if (atomic_read(&css->cgroup->congestion_count)) { - ret = true; - break; - } - css = css->parent; - } - rcu_read_unlock(); - return ret; -} - -/** - * blkcg_parent - get the parent of a blkcg - * @blkcg: blkcg of interest - * - * Return the parent blkcg of @blkcg. Can be called anytime. - */ -static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) -{ - return css_to_blkcg(blkcg->css.parent); -} - -/** - * blkcg_pin_online - pin online state - * @blkcg: blkcg of interest - * - * While pinned, a blkcg is kept online. This is primarily used to - * impedance-match blkg and cgwb lifetimes so that blkg doesn't go offline - * while an associated cgwb is still active. - */ -static inline void blkcg_pin_online(struct blkcg *blkcg) -{ - refcount_inc(&blkcg->online_pin); -} - -/** - * blkcg_unpin_online - unpin online state - * @blkcg: blkcg of interest - * - * This is primarily used to impedance-match blkg and cgwb lifetimes so - * that blkg doesn't go offline while an associated cgwb is still active. - * When this count goes to zero, all active cgwbs have finished so the - * blkcg can continue destruction by calling blkcg_destroy_blkgs(). - */ -static inline void blkcg_unpin_online(struct blkcg *blkcg) -{ - do { - if (!refcount_dec_and_test(&blkcg->online_pin)) - break; - blkcg_destroy_blkgs(blkcg); - blkcg = blkcg_parent(blkcg); - } while (blkcg); -} +bool blk_cgroup_congested(void); +void blkcg_pin_online(struct cgroup_subsys_state *blkcg_css); +void blkcg_unpin_online(struct cgroup_subsys_state *blkcg_css); +struct list_head *blkcg_get_cgwb_list(struct cgroup_subsys_state *css); +struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio); #else /* CONFIG_BLK_CGROUP */ -struct blkcg { -}; - -struct blkcg_gq { -}; - #define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL)) static inline void blkcg_maybe_throttle_current(void) { } static inline bool blk_cgroup_congested(void) { return false; } - -#ifdef CONFIG_BLOCK static inline void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) { } -static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } -#endif /* CONFIG_BLOCK */ - -#endif /* CONFIG_BLK_CGROUP */ - -#ifdef CONFIG_BLK_CGROUP_FC_APPID -/* - * Sets the fc_app_id field associted to blkcg - * @app_id: application identifier - * @cgrp_id: cgroup id - * @app_id_len: size of application identifier - */ -static inline int blkcg_set_fc_appid(char *app_id, u64 cgrp_id, size_t app_id_len) +static inline struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio) { - struct cgroup *cgrp; - struct cgroup_subsys_state *css; - struct blkcg *blkcg; - int ret = 0; - - if (app_id_len > FC_APPID_LEN) - return -EINVAL; - - cgrp = cgroup_get_from_id(cgrp_id); - if (!cgrp) - return -ENOENT; - css = cgroup_get_e_css(cgrp, &io_cgrp_subsys); - if (!css) { - ret = -ENOENT; - goto out_cgrp_put; - } - blkcg = css_to_blkcg(css); - /* - * There is a slight race condition on setting the appid. - * Worst case an I/O may not find the right id. - * This is no different from the I/O we let pass while obtaining - * the vmid from the fabric. - * Adding the overhead of a lock is not necessary. - */ - strlcpy(blkcg->fc_app_id, app_id, app_id_len); - css_put(css); -out_cgrp_put: - cgroup_put(cgrp); - return ret; -} - -/** - * blkcg_get_fc_appid - get the fc app identifier associated with a bio - * @bio: target bio - * - * On success return the fc_app_id, on failure return NULL - */ -static inline char *blkcg_get_fc_appid(struct bio *bio) -{ - if (bio && bio->bi_blkg && - (bio->bi_blkg->blkcg->fc_app_id[0] != '\0')) - return bio->bi_blkg->blkcg->fc_app_id; return NULL; } -#else -static inline int blkcg_set_fc_appid(char *buf, u64 id, size_t len) { return -EINVAL; } -static inline char *blkcg_get_fc_appid(struct bio *bio) { return NULL; } -#endif /*CONFIG_BLK_CGROUP_FC_APPID*/ +#endif /* CONFIG_BLK_CGROUP */ + +int blkcg_set_fc_appid(char *app_id, u64 cgrp_id, size_t app_id_len); +char *blkcg_get_fc_appid(struct bio *bio); + #endif /* _BLK_CGROUP_H */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 3a0eb069a6a1..de29453f54b0 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -44,7 +44,7 @@ struct block_device { unsigned long bd_stamp; bool bd_read_only; /* read-only policy */ dev_t bd_dev; - int bd_openers; + atomic_t bd_openers; struct inode * bd_inode; /* will die */ struct super_block * bd_super; void * bd_claiming; @@ -246,9 +246,8 @@ typedef unsigned int blk_qc_t; struct bio { struct bio *bi_next; /* request queue link */ struct block_device *bi_bdev; - unsigned int bi_opf; /* bottom bits req flags, - * top bits REQ_OP. Use - * accessors. + unsigned int bi_opf; /* bottom bits REQ_OP, top bits + * req_flags. */ unsigned short bi_flags; /* BIO_* below */ unsigned short bi_ioprio; @@ -332,7 +331,6 @@ enum { BIO_QOS_MERGED, /* but went through rq_qos merge path */ BIO_REMAPPED, BIO_ZONE_WRITE_LOCKED, /* Owns a zoned device zone write lock */ - BIO_PERCPU_CACHE, /* can participate in per-cpu alloc cache */ BIO_FLAG_LAST }; @@ -412,15 +410,17 @@ enum req_flag_bits { * work item to avoid such priority inversions. */ __REQ_CGROUP_PUNT, + __REQ_POLLED, /* caller polls for completion using bio_poll */ + __REQ_ALLOC_CACHE, /* allocate IO from cache if available */ + __REQ_SWAP, /* swap I/O */ + __REQ_DRV, /* for driver use */ - /* command specific flags for REQ_OP_WRITE_ZEROES: */ + /* + * Command specific flags, keep last: + */ + /* for REQ_OP_WRITE_ZEROES: */ __REQ_NOUNMAP, /* do not free blocks when zeroing */ - __REQ_POLLED, /* caller polls for completion using bio_poll */ - - /* for driver use */ - __REQ_DRV, - __REQ_SWAP, /* swapping request. */ __REQ_NR_BITS, /* stops here */ }; @@ -442,6 +442,7 @@ enum req_flag_bits { #define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) #define REQ_POLLED (1ULL << __REQ_POLLED) +#define REQ_ALLOC_CACHE (1ULL << __REQ_ALLOC_CACHE) #define REQ_DRV (1ULL << __REQ_DRV) #define REQ_SWAP (1ULL << __REQ_SWAP) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 60d016138997..5bdf2ac9142c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -176,6 +176,21 @@ static inline bool disk_live(struct gendisk *disk) return !inode_unhashed(disk->part0->bd_inode); } +/** + * disk_openers - returns how many openers are there for a disk + * @disk: disk to check + * + * This returns the number of openers for a disk. Note that this value is only + * stable if disk->open_mutex is held. + * + * Note: Due to a quirk in the block layer open code, each open partition is + * only counted once even if there are multiple openers. + */ +static inline unsigned int disk_openers(struct gendisk *disk) +{ + return atomic_read(&disk->part0->bd_openers); +} + /* * The gendisk is refcounted by the part0 block_device, and the bd_device * therein is also used for device model presentation in sysfs. @@ -248,6 +263,7 @@ struct queue_limits { unsigned int io_opt; unsigned int max_discard_sectors; unsigned int max_hw_discard_sectors; + unsigned int max_secure_erase_sectors; unsigned int max_write_zeroes_sectors; unsigned int max_zone_append_sectors; unsigned int discard_granularity; @@ -540,10 +556,8 @@ struct request_queue { #define QUEUE_FLAG_NONROT 6 /* non-rotational device (SSD) */ #define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ #define QUEUE_FLAG_IO_STAT 7 /* do disk/partitions IO accounting */ -#define QUEUE_FLAG_DISCARD 8 /* supports DISCARD */ #define QUEUE_FLAG_NOXMERGES 9 /* No extended merges */ #define QUEUE_FLAG_ADD_RANDOM 10 /* Contributes to random pool */ -#define QUEUE_FLAG_SECERASE 11 /* supports secure erase */ #define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */ #define QUEUE_FLAG_DEAD 13 /* queue tear-down finished */ #define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */ @@ -582,11 +596,8 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); test_bit(QUEUE_FLAG_STABLE_WRITES, &(q)->queue_flags) #define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags) #define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags) -#define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags) #define blk_queue_zone_resetall(q) \ test_bit(QUEUE_FLAG_ZONE_RESETALL, &(q)->queue_flags) -#define blk_queue_secure_erase(q) \ - (test_bit(QUEUE_FLAG_SECERASE, &(q)->queue_flags)) #define blk_queue_dax(q) test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags) #define blk_queue_pci_p2pdma(q) \ test_bit(QUEUE_FLAG_PCI_P2PDMA, &(q)->queue_flags) @@ -602,7 +613,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); REQ_FAILFAST_DRIVER)) #define blk_queue_quiesced(q) test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags) #define blk_queue_pm_only(q) atomic_read(&(q)->pm_only) -#define blk_queue_fua(q) test_bit(QUEUE_FLAG_FUA, &(q)->queue_flags) #define blk_queue_registered(q) test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags) #define blk_queue_nowait(q) test_bit(QUEUE_FLAG_NOWAIT, &(q)->queue_flags) @@ -950,6 +960,8 @@ extern void blk_queue_chunk_sectors(struct request_queue *, unsigned int); extern void blk_queue_max_segments(struct request_queue *, unsigned short); extern void blk_queue_max_discard_segments(struct request_queue *, unsigned short); +void blk_queue_max_secure_erase_sectors(struct request_queue *q, + unsigned int max_sectors); extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); extern void blk_queue_max_discard_sectors(struct request_queue *q, unsigned int max_discard_sectors); @@ -1090,13 +1102,12 @@ static inline long nr_blockdev_pages(void) extern void blk_io_schedule(void); -#define BLKDEV_DISCARD_SECURE (1 << 0) /* issue a secure erase */ - -extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, unsigned long flags); -extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, int flags, - struct bio **biop); +int blkdev_issue_discard(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask); +int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, struct bio **biop); +int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp); #define BLKDEV_ZERO_NOUNMAP (1 << 0) /* do not free blocks */ #define BLKDEV_ZERO_NOFALLBACK (1 << 1) /* don't write explicit zeroes */ @@ -1115,7 +1126,7 @@ static inline int sb_issue_discard(struct super_block *sb, sector_t block, SECTOR_SHIFT), nr_blocks << (sb->s_blocksize_bits - SECTOR_SHIFT), - gfp_mask, flags); + gfp_mask); } static inline int sb_issue_zeroout(struct super_block *sb, sector_t block, sector_t nr_blocks, gfp_t gfp_mask) @@ -1189,6 +1200,12 @@ static inline unsigned int queue_max_zone_append_sectors(const struct request_qu return min(l->max_zone_append_sectors, l->max_sectors); } +static inline unsigned int +bdev_max_zone_append_sectors(struct block_device *bdev) +{ + return queue_max_zone_append_sectors(bdev_get_queue(bdev)); +} + static inline unsigned queue_logical_block_size(const struct request_queue *q) { int retval = 512; @@ -1246,74 +1263,23 @@ bdev_zone_write_granularity(struct block_device *bdev) return queue_zone_write_granularity(bdev_get_queue(bdev)); } -static inline int queue_alignment_offset(const struct request_queue *q) -{ - if (q->limits.misaligned) - return -1; +int bdev_alignment_offset(struct block_device *bdev); +unsigned int bdev_discard_alignment(struct block_device *bdev); - return q->limits.alignment_offset; +static inline unsigned int bdev_max_discard_sectors(struct block_device *bdev) +{ + return bdev_get_queue(bdev)->limits.max_discard_sectors; } -static inline int queue_limit_alignment_offset(struct queue_limits *lim, sector_t sector) +static inline unsigned int bdev_discard_granularity(struct block_device *bdev) { - unsigned int granularity = max(lim->physical_block_size, lim->io_min); - unsigned int alignment = sector_div(sector, granularity >> SECTOR_SHIFT) - << SECTOR_SHIFT; - - return (granularity + lim->alignment_offset - alignment) % granularity; + return bdev_get_queue(bdev)->limits.discard_granularity; } -static inline int bdev_alignment_offset(struct block_device *bdev) +static inline unsigned int +bdev_max_secure_erase_sectors(struct block_device *bdev) { - struct request_queue *q = bdev_get_queue(bdev); - - if (q->limits.misaligned) - return -1; - if (bdev_is_partition(bdev)) - return queue_limit_alignment_offset(&q->limits, - bdev->bd_start_sect); - return q->limits.alignment_offset; -} - -static inline int queue_discard_alignment(const struct request_queue *q) -{ - if (q->limits.discard_misaligned) - return -1; - - return q->limits.discard_alignment; -} - -static inline int queue_limit_discard_alignment(struct queue_limits *lim, sector_t sector) -{ - unsigned int alignment, granularity, offset; - - if (!lim->max_discard_sectors) - return 0; - - /* Why are these in bytes, not sectors? */ - alignment = lim->discard_alignment >> SECTOR_SHIFT; - granularity = lim->discard_granularity >> SECTOR_SHIFT; - if (!granularity) - return 0; - - /* Offset of the partition start in 'granularity' sectors */ - offset = sector_div(sector, granularity); - - /* And why do we do this modulus *again* in blkdev_issue_discard()? */ - offset = (granularity + alignment - offset) % granularity; - - /* Turn it back into bytes, gaah */ - return offset << SECTOR_SHIFT; -} - -static inline int bdev_discard_alignment(struct block_device *bdev) -{ - struct request_queue *q = bdev_get_queue(bdev); - - if (bdev_is_partition(bdev)) - return queue_limit_discard_alignment(&q->limits, - bdev->bd_start_sect); - return q->limits.discard_alignment; + return bdev_get_queue(bdev)->limits.max_secure_erase_sectors; } static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev) @@ -1326,6 +1292,27 @@ static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev) return 0; } +static inline bool bdev_nonrot(struct block_device *bdev) +{ + return blk_queue_nonrot(bdev_get_queue(bdev)); +} + +static inline bool bdev_stable_writes(struct block_device *bdev) +{ + return test_bit(QUEUE_FLAG_STABLE_WRITES, + &bdev_get_queue(bdev)->queue_flags); +} + +static inline bool bdev_write_cache(struct block_device *bdev) +{ + return test_bit(QUEUE_FLAG_WC, &bdev_get_queue(bdev)->queue_flags); +} + +static inline bool bdev_fua(struct block_device *bdev) +{ + return test_bit(QUEUE_FLAG_FUA, &bdev_get_queue(bdev)->queue_flags); +} + static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); @@ -1491,9 +1478,10 @@ static inline void blk_wake_io_task(struct task_struct *waiter) wake_up_process(waiter); } -unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, - unsigned int op); -void disk_end_io_acct(struct gendisk *disk, unsigned int op, +unsigned long bdev_start_io_acct(struct block_device *bdev, + unsigned int sectors, unsigned int op, + unsigned long start_time); +void bdev_end_io_acct(struct block_device *bdev, unsigned int op, unsigned long start_time); void bio_start_io_acct_time(struct bio *bio, unsigned long start_time); diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 22501a293fa5..623e22492afa 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -27,12 +27,10 @@ struct blk_trace { atomic_t dropped; }; -struct blkcg; - extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *); extern void blk_trace_shutdown(struct request_queue *); -extern __printf(3, 4) -void __trace_note_message(struct blk_trace *, struct blkcg *blkcg, const char *fmt, ...); +__printf(3, 4) void __blk_trace_note_message(struct blk_trace *bt, + struct cgroup_subsys_state *css, const char *fmt, ...); /** * blk_add_trace_msg - Add a (simple) message to the blktrace stream @@ -47,14 +45,14 @@ void __trace_note_message(struct blk_trace *, struct blkcg *blkcg, const char *f * NOTE: Can not use 'static inline' due to presence of var args... * **/ -#define blk_add_cgroup_trace_msg(q, cg, fmt, ...) \ +#define blk_add_cgroup_trace_msg(q, css, fmt, ...) \ do { \ struct blk_trace *bt; \ \ rcu_read_lock(); \ bt = rcu_dereference((q)->blk_trace); \ if (unlikely(bt)) \ - __trace_note_message(bt, cg, fmt, ##__VA_ARGS__);\ + __blk_trace_note_message(bt, css, fmt, ##__VA_ARGS__);\ rcu_read_unlock(); \ } while (0) #define blk_add_trace_msg(q, fmt, ...) \ diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index 0a89f111e00e..67caa909e3e6 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h @@ -77,7 +77,6 @@ struct cdrom_device_ops { int (*tray_move) (struct cdrom_device_info *, int); int (*lock_door) (struct cdrom_device_info *, int); int (*select_speed) (struct cdrom_device_info *, int); - int (*select_disc) (struct cdrom_device_info *, int); int (*get_last_session) (struct cdrom_device_info *, struct cdrom_multisession *); int (*get_mcn) (struct cdrom_device_info *, diff --git a/include/linux/kthread.h b/include/linux/kthread.h index de5d75bafd66..30e5bec81d2b 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -222,9 +222,5 @@ void kthread_associate_blkcg(struct cgroup_subsys_state *css); struct cgroup_subsys_state *kthread_blkcg(void); #else static inline void kthread_associate_blkcg(struct cgroup_subsys_state *css) { } -static inline struct cgroup_subsys_state *kthread_blkcg(void) -{ - return NULL; -} #endif #endif /* _LINUX_KTHREAD_H */ diff --git a/include/linux/nvme.h b/include/linux/nvme.h index f626a445d1a8..29ec3e3481ff 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -137,6 +137,7 @@ enum { NVME_REG_CMBMSC = 0x0050, /* Controller Memory Buffer Memory * Space Control */ + NVME_REG_CRTO = 0x0068, /* Controller Ready Timeouts */ NVME_REG_PMRCAP = 0x0e00, /* Persistent Memory Capabilities */ NVME_REG_PMRCTL = 0x0e04, /* Persistent Memory Region Control */ NVME_REG_PMRSTS = 0x0e08, /* Persistent Memory Region Status */ @@ -161,6 +162,9 @@ enum { #define NVME_CMB_BIR(cmbloc) ((cmbloc) & 0x7) #define NVME_CMB_OFST(cmbloc) (((cmbloc) >> 12) & 0xfffff) +#define NVME_CRTO_CRIMT(crto) ((crto) >> 16) +#define NVME_CRTO_CRWMT(crto) ((crto) & 0xffff) + enum { NVME_CMBSZ_SQS = 1 << 0, NVME_CMBSZ_CQS = 1 << 1, @@ -204,8 +208,10 @@ enum { NVME_CC_SHN_MASK = 3 << NVME_CC_SHN_SHIFT, NVME_CC_IOSQES = NVME_NVM_IOSQES << NVME_CC_IOSQES_SHIFT, NVME_CC_IOCQES = NVME_NVM_IOCQES << NVME_CC_IOCQES_SHIFT, - NVME_CAP_CSS_NVM = 1 << 0, - NVME_CAP_CSS_CSI = 1 << 6, + NVME_CC_CRIME = 1 << 24, +}; + +enum { NVME_CSTS_RDY = 1 << 0, NVME_CSTS_CFS = 1 << 1, NVME_CSTS_NSSRO = 1 << 4, @@ -214,10 +220,23 @@ enum { NVME_CSTS_SHST_OCCUR = 1 << 2, NVME_CSTS_SHST_CMPLT = 2 << 2, NVME_CSTS_SHST_MASK = 3 << 2, +}; + +enum { NVME_CMBMSC_CRE = 1 << 0, NVME_CMBMSC_CMSE = 1 << 1, }; +enum { + NVME_CAP_CSS_NVM = 1 << 0, + NVME_CAP_CSS_CSI = 1 << 6, +}; + +enum { + NVME_CAP_CRMS_CRIMS = 1ULL << 59, + NVME_CAP_CRMS_CRWMS = 1ULL << 60, +}; + struct nvme_id_power_state { __le16 max_power; /* centiwatts */ __u8 rsvd2; @@ -405,6 +424,21 @@ struct nvme_id_ns { __u8 vs[3712]; }; +/* I/O Command Set Independent Identify Namespace Data Structure */ +struct nvme_id_ns_cs_indep { + __u8 nsfeat; + __u8 nmic; + __u8 rescap; + __u8 fpi; + __le32 anagrpid; + __u8 nsattr; + __u8 rsvd9; + __le16 nvmsetid; + __le16 endgid; + __u8 nstat; + __u8 rsvd15[4081]; +}; + struct nvme_zns_lbafe { __le64 zsze; __u8 zdes; @@ -469,6 +503,7 @@ enum { NVME_ID_CNS_NS_DESC_LIST = 0x03, NVME_ID_CNS_CS_NS = 0x05, NVME_ID_CNS_CS_CTRL = 0x06, + NVME_ID_CNS_NS_CS_INDEP = 0x08, NVME_ID_CNS_NS_PRESENT_LIST = 0x10, NVME_ID_CNS_NS_PRESENT = 0x11, NVME_ID_CNS_CTRL_NS_LIST = 0x12, @@ -522,6 +557,10 @@ enum { NVME_NS_DPS_PI_TYPE3 = 3, }; +enum { + NVME_NSTAT_NRDY = 1 << 0, +}; + enum { NVME_NVM_NS_16B_GUARD = 0, NVME_NVM_NS_32B_GUARD = 1, @@ -1583,6 +1622,7 @@ enum { NVME_SC_NS_WRITE_PROTECTED = 0x20, NVME_SC_CMD_INTERRUPTED = 0x21, NVME_SC_TRANSIENT_TR_ERR = 0x22, + NVME_SC_ADMIN_COMMAND_MEDIA_NOT_READY = 0x24, NVME_SC_INVALID_IO_CMD_SET = 0x2C, NVME_SC_LBA_RANGE = 0x80, @@ -1679,9 +1719,11 @@ enum { /* * Path-related Errors: */ + NVME_SC_INTERNAL_PATH_ERROR = 0x300, NVME_SC_ANA_PERSISTENT_LOSS = 0x301, NVME_SC_ANA_INACCESSIBLE = 0x302, NVME_SC_ANA_TRANSITION = 0x303, + NVME_SC_CTRL_PATH_ERROR = 0x360, NVME_SC_HOST_PATH_ERROR = 0x370, NVME_SC_HOST_ABORTED_CMD = 0x371, diff --git a/include/target/target_core_backend.h b/include/target/target_core_backend.h index 675f3a1fe613..773963a1e0b5 100644 --- a/include/target/target_core_backend.h +++ b/include/target/target_core_backend.h @@ -14,7 +14,7 @@ #define TRANSPORT_FLAG_PASSTHROUGH_ALUA 0x2 #define TRANSPORT_FLAG_PASSTHROUGH_PGR 0x4 -struct request_queue; +struct block_device; struct scatterlist; struct target_backend_ops { @@ -117,7 +117,7 @@ sense_reason_t passthrough_parse_cdb(struct se_cmd *cmd, bool target_sense_desc_format(struct se_device *dev); sector_t target_to_linux_sector(struct se_device *dev, sector_t lb); bool target_configure_unmap_from_queue(struct se_dev_attrib *attrib, - struct request_queue *q); + struct block_device *bdev); static inline bool target_dev_configured(struct se_device *se_dev) { diff --git a/include/uapi/linux/cdrom.h b/include/uapi/linux/cdrom.h index 804ff8d98f71..011e594e4a0d 100644 --- a/include/uapi/linux/cdrom.h +++ b/include/uapi/linux/cdrom.h @@ -103,7 +103,7 @@ #define CDROMREADALL 0x5318 /* read all 2646 bytes */ /* - * These ioctls are (now) only in ide-cd.c for controlling + * These ioctls were only in (now removed) ide-cd.c for controlling * drive spindown time. They should be implemented in the * Uniform driver, via generic packet commands, GPCMD_MODE_SELECT_10, * GPCMD_MODE_SENSE_10 and the GPMODE_POWER_PAGE... diff --git a/include/uapi/linux/loop.h b/include/uapi/linux/loop.h index 98e60801195e..6f63527dd2ed 100644 --- a/include/uapi/linux/loop.h +++ b/include/uapi/linux/loop.h @@ -1,11 +1,6 @@ /* SPDX-License-Identifier: GPL-1.0+ WITH Linux-syscall-note */ /* - * include/linux/loop.h - * - * Written by Theodore Ts'o, 3/29/93. - * - * Copyright 1993 by Theodore Ts'o. Redistribution of this file is - * permitted under the GNU General Public License. + * Copyright 1993 by Theodore Ts'o. */ #ifndef _UAPI_LINUX_LOOP_H #define _UAPI_LINUX_LOOP_H diff --git a/kernel/kthread.c b/kernel/kthread.c index 8dfab1e825c2..715d012baef2 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1523,5 +1523,4 @@ struct cgroup_subsys_state *kthread_blkcg(void) } return NULL; } -EXPORT_SYMBOL(kthread_blkcg); #endif diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 4d5629196d01..10a32b0f2deb 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -145,13 +145,14 @@ static void trace_note_time(struct blk_trace *bt) local_irq_restore(flags); } -void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg, - const char *fmt, ...) +void __blk_trace_note_message(struct blk_trace *bt, + struct cgroup_subsys_state *css, const char *fmt, ...) { int n; va_list args; unsigned long flags; char *buf; + u64 cgid = 0; if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer_enabled)) @@ -170,17 +171,16 @@ void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg, n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); va_end(args); - if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) - blkcg = NULL; #ifdef CONFIG_BLK_CGROUP - trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, - blkcg ? cgroup_id(blkcg->css.cgroup) : 1); -#else - trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, 0); + if (css && (blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) + cgid = cgroup_id(css->cgroup); + else + cgid = 1; #endif + trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, cgid); local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(__trace_note_message); +EXPORT_SYMBOL_GPL(__blk_trace_note_message); static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, pid_t pid) @@ -411,7 +411,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, return PTR_ERR(msg); bt = filp->private_data; - __trace_note_message(bt, NULL, "%s", msg); + __blk_trace_note_message(bt, NULL, "%s", msg); kfree(msg); return count; @@ -783,6 +783,7 @@ void blk_trace_shutdown(struct request_queue *q) #ifdef CONFIG_BLK_CGROUP static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) { + struct cgroup_subsys_state *blkcg_css; struct blk_trace *bt; /* We don't use the 'bt' value here except as an optimization... */ @@ -790,9 +791,10 @@ static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) return 0; - if (!bio->bi_blkg) + blkcg_css = bio_blkcg_css(bio); + if (!blkcg_css) return 0; - return cgroup_id(bio_blkcg(bio)->css.cgroup); + return cgroup_id(blkcg_css->cgroup); } #else static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 7176af65b103..ff60bd7d74e0 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only +#include #include #include #include @@ -390,7 +391,6 @@ static void cgwb_release_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(work, struct bdi_writeback, release_work); - struct blkcg *blkcg = css_to_blkcg(wb->blkcg_css); struct backing_dev_info *bdi = wb->bdi; mutex_lock(&wb->bdi->cgwb_release_mutex); @@ -401,7 +401,7 @@ static void cgwb_release_workfn(struct work_struct *work) mutex_unlock(&wb->bdi->cgwb_release_mutex); /* triggers blkg destruction if no online users left */ - blkcg_unpin_online(blkcg); + blkcg_unpin_online(wb->blkcg_css); fprop_local_destroy_percpu(&wb->memcg_completions); @@ -446,7 +446,6 @@ static int cgwb_create(struct backing_dev_info *bdi, { struct mem_cgroup *memcg; struct cgroup_subsys_state *blkcg_css; - struct blkcg *blkcg; struct list_head *memcg_cgwb_list, *blkcg_cgwb_list; struct bdi_writeback *wb; unsigned long flags; @@ -454,9 +453,8 @@ static int cgwb_create(struct backing_dev_info *bdi, memcg = mem_cgroup_from_css(memcg_css); blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); - blkcg = css_to_blkcg(blkcg_css); memcg_cgwb_list = &memcg->cgwb_list; - blkcg_cgwb_list = &blkcg->cgwb_list; + blkcg_cgwb_list = blkcg_get_cgwb_list(blkcg_css); /* look up again under lock and discard on blkcg mismatch */ spin_lock_irqsave(&cgwb_lock, flags); @@ -511,7 +509,7 @@ static int cgwb_create(struct backing_dev_info *bdi, list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list); list_add(&wb->memcg_node, memcg_cgwb_list); list_add(&wb->blkcg_node, blkcg_cgwb_list); - blkcg_pin_online(blkcg); + blkcg_pin_online(blkcg_css); css_get(memcg_css); css_get(blkcg_css); } @@ -724,18 +722,19 @@ void wb_memcg_offline(struct mem_cgroup *memcg) /** * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined - * @blkcg: blkcg being offlined + * @css: blkcg being offlined * * Also prevents creation of any new wb's associated with @blkcg. */ -void wb_blkcg_offline(struct blkcg *blkcg) +void wb_blkcg_offline(struct cgroup_subsys_state *css) { struct bdi_writeback *wb, *next; + struct list_head *list = blkcg_get_cgwb_list(css); spin_lock_irq(&cgwb_lock); - list_for_each_entry_safe(wb, next, &blkcg->cgwb_list, blkcg_node) + list_for_each_entry_safe(wb, next, list, blkcg_node) cgwb_kill(wb); - blkcg->cgwb_list.next = NULL; /* prevent new wb's */ + list->next = NULL; /* prevent new wb's */ spin_unlock_irq(&cgwb_lock); } diff --git a/mm/page_io.c b/mm/page_io.c index 89fbf3cae30f..3fbdab6a940e 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -360,7 +360,6 @@ int swap_readpage(struct page *page, bool synchronous) * attempt to access it in the page fault retry time check. */ if (synchronous) { - bio->bi_opf |= REQ_POLLED; get_task_struct(current); bio->bi_private = current; } @@ -372,8 +371,7 @@ int swap_readpage(struct page *page, bool synchronous) if (!READ_ONCE(bio->bi_private)) break; - if (!bio_poll(bio, NULL, 0)) - blk_io_schedule(); + blk_io_schedule(); } __set_current_state(TASK_RUNNING); bio_put(bio); diff --git a/mm/readahead.c b/mm/readahead.c index 4a60cdb64262..26bf74a6b2fe 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -113,6 +113,7 @@ * ->readpage() which may be less efficient. */ +#include #include #include #include diff --git a/mm/swapfile.c b/mm/swapfile.c index 63c61f8b2611..981a6e85c88e 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -6,6 +6,7 @@ * Swap reorganised 29.12.95, Stephen Tweedie */ +#include #include #include #include @@ -179,7 +180,7 @@ static int discard_swap(struct swap_info_struct *si) nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); if (nr_blocks) { err = blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_KERNEL, 0); + nr_blocks, GFP_KERNEL); if (err) return err; cond_resched(); @@ -190,7 +191,7 @@ static int discard_swap(struct swap_info_struct *si) nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); err = blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_KERNEL, 0); + nr_blocks, GFP_KERNEL); if (err) break; @@ -254,7 +255,7 @@ static void discard_swap_cluster(struct swap_info_struct *si, start_block <<= PAGE_SHIFT - 9; nr_blocks <<= PAGE_SHIFT - 9; if (blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_NOIO, 0)) + nr_blocks, GFP_NOIO)) break; se = next_se(se); @@ -2466,7 +2467,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) if (p->flags & SWP_CONTINUED) free_swap_count_continuations(p); - if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev))) + if (!p->bdev || !bdev_nonrot(p->bdev)) atomic_dec(&nr_rotate_swap); mutex_lock(&swapon_mutex); @@ -2761,7 +2762,7 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) * write only restriction. Hence zoned block devices are not * suitable for swapping. Disallow them here. */ - if (blk_queue_is_zoned(p->bdev->bd_disk->queue)) + if (bdev_is_zoned(p->bdev)) return -EINVAL; p->flags |= SWP_BLKDEV; } else if (S_ISREG(inode->i_mode)) { @@ -2957,20 +2958,6 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, return nr_extents; } -/* - * Helper to sys_swapon determining if a given swap - * backing device queue supports DISCARD operations. - */ -static bool swap_discardable(struct swap_info_struct *si) -{ - struct request_queue *q = bdev_get_queue(si->bdev); - - if (!blk_queue_discard(q)) - return false; - - return true; -} - SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) { struct swap_info_struct *p; @@ -3065,13 +3052,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap_unlock_inode; } - if (p->bdev && blk_queue_stable_writes(p->bdev->bd_disk->queue)) + if (p->bdev && bdev_stable_writes(p->bdev)) p->flags |= SWP_STABLE_WRITES; if (p->bdev && p->bdev->bd_disk->fops->rw_page) p->flags |= SWP_SYNCHRONOUS_IO; - if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { + if (p->bdev && bdev_nonrot(p->bdev)) { int cpu; unsigned long ci, nr_cluster; @@ -3132,7 +3119,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) sizeof(long), GFP_KERNEL); - if (p->bdev && (swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { + if ((swap_flags & SWAP_FLAG_DISCARD) && + p->bdev && bdev_max_discard_sectors(p->bdev)) { /* * When discard is enabled for swap with no particular * policy flagged, we set all swap discard flags here in