From fd0b57f1cc59468e359799d6fec0d352b131fb0d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 19 Jun 2020 11:27:47 -0700 Subject: [PATCH 1/5] Revert "writeback: Avoid skipping inode writeback" This reverts commit 83af290d0975eda561a0599f54991c62842a8a99. --- fs/fs-writeback.c | 39 ++++++++++----------------------------- include/linux/fs.h | 8 ++------ 2 files changed, 12 insertions(+), 35 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 36dda58f2846..471d863958bc 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -160,9 +160,7 @@ static void inode_io_list_del_locked(struct inode *inode, struct bdi_writeback *wb) { assert_spin_locked(&wb->list_lock); - assert_spin_locked(&inode->i_lock); - inode->i_state &= ~I_SYNC_QUEUED; list_del_init(&inode->i_io_list); wb_io_lists_depopulated(wb); } @@ -996,9 +994,7 @@ void inode_io_list_del(struct inode *inode) struct bdi_writeback *wb; wb = inode_to_wb_and_lock_list(inode); - spin_lock(&inode->i_lock); inode_io_list_del_locked(inode, wb); - spin_unlock(&inode->i_lock); spin_unlock(&wb->list_lock); } @@ -1047,9 +1043,8 @@ void sb_clear_inode_writeback(struct inode *inode) * the case then the inode must have been redirtied while it was being written * out and we don't reset its dirtied_when. */ -static void __redirty_tail(struct inode *inode, struct bdi_writeback *wb) +static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) { - assert_spin_locked(&inode->i_lock); if (!list_empty(&wb->b_dirty)) { struct inode *tail; @@ -1058,14 +1053,6 @@ static void __redirty_tail(struct inode *inode, struct bdi_writeback *wb) inode->dirtied_when = jiffies; } inode_io_list_move_locked(inode, wb, &wb->b_dirty); - inode->i_state &= ~I_SYNC_QUEUED; -} - -static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) -{ - spin_lock(&inode->i_lock); - __redirty_tail(inode, wb); - spin_unlock(&inode->i_lock); } /* @@ -1134,11 +1121,8 @@ static int move_expired_inodes(struct list_head *delaying_queue, break; list_move(&inode->i_io_list, &tmp); moved++; - spin_lock(&inode->i_lock); if (flags & EXPIRE_DIRTY_ATIME) - inode->i_state |= I_DIRTY_TIME_EXPIRED; - inode->i_state |= I_SYNC_QUEUED; - spin_unlock(&inode->i_lock); + set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state); if (sb_is_blkdev_sb(inode->i_sb)) continue; if (sb && sb != inode->i_sb) @@ -1281,7 +1265,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, * writeback is not making progress due to locked * buffers. Skip this inode for now. */ - __redirty_tail(inode, wb); + redirty_tail(inode, wb); return; } @@ -1301,7 +1285,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, * retrying writeback of the dirty page/inode * that cannot be performed immediately. */ - __redirty_tail(inode, wb); + redirty_tail(inode, wb); } } else if (inode->i_state & I_DIRTY) { /* @@ -1309,11 +1293,10 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, * such as delayed allocation during submission or metadata * updates after data IO completion. */ - __redirty_tail(inode, wb); + redirty_tail(inode, wb); } else if (inode->i_state & I_DIRTY_TIME) { inode->dirtied_when = jiffies; inode_io_list_move_locked(inode, wb, &wb->b_dirty_time); - inode->i_state &= ~I_SYNC_QUEUED; } else { /* The inode is clean. Remove from writeback lists. */ inode_io_list_del_locked(inode, wb); @@ -1557,9 +1540,8 @@ static long writeback_sb_inodes(struct super_block *sb, */ spin_lock(&inode->i_lock); if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { - inode->i_state &= ~I_SYNC_QUEUED; - __redirty_tail(inode, wb); spin_unlock(&inode->i_lock); + redirty_tail(inode, wb); continue; } if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) { @@ -2178,12 +2160,11 @@ void __mark_inode_dirty(struct inode *inode, int flags) inode->i_state |= flags; /* - * If the inode is queued for writeback by flush worker, just - * update its dirty state. Once the flush worker is done with - * the inode it will place it on the appropriate superblock - * list, based upon its state. + * If the inode is being synced, just update its dirty state. + * The unlocker will place the inode on the appropriate + * superblock list, based upon its state. */ - if (inode->i_state & I_SYNC_QUEUED) + if (inode->i_state & I_SYNC) goto out_unlock_inode; /* diff --git a/include/linux/fs.h b/include/linux/fs.h index 6675c2ee96d6..4b8b7898623a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2051,10 +2051,6 @@ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) * * I_CREATING New object's inode in the middle of setting up. * - * I_SYNC_QUEUED Inode is queued in b_io or b_more_io writeback lists. - * Used to detect that mark_inode_dirty() should not move - * inode between dirty lists. - * * Q: What is the difference between I_WILL_FREE and I_FREEING? */ #define I_DIRTY_SYNC (1 << 0) @@ -2072,11 +2068,11 @@ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) #define I_DIO_WAKEUP (1 << __I_DIO_WAKEUP) #define I_LINKABLE (1 << 10) #define I_DIRTY_TIME (1 << 11) -#define I_DIRTY_TIME_EXPIRED (1 << 12) +#define __I_DIRTY_TIME_EXPIRED 12 +#define I_DIRTY_TIME_EXPIRED (1 << __I_DIRTY_TIME_EXPIRED) #define I_WB_SWITCH (1 << 13) #define I_OVL_INUSE (1 << 14) #define I_CREATING (1 << 15) -#define I_SYNC_QUEUED (1 << 16) #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) From 64cf8ca4162935a3af9464e9ed34c2761b709ebb Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 10 Jun 2020 17:36:03 +0200 Subject: [PATCH 2/5] writeback: Protect inode->i_io_list with inode->i_lock Currently, operations on inode->i_io_list are protected by wb->list_lock. In the following patches we'll need to maintain consistency between inode->i_state and inode->i_io_list so change the code so that inode->i_lock protects also all inode's i_io_list handling. Reviewed-by: Martijn Coenen Reviewed-by: Christoph Hellwig CC: stable@vger.kernel.org # Prerequisite for "writeback: Avoid skipping inode writeback" Signed-off-by: Jan Kara --- fs/fs-writeback.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 471d863958bc..8722ed446773 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -160,6 +160,7 @@ static void inode_io_list_del_locked(struct inode *inode, struct bdi_writeback *wb) { assert_spin_locked(&wb->list_lock); + assert_spin_locked(&inode->i_lock); list_del_init(&inode->i_io_list); wb_io_lists_depopulated(wb); @@ -994,7 +995,9 @@ void inode_io_list_del(struct inode *inode) struct bdi_writeback *wb; wb = inode_to_wb_and_lock_list(inode); + spin_lock(&inode->i_lock); inode_io_list_del_locked(inode, wb); + spin_unlock(&inode->i_lock); spin_unlock(&wb->list_lock); } @@ -1043,8 +1046,10 @@ void sb_clear_inode_writeback(struct inode *inode) * the case then the inode must have been redirtied while it was being written * out and we don't reset its dirtied_when. */ -static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) +static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb) { + assert_spin_locked(&inode->i_lock); + if (!list_empty(&wb->b_dirty)) { struct inode *tail; @@ -1055,6 +1060,13 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) inode_io_list_move_locked(inode, wb, &wb->b_dirty); } +static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) +{ + spin_lock(&inode->i_lock); + redirty_tail_locked(inode, wb); + spin_unlock(&inode->i_lock); +} + /* * requeue inode for re-scanning after bdi->b_io list is exhausted. */ @@ -1265,7 +1277,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, * writeback is not making progress due to locked * buffers. Skip this inode for now. */ - redirty_tail(inode, wb); + redirty_tail_locked(inode, wb); return; } @@ -1285,7 +1297,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, * retrying writeback of the dirty page/inode * that cannot be performed immediately. */ - redirty_tail(inode, wb); + redirty_tail_locked(inode, wb); } } else if (inode->i_state & I_DIRTY) { /* @@ -1293,7 +1305,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, * such as delayed allocation during submission or metadata * updates after data IO completion. */ - redirty_tail(inode, wb); + redirty_tail_locked(inode, wb); } else if (inode->i_state & I_DIRTY_TIME) { inode->dirtied_when = jiffies; inode_io_list_move_locked(inode, wb, &wb->b_dirty_time); @@ -1540,8 +1552,8 @@ static long writeback_sb_inodes(struct super_block *sb, */ spin_lock(&inode->i_lock); if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { + redirty_tail_locked(inode, wb); spin_unlock(&inode->i_lock); - redirty_tail(inode, wb); continue; } if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) { From 359caa950dfbaa1b23e9d36f8492e5ba7604c9ed Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 29 May 2020 15:05:22 +0200 Subject: [PATCH 3/5] writeback: Avoid skipping inode writeback Inode's i_io_list list head is used to attach inode to several different lists - wb->{b_dirty, b_dirty_time, b_io, b_more_io}. When flush worker prepares a list of inodes to writeback e.g. for sync(2), it moves inodes to b_io list. Thus it is critical for sync(2) data integrity guarantees that inode is not requeued to any other writeback list when inode is queued for processing by flush worker. That's the reason why writeback_single_inode() does not touch i_io_list (unless the inode is completely clean) and why __mark_inode_dirty() does not touch i_io_list if I_SYNC flag is set. However there are two flaws in the current logic: 1) When inode has only I_DIRTY_TIME set but it is already queued in b_io list due to sync(2), concurrent __mark_inode_dirty(inode, I_DIRTY_SYNC) can still move inode back to b_dirty list resulting in skipping writeback of inode time stamps during sync(2). 2) When inode is on b_dirty_time list and writeback_single_inode() races with __mark_inode_dirty() like: writeback_single_inode() __mark_inode_dirty(inode, I_DIRTY_PAGES) inode->i_state |= I_SYNC __writeback_single_inode() inode->i_state |= I_DIRTY_PAGES; if (inode->i_state & I_SYNC) bail if (!(inode->i_state & I_DIRTY_ALL)) - not true so nothing done We end up with I_DIRTY_PAGES inode on b_dirty_time list and thus standard background writeback will not writeback this inode leading to possible dirty throttling stalls etc. (thanks to Martijn Coenen for this analysis). Fix these problems by tracking whether inode is queued in b_io or b_more_io lists in a new I_SYNC_QUEUED flag. When this flag is set, we know flush worker has queued inode and we should not touch i_io_list. On the other hand we also know that once flush worker is done with the inode it will requeue the inode to appropriate dirty list. When I_SYNC_QUEUED is not set, __mark_inode_dirty() can (and must) move inode to appropriate dirty list. Reported-by: Martijn Coenen Reviewed-by: Martijn Coenen Tested-by: Martijn Coenen Reviewed-by: Christoph Hellwig Fixes: 0ae45f63d4ef ("vfs: add support for a lazytime mount option") CC: stable@vger.kernel.org Signed-off-by: Jan Kara --- fs/fs-writeback.c | 17 ++++++++++++----- include/linux/fs.h | 8 ++++++-- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 8722ed446773..c7702ebdc2c1 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -162,6 +162,7 @@ static void inode_io_list_del_locked(struct inode *inode, assert_spin_locked(&wb->list_lock); assert_spin_locked(&inode->i_lock); + inode->i_state &= ~I_SYNC_QUEUED; list_del_init(&inode->i_io_list); wb_io_lists_depopulated(wb); } @@ -1058,6 +1059,7 @@ static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb) inode->dirtied_when = jiffies; } inode_io_list_move_locked(inode, wb, &wb->b_dirty); + inode->i_state &= ~I_SYNC_QUEUED; } static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) @@ -1133,8 +1135,11 @@ static int move_expired_inodes(struct list_head *delaying_queue, break; list_move(&inode->i_io_list, &tmp); moved++; + spin_lock(&inode->i_lock); if (flags & EXPIRE_DIRTY_ATIME) - set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state); + inode->i_state |= I_DIRTY_TIME_EXPIRED; + inode->i_state |= I_SYNC_QUEUED; + spin_unlock(&inode->i_lock); if (sb_is_blkdev_sb(inode->i_sb)) continue; if (sb && sb != inode->i_sb) @@ -1309,6 +1314,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, } else if (inode->i_state & I_DIRTY_TIME) { inode->dirtied_when = jiffies; inode_io_list_move_locked(inode, wb, &wb->b_dirty_time); + inode->i_state &= ~I_SYNC_QUEUED; } else { /* The inode is clean. Remove from writeback lists. */ inode_io_list_del_locked(inode, wb); @@ -2172,11 +2178,12 @@ void __mark_inode_dirty(struct inode *inode, int flags) inode->i_state |= flags; /* - * If the inode is being synced, just update its dirty state. - * The unlocker will place the inode on the appropriate - * superblock list, based upon its state. + * If the inode is queued for writeback by flush worker, just + * update its dirty state. Once the flush worker is done with + * the inode it will place it on the appropriate superblock + * list, based upon its state. */ - if (inode->i_state & I_SYNC) + if (inode->i_state & I_SYNC_QUEUED) goto out_unlock_inode; /* diff --git a/include/linux/fs.h b/include/linux/fs.h index 4b8b7898623a..2a5c7dbd1da5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2051,6 +2051,10 @@ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) * * I_CREATING New object's inode in the middle of setting up. * + * I_SYNC_QUEUED Inode is queued in b_io or b_more_io writeback lists. + * Used to detect that mark_inode_dirty() should not move + * inode between dirty lists. + * * Q: What is the difference between I_WILL_FREE and I_FREEING? */ #define I_DIRTY_SYNC (1 << 0) @@ -2068,11 +2072,11 @@ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) #define I_DIO_WAKEUP (1 << __I_DIO_WAKEUP) #define I_LINKABLE (1 << 10) #define I_DIRTY_TIME (1 << 11) -#define __I_DIRTY_TIME_EXPIRED 12 -#define I_DIRTY_TIME_EXPIRED (1 << __I_DIRTY_TIME_EXPIRED) +#define I_DIRTY_TIME_EXPIRED (1 << 12) #define I_WB_SWITCH (1 << 13) #define I_OVL_INUSE (1 << 14) #define I_CREATING (1 << 15) +#define I_SYNC_QUEUED (1 << 17) #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) From c7cba792a6f4a33c8371994708c8244004c78d22 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 29 May 2020 16:08:58 +0200 Subject: [PATCH 4/5] writeback: Fix sync livelock due to b_dirty_time processing When we are processing writeback for sync(2), move_expired_inodes() didn't set any inode expiry value (older_than_this). This can result in writeback never completing if there's steady stream of inodes added to b_dirty_time list as writeback rechecks dirty lists after each writeback round whether there's more work to be done. Fix the problem by using sync(2) start time is inode expiry value when processing b_dirty_time list similarly as for ordinarily dirtied inodes. This requires some refactoring of older_than_this handling which simplifies the code noticeably as a bonus. Fixes: 0ae45f63d4ef ("vfs: add support for a lazytime mount option") CC: stable@vger.kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara --- fs/fs-writeback.c | 44 ++++++++++++-------------------- include/trace/events/writeback.h | 13 +++++----- 2 files changed, 23 insertions(+), 34 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index c7702ebdc2c1..3fa57d703c44 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -45,7 +45,6 @@ struct wb_completion { struct wb_writeback_work { long nr_pages; struct super_block *sb; - unsigned long *older_than_this; enum writeback_sync_modes sync_mode; unsigned int tagged_writepages:1; unsigned int for_kupdate:1; @@ -1105,16 +1104,13 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t) #define EXPIRE_DIRTY_ATIME 0x0001 /* - * Move expired (dirtied before work->older_than_this) dirty inodes from + * Move expired (dirtied before dirtied_before) dirty inodes from * @delaying_queue to @dispatch_queue. */ static int move_expired_inodes(struct list_head *delaying_queue, struct list_head *dispatch_queue, - int flags, - struct wb_writeback_work *work) + int flags, unsigned long dirtied_before) { - unsigned long *older_than_this = NULL; - unsigned long expire_time; LIST_HEAD(tmp); struct list_head *pos, *node; struct super_block *sb = NULL; @@ -1122,16 +1118,9 @@ static int move_expired_inodes(struct list_head *delaying_queue, int do_sb_sort = 0; int moved = 0; - if ((flags & EXPIRE_DIRTY_ATIME) == 0) - older_than_this = work->older_than_this; - else if (!work->for_sync) { - expire_time = jiffies - (dirtytime_expire_interval * HZ); - older_than_this = &expire_time; - } while (!list_empty(delaying_queue)) { inode = wb_inode(delaying_queue->prev); - if (older_than_this && - inode_dirtied_after(inode, *older_than_this)) + if (inode_dirtied_after(inode, dirtied_before)) break; list_move(&inode->i_io_list, &tmp); moved++; @@ -1177,18 +1166,22 @@ out: * | * +--> dequeue for IO */ -static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work) +static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work, + unsigned long dirtied_before) { int moved; + unsigned long time_expire_jif = dirtied_before; assert_spin_locked(&wb->list_lock); list_splice_init(&wb->b_more_io, &wb->b_io); - moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work); + moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, dirtied_before); + if (!work->for_sync) + time_expire_jif = jiffies - dirtytime_expire_interval * HZ; moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io, - EXPIRE_DIRTY_ATIME, work); + EXPIRE_DIRTY_ATIME, time_expire_jif); if (moved) wb_io_lists_populated(wb); - trace_writeback_queue_io(wb, work, moved); + trace_writeback_queue_io(wb, work, dirtied_before, moved); } static int write_inode(struct inode *inode, struct writeback_control *wbc) @@ -1700,7 +1693,7 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, blk_start_plug(&plug); spin_lock(&wb->list_lock); if (list_empty(&wb->b_io)) - queue_io(wb, &work); + queue_io(wb, &work, jiffies); __writeback_inodes_wb(wb, &work); spin_unlock(&wb->list_lock); blk_finish_plug(&plug); @@ -1720,7 +1713,7 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, * takes longer than a dirty_writeback_interval interval, then leave a * one-second gap. * - * older_than_this takes precedence over nr_to_write. So we'll only write back + * dirtied_before takes precedence over nr_to_write. So we'll only write back * all dirty pages if they are all attached to "old" mappings. */ static long wb_writeback(struct bdi_writeback *wb, @@ -1728,14 +1721,11 @@ static long wb_writeback(struct bdi_writeback *wb, { unsigned long wb_start = jiffies; long nr_pages = work->nr_pages; - unsigned long oldest_jif; + unsigned long dirtied_before = jiffies; struct inode *inode; long progress; struct blk_plug plug; - oldest_jif = jiffies; - work->older_than_this = &oldest_jif; - blk_start_plug(&plug); spin_lock(&wb->list_lock); for (;;) { @@ -1769,14 +1759,14 @@ static long wb_writeback(struct bdi_writeback *wb, * safe. */ if (work->for_kupdate) { - oldest_jif = jiffies - + dirtied_before = jiffies - msecs_to_jiffies(dirty_expire_interval * 10); } else if (work->for_background) - oldest_jif = jiffies; + dirtied_before = jiffies; trace_writeback_start(wb, work); if (list_empty(&wb->b_io)) - queue_io(wb, work); + queue_io(wb, work, dirtied_before); if (work->sb) progress = writeback_sb_inodes(work->sb, wb, work); else diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 32db72c7c055..29d09755e5cf 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -360,8 +360,9 @@ DEFINE_WBC_EVENT(wbc_writepage); TRACE_EVENT(writeback_queue_io, TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work, + unsigned long dirtied_before, int moved), - TP_ARGS(wb, work, moved), + TP_ARGS(wb, work, dirtied_before, moved), TP_STRUCT__entry( __array(char, name, 32) __field(unsigned long, older) @@ -371,19 +372,17 @@ TRACE_EVENT(writeback_queue_io, __field(unsigned int, cgroup_ino) ), TP_fast_assign( - unsigned long *older_than_this = work->older_than_this; strncpy(__entry->name, dev_name(wb->bdi->dev), 32); - __entry->older = older_than_this ? *older_than_this : 0; - __entry->age = older_than_this ? - (jiffies - *older_than_this) * 1000 / HZ : -1; + __entry->older = dirtied_before; + __entry->age = (jiffies - dirtied_before) * 1000 / HZ; __entry->moved = moved; __entry->reason = work->reason; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup_ino=%u", __entry->name, - __entry->older, /* older_than_this in jiffies */ - __entry->age, /* older_than_this in relative milliseconds */ + __entry->older, /* dirtied_before in jiffies */ + __entry->age, /* dirtied_before in relative milliseconds */ __entry->moved, __print_symbolic(__entry->reason, WB_WORK_REASON), __entry->cgroup_ino From 00f6b03b4176b05c58ab06738c95555b1449cc02 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 29 May 2020 16:24:43 +0200 Subject: [PATCH 5/5] writeback: Drop I_DIRTY_TIME_EXPIRE The only use of I_DIRTY_TIME_EXPIRE is to detect in __writeback_single_inode() that inode got there because flush worker decided it's time to writeback the dirty inode time stamps (either because we are syncing or because of age). However we can detect this directly in __writeback_single_inode() and there's no need for the strange propagation with I_DIRTY_TIME_EXPIRE flag. Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara --- fs/ext4/inode.c | 2 +- fs/fs-writeback.c | 28 +++++++++++----------------- include/linux/fs.h | 1 - include/trace/events/writeback.h | 1 - 4 files changed, 12 insertions(+), 20 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 490a2803bbe2..bfd85d4e3413 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5144,7 +5144,7 @@ static int other_inode_match(struct inode * inode, unsigned long ino, (inode->i_state & I_DIRTY_TIME)) { struct ext4_inode_info *ei = EXT4_I(inode); - inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED); + inode->i_state &= ~I_DIRTY_TIME; spin_unlock(&inode->i_lock); spin_lock(&ei->i_raw_lock); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 3fa57d703c44..5a49dfa9e2bc 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1109,7 +1109,7 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t) */ static int move_expired_inodes(struct list_head *delaying_queue, struct list_head *dispatch_queue, - int flags, unsigned long dirtied_before) + unsigned long dirtied_before) { LIST_HEAD(tmp); struct list_head *pos, *node; @@ -1125,8 +1125,6 @@ static int move_expired_inodes(struct list_head *delaying_queue, list_move(&inode->i_io_list, &tmp); moved++; spin_lock(&inode->i_lock); - if (flags & EXPIRE_DIRTY_ATIME) - inode->i_state |= I_DIRTY_TIME_EXPIRED; inode->i_state |= I_SYNC_QUEUED; spin_unlock(&inode->i_lock); if (sb_is_blkdev_sb(inode->i_sb)) @@ -1174,11 +1172,11 @@ static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work, assert_spin_locked(&wb->list_lock); list_splice_init(&wb->b_more_io, &wb->b_io); - moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, dirtied_before); + moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, dirtied_before); if (!work->for_sync) time_expire_jif = jiffies - dirtytime_expire_interval * HZ; moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io, - EXPIRE_DIRTY_ATIME, time_expire_jif); + time_expire_jif); if (moved) wb_io_lists_populated(wb); trace_writeback_queue_io(wb, work, dirtied_before, moved); @@ -1354,18 +1352,14 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) spin_lock(&inode->i_lock); dirty = inode->i_state & I_DIRTY; - if (inode->i_state & I_DIRTY_TIME) { - if ((dirty & I_DIRTY_INODE) || - wbc->sync_mode == WB_SYNC_ALL || - unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) || - unlikely(time_after(jiffies, - (inode->dirtied_time_when + - dirtytime_expire_interval * HZ)))) { - dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED; - trace_writeback_lazytime(inode); - } - } else - inode->i_state &= ~I_DIRTY_TIME_EXPIRED; + if ((inode->i_state & I_DIRTY_TIME) && + ((dirty & I_DIRTY_INODE) || + wbc->sync_mode == WB_SYNC_ALL || wbc->for_sync || + time_after(jiffies, inode->dirtied_time_when + + dirtytime_expire_interval * HZ))) { + dirty |= I_DIRTY_TIME; + trace_writeback_lazytime(inode); + } inode->i_state &= ~dirty; /* diff --git a/include/linux/fs.h b/include/linux/fs.h index 2a5c7dbd1da5..be331da42955 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2072,7 +2072,6 @@ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) #define I_DIO_WAKEUP (1 << __I_DIO_WAKEUP) #define I_LINKABLE (1 << 10) #define I_DIRTY_TIME (1 << 11) -#define I_DIRTY_TIME_EXPIRED (1 << 12) #define I_WB_SWITCH (1 << 13) #define I_OVL_INUSE (1 << 14) #define I_CREATING (1 << 15) diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 29d09755e5cf..146e7b3faa85 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -20,7 +20,6 @@ {I_CLEAR, "I_CLEAR"}, \ {I_SYNC, "I_SYNC"}, \ {I_DIRTY_TIME, "I_DIRTY_TIME"}, \ - {I_DIRTY_TIME_EXPIRED, "I_DIRTY_TIME_EXPIRED"}, \ {I_REFERENCED, "I_REFERENCED"} \ )