From 841c0b7f6c8d551562286805fc3292d2d30c7af4 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 26 Feb 2017 15:50:52 +0200 Subject: [PATCH 001/555] UPSTREAM: ipv4: add missing initialization for flowi4_uid commit 8bcfd0925ef15f072ba1e7bee2c25e9e1b5fd6ca upstream. Avoid matching of random stack value for uid when rules are looked up on input route or when RP filter is used. Problem should affect only setups that use ip rules with uid range. Fixes: 622ec2c9d524 ("net: core: add UID to flows, rules, and routes") Signed-off-by: Julian Anastasov Signed-off-by: David S. Miller Change-Id: I482d3e283a7515ea2d6a84920d66839341e63be5 Fixes: ("UPSTREAM: net: core: add UID to flows, rules, and routes") Signed-off-by: Amit Pundir --- net/ipv4/fib_frontend.c | 6 +++--- net/ipv4/route.c | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index c8409ca0abd2..2ec005c6217c 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -319,7 +319,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, int ret, no_addr; struct fib_result res; struct flowi4 fl4; - struct net *net; + struct net *net = dev_net(dev); bool dev_match; fl4.flowi4_oif = 0; @@ -332,6 +332,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_tun_key.tun_id = 0; fl4.flowi4_flags = 0; + fl4.flowi4_uid = sock_net_uid(net, NULL); no_addr = idev->ifa_list == NULL; @@ -339,13 +340,12 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, trace_fib_validate_source(dev, &fl4); - net = dev_net(dev); if (fib_lookup(net, &fl4, &res, 0)) goto last_resort; if (res.type != RTN_UNICAST && (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev))) goto e_inval; - if (!rpf && !fib_num_tclassid_users(dev_net(dev)) && + if (!rpf && !fib_num_tclassid_users(net) && (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) goto last_resort; fib_combine_itag(itag, &res); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 80a7a8828876..b21bf597d39e 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1860,6 +1860,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, fl4.flowi4_flags = 0; fl4.daddr = daddr; fl4.saddr = saddr; + fl4.flowi4_uid = sock_net_uid(net, NULL); err = fib_lookup(net, &fl4, &res, 0); if (err != 0) { if (!IN_DEV_FORWARD(in_dev)) From 145b1718f4f0c63bfb79e692c22cc675671c282d Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 26 Apr 2017 14:03:50 +0000 Subject: [PATCH 002/555] UPSTREAM: fib_rules: fix error return code commit adeb45cbb5057731ce9c47aad93756135d7947bf upstream. Fix to return error code -EINVAL from the error handling case instead of 0, as done elsewhere in this function. Fixes: 622ec2c9d524 ("net: core: add UID to flows, rules, and routes") Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller Change-Id: I969dffb6ae4daeb039fc1a479165d8ff28636eec Fixes: ("UPSTREAM: net: core: add UID to flows, rules, and routes") Signed-off-by: Amit Pundir --- net/core/fib_rules.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index b6791d94841d..31c4041f7586 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -425,6 +425,7 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh) if (tb[FRA_TUN_ID]) rule->tun_id = nla_get_be64(tb[FRA_TUN_ID]); + err = -EINVAL; if (tb[FRA_L3MDEV]) { #ifdef CONFIG_NET_L3_MASTER_DEV rule->l3mdev = nla_get_u8(tb[FRA_L3MDEV]); @@ -446,7 +447,6 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh) else rule->suppress_ifgroup = -1; - err = -EINVAL; if (tb[FRA_GOTO]) { if (rule->action != FR_ACT_GOTO) goto errout_free; @@ -576,8 +576,10 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh) if (tb[FRA_UID_RANGE]) { range = nla_get_kuid_range(tb); - if (!uid_range_set(&range)) + if (!uid_range_set(&range)) { + err = -EINVAL; goto errout; + } } else { range = fib_kuid_range_unset; } From 0b4a2f1d168d179eb396dc20943d6c0f1ae09908 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 17 Mar 2017 14:47:22 +0100 Subject: [PATCH 003/555] UPSTREAM: sched/fair: Fix FTQ noise bench regression commit bc4278987e3874da62edf585fe8b3bdd9b53f638 upstream. A regression of the FTQ noise has been reported by Ying Huang, on the following hardware: 8 threads Intel(R) Core(TM)i7-4770 CPU @ 3.40GHz with 8G memory ... which was caused by this commit: commit 4e5160766fcc ("sched/fair: Propagate asynchrous detach") The only part of the patch that can increase the noise is the update of blocked load of group entity in update_blocked_averages(). We can optimize this call and skip the update of group entity if its load and utilization are already null and there is no pending propagation of load in the task group. This optimization partly restores the noise score. A more agressive optimization has been tried but has shown worse score. Reported-by: ying.huang@linux.intel.com Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: ying.huang@intel.com Fixes: 4e5160766fcc ("sched/fair: Propagate asynchrous detach") Link: http://lkml.kernel.org/r/1489758442-2877-1-git-send-email-vincent.guittot@linaro.org [ Fixed typos, improved layout. ] Signed-off-by: Ingo Molnar Change-Id: Iba93bbda71125ce14519aa30b1beac2bf3c52e1e Fixes: Change-Id: I5d1a9f273111c0bb7503ca3c9ad2406d12867cb5 ("UPSTREAM: sched/fair: Propagate asynchrous detach") Signed-off-by: Amit Pundir --- kernel/sched/fair.c | 39 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index febc9abab23d..550fa4cecff8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3168,6 +3168,36 @@ static inline int propagate_entity_load_avg(struct sched_entity *se) return 1; } +/* + * Check if we need to update the load and the utilization of a blocked + * group_entity: + */ +static inline bool skip_blocked_update(struct sched_entity *se) +{ + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + + /* + * If sched_entity still have not zero load or utilization, we have to + * decay it: + */ + if (se->avg.load_avg || se->avg.util_avg) + return false; + + /* + * If there is a pending propagation, we have to update the load and + * the utilization of the sched_entity: + */ + if (gcfs_rq->propagate_avg) + return false; + + /* + * Otherwise, the load and the utilization of the sched_entity is + * already zero and there is no pending propagation, so it will be a + * waste of time to try to decay it: + */ + return true; +} + #else /* CONFIG_FAIR_GROUP_SCHED */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} @@ -8031,6 +8061,8 @@ static void update_blocked_averages(int cpu) * list_add_leaf_cfs_rq() for details. */ for_each_leaf_cfs_rq(rq, cfs_rq) { + struct sched_entity *se; + /* throttled entities do not contribute to load */ if (throttled_hierarchy(cfs_rq)) continue; @@ -8038,9 +8070,10 @@ static void update_blocked_averages(int cpu) if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) update_tg_load_avg(cfs_rq, 0); - /* Propagate pending load changes to the parent */ - if (cfs_rq->tg->se[cpu]) - update_load_avg(cfs_rq->tg->se[cpu], 0); + /* Propagate pending load changes to the parent, if any: */ + se = cfs_rq->tg->se[cpu]; + if (se && !skip_blocked_update(se)) + update_load_avg(se, 0); } raw_spin_unlock_irqrestore(&rq->lock, flags); } From 47589a56b9062b3ef5aa439f594e0ef7cf504bbf Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Mon, 14 Aug 2017 12:07:21 +0200 Subject: [PATCH 004/555] UPSTREAM: drm/atomic: Handle -EDEADLK with out-fences correctly commit 7f5d6dac548b983702dd7aac1d463bd88dff50a8 upstream. complete_crtc_signaling is freeing fence_state, but when retrying num_fences and fence_state are not zero'd. This caused duplicate fd's in the fence_state array, followed by a BUG_ON in fs/file.c because we reallocate freed memory, and installing over an existing fd, or potential other fun. Zero fence_state and num_fences correctly in the retry loop, which allows kms_atomic_transition to pass. Fixes: beaf5af48034 ("drm/fence: add out-fences support") Cc: Gustavo Padovan Cc: Brian Starkey (v10) Cc: Sean Paul Cc: Daniel Vetter Cc: Jani Nikula Cc: David Airlie Signed-off-by: Maarten Lankhorst Cc: # v4.10+ Testcase: kms_atomic_transitions.plane-all-modeset-transition-fencing (with CONFIG_DEBUG_WW_MUTEX_SLOWPATH=y) Link: https://patchwork.freedesktop.org/patch/msgid/20170814100721.13340-1-maarten.lankhorst@linux.intel.com Reviewed-by: Daniel Vetter #intel-gfx on irc Change-Id: I2cfcc8c4d808061a947b2dc977705e738e48b9e8 Fixes: Change-Id: I5fed59f7358aaf3f3acd7a4ca9c85091f23cdbf5 ("BACKPORT: drm/fence: add out-fences support") Signed-off-by: Amit Pundir --- drivers/gpu/drm/drm_atomic.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/drm_atomic.c b/drivers/gpu/drm/drm_atomic.c index 4e16dffa1e2e..33778bf53590 100644 --- a/drivers/gpu/drm/drm_atomic.c +++ b/drivers/gpu/drm/drm_atomic.c @@ -1871,10 +1871,10 @@ int drm_mode_atomic_ioctl(struct drm_device *dev, struct drm_atomic_state *state; struct drm_modeset_acquire_ctx ctx; struct drm_plane *plane; - struct drm_out_fence_state *fence_state = NULL; + struct drm_out_fence_state *fence_state; unsigned plane_mask; int ret = 0; - unsigned int i, j, num_fences = 0; + unsigned int i, j, num_fences; /* disallow for drivers not supporting atomic: */ if (!drm_core_check_feature(dev, DRIVER_ATOMIC)) @@ -1915,6 +1915,8 @@ retry: plane_mask = 0; copied_objs = 0; copied_props = 0; + fence_state = NULL; + num_fences = 0; for (i = 0; i < arg->count_objs; i++) { uint32_t obj_id, count_props; From d9f9b83539ab9b1ebb5cbdfa0a5a9994e20e6a0d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 26 Apr 2017 11:55:26 -0400 Subject: [PATCH 005/555] SUNRPC: Refactor svc_set_num_threads() commit 9e0d87680d689f1758185851c3da6eafb16e71e1 upstream. Refactor to separate out the functions of starting and stopping threads so that they can be used in other helpers. Signed-off-by: Trond Myklebust Tested-and-reviewed-by: Kinglong Mee Signed-off-by: J. Bruce Fields Cc: Jan Hudoba Signed-off-by: Greg Kroah-Hartman --- net/sunrpc/svc.c | 108 ++++++++++++++++++++++++++++------------------- 1 file changed, 64 insertions(+), 44 deletions(-) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 75f290bddca1..70c9040ed7f7 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -702,6 +702,65 @@ found_pool: return task; } +/* create new threads */ +static int +svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) +{ + struct svc_rqst *rqstp; + struct task_struct *task; + struct svc_pool *chosen_pool; + unsigned int state = serv->sv_nrthreads-1; + int node; + + do { + nrservs--; + chosen_pool = choose_pool(serv, pool, &state); + + node = svc_pool_map_get_node(chosen_pool->sp_id); + rqstp = svc_prepare_thread(serv, chosen_pool, node); + if (IS_ERR(rqstp)) + return PTR_ERR(rqstp); + + __module_get(serv->sv_ops->svo_module); + task = kthread_create_on_node(serv->sv_ops->svo_function, rqstp, + node, "%s", serv->sv_name); + if (IS_ERR(task)) { + module_put(serv->sv_ops->svo_module); + svc_exit_thread(rqstp); + return PTR_ERR(task); + } + + rqstp->rq_task = task; + if (serv->sv_nrpools > 1) + svc_pool_map_set_cpumask(task, chosen_pool->sp_id); + + svc_sock_update_bufs(serv); + wake_up_process(task); + } while (nrservs > 0); + + return 0; +} + + +/* destroy old threads */ +static int +svc_signal_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) +{ + struct task_struct *task; + unsigned int state = serv->sv_nrthreads-1; + + /* destroy old threads */ + do { + task = choose_victim(serv, pool, &state); + if (task == NULL) + break; + send_sig(SIGINT, task, 1); + nrservs++; + } while (nrservs < 0); + + return 0; +} + /* * Create or destroy enough new threads to make the number * of threads the given number. If `pool' is non-NULL, applies @@ -719,13 +778,6 @@ found_pool: int svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) { - struct svc_rqst *rqstp; - struct task_struct *task; - struct svc_pool *chosen_pool; - int error = 0; - unsigned int state = serv->sv_nrthreads-1; - int node; - if (pool == NULL) { /* The -1 assumes caller has done a svc_get() */ nrservs -= (serv->sv_nrthreads-1); @@ -735,43 +787,11 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) spin_unlock_bh(&pool->sp_lock); } - /* create new threads */ - while (nrservs > 0) { - nrservs--; - chosen_pool = choose_pool(serv, pool, &state); - - node = svc_pool_map_get_node(chosen_pool->sp_id); - rqstp = svc_prepare_thread(serv, chosen_pool, node); - if (IS_ERR(rqstp)) { - error = PTR_ERR(rqstp); - break; - } - - __module_get(serv->sv_ops->svo_module); - task = kthread_create_on_node(serv->sv_ops->svo_function, rqstp, - node, "%s", serv->sv_name); - if (IS_ERR(task)) { - error = PTR_ERR(task); - module_put(serv->sv_ops->svo_module); - svc_exit_thread(rqstp); - break; - } - - rqstp->rq_task = task; - if (serv->sv_nrpools > 1) - svc_pool_map_set_cpumask(task, chosen_pool->sp_id); - - svc_sock_update_bufs(serv); - wake_up_process(task); - } - /* destroy old threads */ - while (nrservs < 0 && - (task = choose_victim(serv, pool, &state)) != NULL) { - send_sig(SIGINT, task, 1); - nrservs++; - } - - return error; + if (nrservs > 0) + return svc_start_kthreads(serv, pool, nrservs); + if (nrservs < 0) + return svc_signal_kthreads(serv, pool, nrservs); + return 0; } EXPORT_SYMBOL_GPL(svc_set_num_threads); From f609266b12d214437cf9d68245dc27f8d4f69836 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 26 Apr 2017 11:55:27 -0400 Subject: [PATCH 006/555] NFSv4: Fix callback server shutdown commit ed6473ddc704a2005b9900ca08e236ebb2d8540a upstream. We want to use kthread_stop() in order to ensure the threads are shut down before we tear down the nfs_callback_info in nfs_callback_down. Tested-and-reviewed-by: Kinglong Mee Reported-by: Kinglong Mee Fixes: bb6aeba736ba9 ("NFSv4.x: Switch to using svc_set_num_threads()...") Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields Cc: Jan Hudoba Signed-off-by: Greg Kroah-Hartman --- fs/nfs/callback.c | 24 ++++++++++++++++-------- include/linux/sunrpc/svc.h | 1 + net/sunrpc/svc.c | 38 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 8 deletions(-) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 0a2115084c3f..582bfee40345 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -75,7 +75,10 @@ nfs4_callback_svc(void *vrqstp) set_freezable(); - while (!kthread_should_stop()) { + while (!kthread_freezable_should_stop(NULL)) { + + if (signal_pending(current)) + flush_signals(current); /* * Listen for a request on the socket */ @@ -84,6 +87,8 @@ nfs4_callback_svc(void *vrqstp) continue; svc_process(rqstp); } + svc_exit_thread(rqstp); + module_put_and_exit(0); return 0; } @@ -102,9 +107,10 @@ nfs41_callback_svc(void *vrqstp) set_freezable(); - while (!kthread_should_stop()) { - if (try_to_freeze()) - continue; + while (!kthread_freezable_should_stop(NULL)) { + + if (signal_pending(current)) + flush_signals(current); prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); spin_lock_bh(&serv->sv_cb_lock); @@ -120,11 +126,13 @@ nfs41_callback_svc(void *vrqstp) error); } else { spin_unlock_bh(&serv->sv_cb_lock); - schedule(); + if (!kthread_should_stop()) + schedule(); finish_wait(&serv->sv_cb_waitq, &wq); } - flush_signals(current); } + svc_exit_thread(rqstp); + module_put_and_exit(0); return 0; } @@ -220,14 +228,14 @@ err_bind: static struct svc_serv_ops nfs40_cb_sv_ops = { .svo_function = nfs4_callback_svc, .svo_enqueue_xprt = svc_xprt_do_enqueue, - .svo_setup = svc_set_num_threads, + .svo_setup = svc_set_num_threads_sync, .svo_module = THIS_MODULE, }; #if defined(CONFIG_NFS_V4_1) static struct svc_serv_ops nfs41_cb_sv_ops = { .svo_function = nfs41_callback_svc, .svo_enqueue_xprt = svc_xprt_do_enqueue, - .svo_setup = svc_set_num_threads, + .svo_setup = svc_set_num_threads_sync, .svo_module = THIS_MODULE, }; diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 7321ae933867..102c84dcc11a 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -470,6 +470,7 @@ void svc_pool_map_put(void); struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, struct svc_serv_ops *); int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); +int svc_set_num_threads_sync(struct svc_serv *, struct svc_pool *, int); int svc_pool_stats_open(struct svc_serv *serv, struct file *file); void svc_destroy(struct svc_serv *); void svc_shutdown_net(struct svc_serv *, struct net *); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 70c9040ed7f7..272c34551979 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -795,6 +795,44 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) } EXPORT_SYMBOL_GPL(svc_set_num_threads); +/* destroy old threads */ +static int +svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) +{ + struct task_struct *task; + unsigned int state = serv->sv_nrthreads-1; + + /* destroy old threads */ + do { + task = choose_victim(serv, pool, &state); + if (task == NULL) + break; + kthread_stop(task); + nrservs++; + } while (nrservs < 0); + return 0; +} + +int +svc_set_num_threads_sync(struct svc_serv *serv, struct svc_pool *pool, int nrservs) +{ + if (pool == NULL) { + /* The -1 assumes caller has done a svc_get() */ + nrservs -= (serv->sv_nrthreads-1); + } else { + spin_lock_bh(&pool->sp_lock); + nrservs -= pool->sp_nrthreads; + spin_unlock_bh(&pool->sp_lock); + } + + if (nrservs > 0) + return svc_start_kthreads(serv, pool, nrservs); + if (nrservs < 0) + return svc_stop_kthreads(serv, pool, nrservs); + return 0; +} +EXPORT_SYMBOL_GPL(svc_set_num_threads_sync); + /* * Called from a server thread as it's exiting. Caller must hold the "service * mutex" for the service. From 39f5677232ab5b09293506c82e5d7a9f3426cc11 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 12 Dec 2016 16:42:08 -0800 Subject: [PATCH 007/555] mm: prevent double decrease of nr_reserved_highatomic commit 4855e4a7f29d6d10b0b9c84e189c770c9a94e91e upstream. There is race between page freeing and unreserved highatomic. CPU 0 CPU 1 free_hot_cold_page mt = get_pfnblock_migratetype set_pcppage_migratetype(page, mt) unreserve_highatomic_pageblock spin_lock_irqsave(&zone->lock) move_freepages_block set_pageblock_migratetype(page) spin_unlock_irqrestore(&zone->lock) free_pcppages_bulk __free_one_page(mt) <- mt is stale By above race, a page on CPU 0 could go non-highorderatomic free list since the pageblock's type is changed. By that, unreserve logic of highorderatomic can decrease reserved count on a same pageblock severak times and then it will make mismatch between nr_reserved_highatomic and the number of reserved pageblock. So, this patch verifies whether the pageblock is highatomic or not and decrease the count only if the pageblock is highatomic. Link: http://lkml.kernel.org/r/1476259429-18279-3-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Vlastimil Babka Acked-by: Mel Gorman Cc: Joonsoo Kim Cc: Sangseok Lee Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Cc: Miles Chen Signed-off-by: Greg Kroah-Hartman --- mm/page_alloc.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2abf8d5f0ad4..7064aae8ded7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2100,13 +2100,25 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac) continue; /* - * It should never happen but changes to locking could - * inadvertently allow a per-cpu drain to add pages - * to MIGRATE_HIGHATOMIC while unreserving so be safe - * and watch for underflows. + * In page freeing path, migratetype change is racy so + * we can counter several free pages in a pageblock + * in this loop althoug we changed the pageblock type + * from highatomic to ac->migratetype. So we should + * adjust the count once. */ - zone->nr_reserved_highatomic -= min(pageblock_nr_pages, - zone->nr_reserved_highatomic); + if (get_pageblock_migratetype(page) == + MIGRATE_HIGHATOMIC) { + /* + * It should never happen but changes to + * locking could inadvertently allow a per-cpu + * drain to add pages to MIGRATE_HIGHATOMIC + * while unreserving so be safe and watch for + * underflows. + */ + zone->nr_reserved_highatomic -= min( + pageblock_nr_pages, + zone->nr_reserved_highatomic); + } /* * Convert to ac->migratetype and avoid the normal From e148702302c5212f738b9432f16d54027d9f15b7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 22 Jun 2017 15:31:13 +0200 Subject: [PATCH 008/555] orangefs: Don't clear SGID when inheriting ACLs commit b5accbb0dfae36d8d36cd882096943c98d5ede15 upstream. When new directory 'DIR1' is created in a directory 'DIR0' with SGID bit set, DIR1 is expected to have SGID bit set (and owning group equal to the owning group of 'DIR0'). However when 'DIR0' also has some default ACLs that 'DIR1' inherits, setting these ACLs will result in SGID bit on 'DIR1' to get cleared if user is not member of the owning group. Fix the problem by creating __orangefs_set_acl() function that does not call posix_acl_update_mode() and use it when inheriting ACLs. That prevents SGID bit clearing and the mode has been properly set by posix_acl_create() anyway. Fixes: 073931017b49d9458aa351605b43a7e34598caef CC: stable@vger.kernel.org CC: Mike Marshall CC: pvfs2-developers@beowulf-underground.org Signed-off-by: Jan Kara Signed-off-by: Mike Marshall Signed-off-by: Greg Kroah-Hartman --- fs/orangefs/acl.c | 48 +++++++++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c index 7a3754488312..9409aac232f7 100644 --- a/fs/orangefs/acl.c +++ b/fs/orangefs/acl.c @@ -61,9 +61,9 @@ struct posix_acl *orangefs_get_acl(struct inode *inode, int type) return acl; } -int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +static int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl, + int type) { - struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); int error = 0; void *value = NULL; size_t size = 0; @@ -72,22 +72,6 @@ int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type) switch (type) { case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; - if (acl) { - umode_t mode; - - error = posix_acl_update_mode(inode, &mode, &acl); - if (error) { - gossip_err("%s: posix_acl_update_mode err: %d\n", - __func__, - error); - return error; - } - - if (inode->i_mode != mode) - SetModeFlag(orangefs_inode); - inode->i_mode = mode; - mark_inode_dirty_sync(inode); - } break; case ACL_TYPE_DEFAULT: name = XATTR_NAME_POSIX_ACL_DEFAULT; @@ -132,6 +116,29 @@ out: return error; } +int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int error; + + if (type == ACL_TYPE_ACCESS && acl) { + umode_t mode; + + error = posix_acl_update_mode(inode, &mode, &acl); + if (error) { + gossip_err("%s: posix_acl_update_mode err: %d\n", + __func__, + error); + return error; + } + + if (inode->i_mode != mode) + SetModeFlag(ORANGEFS_I(inode)); + inode->i_mode = mode; + mark_inode_dirty_sync(inode); + } + return __orangefs_set_acl(inode, acl, type); +} + int orangefs_init_acl(struct inode *inode, struct inode *dir) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); @@ -146,13 +153,14 @@ int orangefs_init_acl(struct inode *inode, struct inode *dir) return error; if (default_acl) { - error = orangefs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + error = __orangefs_set_acl(inode, default_acl, + ACL_TYPE_DEFAULT); posix_acl_release(default_acl); } if (acl) { if (!error) - error = orangefs_set_acl(inode, acl, ACL_TYPE_ACCESS); + error = __orangefs_set_acl(inode, acl, ACL_TYPE_ACCESS); posix_acl_release(acl); } From a29aeb834a96015f7c38beb501438af54cd85707 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 21 Aug 2017 18:26:20 -0700 Subject: [PATCH 009/555] IB/{qib, hfi1}: Avoid flow control testing for RDMA write operation commit 5b0ef650bd0f820e922fcc42f1985d4621ae19cf upstream. Section 9.7.7.2.5 of the 1.3 IBTA spec clearly says that receive credits should never apply to RDMA write. qib and hfi1 were doing that. The following situation will result in a QP hang: - A prior SEND or RDMA_WRITE with immmediate consumed the last credit for a QP using RC receive buffer credits - The prior op is acked so there are no more acks - The peer ULP fails to post receive for some reason - An RDMA write sees that the credits are exhausted and waits - The peer ULP posts receive buffers - The ULP posts a send or RDMA write that will be hung The fix is to avoid the credit test for the RDMA write operation. Reviewed-by: Kaike Wan Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/hw/hfi1/rc.c | 3 ++- drivers/infiniband/hw/qib/qib_rc.c | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 4bd5b5caa243..613074e963bb 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -551,7 +551,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) case IB_WR_RDMA_WRITE: if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) qp->s_lsn++; - /* FALLTHROUGH */ + goto no_flow_control; case IB_WR_RDMA_WRITE_WITH_IMM: /* If no credit, return. */ if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) && @@ -559,6 +559,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) qp->s_flags |= RVT_S_WAIT_SSN_CREDIT; goto bail; } +no_flow_control: put_ib_reth_vaddr( wqe->rdma_wr.remote_addr, &ohdr->u.rc.reth); diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c index f3fe787c9426..c1523f9a3c12 100644 --- a/drivers/infiniband/hw/qib/qib_rc.c +++ b/drivers/infiniband/hw/qib/qib_rc.c @@ -357,7 +357,7 @@ int qib_make_rc_req(struct rvt_qp *qp, unsigned long *flags) case IB_WR_RDMA_WRITE: if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) qp->s_lsn++; - /* FALLTHROUGH */ + goto no_flow_control; case IB_WR_RDMA_WRITE_WITH_IMM: /* If no credit, return. */ if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) && @@ -365,7 +365,7 @@ int qib_make_rc_req(struct rvt_qp *qp, unsigned long *flags) qp->s_flags |= RVT_S_WAIT_SSN_CREDIT; goto bail; } - +no_flow_control: ohdr->u.rc.reth.vaddr = cpu_to_be64(wqe->rdma_wr.remote_addr); ohdr->u.rc.reth.rkey = From 0fda166fcec852b4008d9256189cef8947041af2 Mon Sep 17 00:00:00 2001 From: Jonathan Liu Date: Mon, 10 Jul 2017 16:55:04 +1000 Subject: [PATCH 010/555] drm/sun4i: Implement drm_driver lastclose to restore fbdev console commit 2a596fc9d974bb040eda9ab70bf8756fcaaa6afe upstream. The drm_driver lastclose callback is called when the last userspace DRM client has closed. Call drm_fbdev_cma_restore_mode to restore the fbdev console otherwise the fbdev console will stop working. Fixes: 9026e0d122ac ("drm: Add Allwinner A10 Display Engine support") Tested-by: Olliver Schinagl Reviewed-by: Chen-Yu Tsai Signed-off-by: Jonathan Liu Signed-off-by: Maxime Ripard [net147@gmail.com: Backport to 4.9, minor context change] Signed-off-by: Jonathan Liu Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/sun4i/sun4i_drv.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/gpu/drm/sun4i/sun4i_drv.c b/drivers/gpu/drm/sun4i/sun4i_drv.c index c3b21865443e..1feec34ca9dd 100644 --- a/drivers/gpu/drm/sun4i/sun4i_drv.c +++ b/drivers/gpu/drm/sun4i/sun4i_drv.c @@ -47,6 +47,13 @@ static void sun4i_drv_disable_vblank(struct drm_device *drm, unsigned int pipe) sun4i_tcon_enable_vblank(tcon, false); } +static void sun4i_drv_lastclose(struct drm_device *dev) +{ + struct sun4i_drv *drv = dev->dev_private; + + drm_fbdev_cma_restore_mode(drv->fbdev); +} + static const struct file_operations sun4i_drv_fops = { .owner = THIS_MODULE, .open = drm_open, @@ -65,6 +72,7 @@ static struct drm_driver sun4i_drv_driver = { .driver_features = DRIVER_GEM | DRIVER_MODESET | DRIVER_PRIME | DRIVER_ATOMIC, /* Generic Operations */ + .lastclose = sun4i_drv_lastclose, .fops = &sun4i_drv_fops, .name = "sun4i-drm", .desc = "Allwinner sun4i Display Engine", From 2f8b06f906fd5e7b71a001c9f8766550d6992331 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Tue, 6 Jun 2017 09:22:00 -0700 Subject: [PATCH 011/555] IB/addr: Fix setting source address in addr6_resolve() commit 79e25959403e6a79552db28a87abed34de32a1df upstream. Commit eea40b8f624f ("infiniband: call ipv6 route lookup via the stub interface") introduced a regression in address resolution when connecting to IPv6 destination addresses. The old code called ip6_route_output(), while the new code calls ipv6_stub->ipv6_dst_lookup(). The two are almost the same, except that ipv6_dst_lookup() also calls ip6_route_get_saddr() if the source address is in6addr_any. This means that the test of ipv6_addr_any(&fl6.saddr) now never succeeds, and so we never copy the source address out. This ends up causing rdma_resolve_addr() to fail, because without a resolved source address, cma_acquire_dev() will fail to find an RDMA device to use. For me, this causes connecting to an NVMe over Fabrics target via RoCE / IPv6 to fail. Fix this by copying out fl6.saddr if ipv6_addr_any() is true for the original source address passed into addr6_resolve(). We can drop our call to ipv6_dev_get_saddr() because ipv6_dst_lookup() already does that work. Fixes: eea40b8f624 ("infiniband: call ipv6 route lookup via the stub interface") Signed-off-by: Roland Dreier Acked-by: Paolo Abeni Signed-off-by: Doug Ledford Signed-off-by: Raju Rangoju Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/core/addr.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 63e82f8e8308..fb4ce0394ac7 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -446,15 +446,10 @@ static int addr6_resolve(struct sockaddr_in6 *src_in, ret = ipv6_stub->ipv6_dst_lookup(addr->net, NULL, &dst, &fl6); if (ret < 0) - goto put; + return ret; rt = (struct rt6_info *)dst; - if (ipv6_addr_any(&fl6.saddr)) { - ret = ipv6_dev_get_saddr(addr->net, ip6_dst_idev(dst)->dev, - &fl6.daddr, 0, &fl6.saddr); - if (ret) - goto put; - + if (ipv6_addr_any(&src_in->sin6_addr)) { src_in->sin6_family = AF_INET6; src_in->sin6_addr = fl6.saddr; } @@ -471,9 +466,6 @@ static int addr6_resolve(struct sockaddr_in6 *src_in, *pdst = dst; return 0; -put: - dst_release(dst); - return ret; } #else static int addr6_resolve(struct sockaddr_in6 *src_in, From f61a07f3fe97c26ec99531be8aad6c7fae3f714d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 20 Jun 2017 23:10:41 +0200 Subject: [PATCH 012/555] tty: improve tty_insert_flip_char() fast path commit 979990c6284814617d8f2179d197f72ff62b5d85 upstream. kernelci.org reports a crazy stack usage for the VT code when CONFIG_KASAN is enabled: drivers/tty/vt/keyboard.c: In function 'kbd_keycode': drivers/tty/vt/keyboard.c:1452:1: error: the frame size of 2240 bytes is larger than 2048 bytes [-Werror=frame-larger-than=] The problem is that tty_insert_flip_char() gets inlined many times into kbd_keycode(), and also into other functions, and each copy requires 128 bytes for stack redzone to check for a possible out-of-bounds access on the 'ch' and 'flags' arguments that are passed into tty_insert_flip_string_flags as a variable-length string. This introduces a new __tty_insert_flip_char() function for the slow path, which receives the two arguments by value. This completely avoids the problem and the stack usage goes back down to around 100 bytes. Without KASAN, this is also slightly better, as we don't have to spill the arguments to the stack but can simply pass 'ch' and 'flag' in registers, saving a few bytes in .text for each call site. This should be backported to linux-4.0 or later, which first introduced the stack sanitizer in the kernel. Fixes: c420f167db8c ("kasan: enable stack instrumentation") Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- drivers/tty/tty_buffer.c | 24 ++++++++++++++++++++++++ include/linux/tty_flip.h | 3 ++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/drivers/tty/tty_buffer.c b/drivers/tty/tty_buffer.c index aa80dc94ddc2..013693b59c29 100644 --- a/drivers/tty/tty_buffer.c +++ b/drivers/tty/tty_buffer.c @@ -361,6 +361,30 @@ int tty_insert_flip_string_flags(struct tty_port *port, } EXPORT_SYMBOL(tty_insert_flip_string_flags); +/** + * __tty_insert_flip_char - Add one character to the tty buffer + * @port: tty port + * @ch: character + * @flag: flag byte + * + * Queue a single byte to the tty buffering, with an optional flag. + * This is the slow path of tty_insert_flip_char. + */ +int __tty_insert_flip_char(struct tty_port *port, unsigned char ch, char flag) +{ + struct tty_buffer *tb = port->buf.tail; + int flags = (flag == TTY_NORMAL) ? TTYB_NORMAL : 0; + + if (!tty_buffer_request_room(port, 1)) + return 0; + + *flag_buf_ptr(tb, tb->used) = flag; + *char_buf_ptr(tb, tb->used++) = ch; + + return 1; +} +EXPORT_SYMBOL(__tty_insert_flip_char); + /** * tty_schedule_flip - push characters to ldisc * @port: tty port to push from diff --git a/include/linux/tty_flip.h b/include/linux/tty_flip.h index c28dd523f96e..d43837f2ce3a 100644 --- a/include/linux/tty_flip.h +++ b/include/linux/tty_flip.h @@ -12,6 +12,7 @@ extern int tty_prepare_flip_string(struct tty_port *port, unsigned char **chars, size_t size); extern void tty_flip_buffer_push(struct tty_port *port); void tty_schedule_flip(struct tty_port *port); +int __tty_insert_flip_char(struct tty_port *port, unsigned char ch, char flag); static inline int tty_insert_flip_char(struct tty_port *port, unsigned char ch, char flag) @@ -26,7 +27,7 @@ static inline int tty_insert_flip_char(struct tty_port *port, *char_buf_ptr(tb, tb->used++) = ch; return 1; } - return tty_insert_flip_string_flags(port, &ch, &flag, 1); + return __tty_insert_flip_char(port, ch, flag); } static inline int tty_insert_flip_string(struct tty_port *port, From 750462424193bfa48155c26e0186f2bb5a4485e2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 20 Jun 2017 23:10:42 +0200 Subject: [PATCH 013/555] tty: improve tty_insert_flip_char() slow path commit 065ea0a7afd64d6cf3464bdd1d8cd227527e2045 upstream. While working on improving the fast path of tty_insert_flip_char(), I noticed that by calling tty_buffer_request_room(), we needlessly move to the separate flag buffer mode for the tty, even when all characters use TTY_NORMAL as the flag. This changes the code to call __tty_buffer_request_room() with the correct flag, which will then allocate a regular buffer when it rounds out of space but no special flags have been used. I'm guessing that this is the behavior that Peter Hurley intended when he introduced the compacted flip buffers. Fixes: acc0f67f307f ("tty: Halve flip buffer GFP_ATOMIC memory consumption") Cc: Peter Hurley Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- drivers/tty/tty_buffer.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/tty/tty_buffer.c b/drivers/tty/tty_buffer.c index 013693b59c29..6b1dc32cfa22 100644 --- a/drivers/tty/tty_buffer.c +++ b/drivers/tty/tty_buffer.c @@ -375,10 +375,11 @@ int __tty_insert_flip_char(struct tty_port *port, unsigned char ch, char flag) struct tty_buffer *tb = port->buf.tail; int flags = (flag == TTY_NORMAL) ? TTYB_NORMAL : 0; - if (!tty_buffer_request_room(port, 1)) + if (!__tty_buffer_request_room(port, 1, flags)) return 0; - *flag_buf_ptr(tb, tb->used) = flag; + if (~tb->flags & TTYB_NORMAL) + *flag_buf_ptr(tb, tb->used) = flag; *char_buf_ptr(tb, tb->used++) = ch; return 1; From 346abf2aca7faeea9f7d615239e94b37b0ee6df2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 2 Aug 2017 13:11:39 +0200 Subject: [PATCH 014/555] tty: fix __tty_insert_flip_char regression commit 8a5a90a2a477b86a3dc2eaa5a706db9bfdd647ca upstream. Sergey noticed a small but fatal mistake in __tty_insert_flip_char, leading to an oops in an interrupt handler when using any serial port. The problem is that I accidentally took the tty_buffer pointer before calling __tty_buffer_request_room(), which replaces the buffer. This moves the pointer lookup to the right place after allocating the new buffer space. Fixes: 979990c62848 ("tty: improve tty_insert_flip_char() fast path") Reported-by: Sergey Senozhatsky Tested-by: Sergey Senozhatsky Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- drivers/tty/tty_buffer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/tty/tty_buffer.c b/drivers/tty/tty_buffer.c index 6b1dc32cfa22..c220c2c0893f 100644 --- a/drivers/tty/tty_buffer.c +++ b/drivers/tty/tty_buffer.c @@ -372,12 +372,13 @@ EXPORT_SYMBOL(tty_insert_flip_string_flags); */ int __tty_insert_flip_char(struct tty_port *port, unsigned char ch, char flag) { - struct tty_buffer *tb = port->buf.tail; + struct tty_buffer *tb; int flags = (flag == TTY_NORMAL) ? TTYB_NORMAL : 0; if (!__tty_buffer_request_room(port, 1, flags)) return 0; + tb = port->buf.tail; if (~tb->flags & TTYB_NORMAL) *flag_buf_ptr(tb, tb->used) = flag; *char_buf_ptr(tb, tb->used++) = ch; From 6053a5fec5692f65b1dcf191ef88f147fc0f7525 Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Mon, 11 Sep 2017 14:11:56 +0800 Subject: [PATCH 015/555] pinctrl/amd: save pin registers over suspend/resume commit 79d2c8bede2c93f9432d7da0bc2f76a195c90fc0 upstream. The touchpad in the Asus laptop models X505BA/BP and X542BA/BP is unresponsive after suspend/resume. The following error appears during resume: i2c_hid i2c-ELAN1300:00: failed to reset device. The problem here is that i2c_hid does not notice the interrupt being generated at this point, because the GPIO is no longer configured for interrupts. Fix this by saving pinctrl-amd pin registers during suspend and restoring them at resume time. Based on code from pinctrl-intel. Signed-off-by: Daniel Drake Signed-off-by: Linus Walleij Signed-off-by: Greg Kroah-Hartman --- drivers/pinctrl/pinctrl-amd.c | 75 +++++++++++++++++++++++++++++++++++ drivers/pinctrl/pinctrl-amd.h | 1 + 2 files changed, 76 insertions(+) diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c index c9a146948192..a5b7bd3c9bac 100644 --- a/drivers/pinctrl/pinctrl-amd.c +++ b/drivers/pinctrl/pinctrl-amd.c @@ -32,6 +32,7 @@ #include #include +#include "core.h" #include "pinctrl-utils.h" #include "pinctrl-amd.h" @@ -712,6 +713,69 @@ static const struct pinconf_ops amd_pinconf_ops = { .pin_config_group_set = amd_pinconf_group_set, }; +#ifdef CONFIG_PM_SLEEP +static bool amd_gpio_should_save(struct amd_gpio *gpio_dev, unsigned int pin) +{ + const struct pin_desc *pd = pin_desc_get(gpio_dev->pctrl, pin); + + if (!pd) + return false; + + /* + * Only restore the pin if it is actually in use by the kernel (or + * by userspace). + */ + if (pd->mux_owner || pd->gpio_owner || + gpiochip_line_is_irq(&gpio_dev->gc, pin)) + return true; + + return false; +} + +int amd_gpio_suspend(struct device *dev) +{ + struct platform_device *pdev = to_platform_device(dev); + struct amd_gpio *gpio_dev = platform_get_drvdata(pdev); + struct pinctrl_desc *desc = gpio_dev->pctrl->desc; + int i; + + for (i = 0; i < desc->npins; i++) { + int pin = desc->pins[i].number; + + if (!amd_gpio_should_save(gpio_dev, pin)) + continue; + + gpio_dev->saved_regs[i] = readl(gpio_dev->base + pin*4); + } + + return 0; +} + +int amd_gpio_resume(struct device *dev) +{ + struct platform_device *pdev = to_platform_device(dev); + struct amd_gpio *gpio_dev = platform_get_drvdata(pdev); + struct pinctrl_desc *desc = gpio_dev->pctrl->desc; + int i; + + for (i = 0; i < desc->npins; i++) { + int pin = desc->pins[i].number; + + if (!amd_gpio_should_save(gpio_dev, pin)) + continue; + + writel(gpio_dev->saved_regs[i], gpio_dev->base + pin*4); + } + + return 0; +} + +static const struct dev_pm_ops amd_gpio_pm_ops = { + SET_LATE_SYSTEM_SLEEP_PM_OPS(amd_gpio_suspend, + amd_gpio_resume) +}; +#endif + static struct pinctrl_desc amd_pinctrl_desc = { .pins = kerncz_pins, .npins = ARRAY_SIZE(kerncz_pins), @@ -751,6 +815,14 @@ static int amd_gpio_probe(struct platform_device *pdev) return -EINVAL; } +#ifdef CONFIG_PM_SLEEP + gpio_dev->saved_regs = devm_kcalloc(&pdev->dev, amd_pinctrl_desc.npins, + sizeof(*gpio_dev->saved_regs), + GFP_KERNEL); + if (!gpio_dev->saved_regs) + return -ENOMEM; +#endif + gpio_dev->pdev = pdev; gpio_dev->gc.direction_input = amd_gpio_direction_input; gpio_dev->gc.direction_output = amd_gpio_direction_output; @@ -839,6 +911,9 @@ static struct platform_driver amd_gpio_driver = { .driver = { .name = "amd_gpio", .acpi_match_table = ACPI_PTR(amd_gpio_acpi_match), +#ifdef CONFIG_PM_SLEEP + .pm = &amd_gpio_pm_ops, +#endif }, .probe = amd_gpio_probe, .remove = amd_gpio_remove, diff --git a/drivers/pinctrl/pinctrl-amd.h b/drivers/pinctrl/pinctrl-amd.h index 7bfea47dbb47..e8bbb20779d0 100644 --- a/drivers/pinctrl/pinctrl-amd.h +++ b/drivers/pinctrl/pinctrl-amd.h @@ -95,6 +95,7 @@ struct amd_gpio { struct gpio_chip gc; struct resource *res; struct platform_device *pdev; + u32 *saved_regs; }; /* KERNCZ configuration*/ From fcaec235666c3bebaa618577b98e8ae330ebb678 Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Fri, 15 Sep 2017 09:36:16 -0700 Subject: [PATCH 016/555] Input: i8042 - add Gigabyte P57 to the keyboard reset table commit 697c5d8a36768b36729533fb44622b35d56d6ad0 upstream. Similar to other Gigabyte laptops, the touchpad on P57 requires a keyboard reset to detect Elantech touchpad correctly. BugLink: https://bugs.launchpad.net/bugs/1594214 Signed-off-by: Kai-Heng Feng Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/serio/i8042-x86ia64io.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/input/serio/i8042-x86ia64io.h b/drivers/input/serio/i8042-x86ia64io.h index 5be14ad29d46..dbf09836ff30 100644 --- a/drivers/input/serio/i8042-x86ia64io.h +++ b/drivers/input/serio/i8042-x86ia64io.h @@ -904,6 +904,13 @@ static const struct dmi_system_id __initconst i8042_dmi_kbdreset_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "P34"), }, }, + { + /* Gigabyte P57 - Elantech touchpad */ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "GIGABYTE"), + DMI_MATCH(DMI_PRODUCT_NAME, "P57"), + }, + }, { /* Schenker XMG C504 - Elantech touchpad */ .matches = { From b234149cf77b0e73f6813d1c72825789d0259b80 Mon Sep 17 00:00:00 2001 From: Aleksandar Markovic Date: Thu, 27 Jul 2017 18:08:48 +0200 Subject: [PATCH 017/555] MIPS: math-emu: .: Fix quiet NaN propagation commit e78bf0dc4789bdea1453595ae89e8db65918e22e upstream. Fix the value returned by . fd,fs,ft, if both inputs are quiet NaNs. The . specifications state that the returned value in such cases should be the quiet NaN contained in register fs. A relevant example: MAX.S fd,fs,ft: If fs contains qNaN1, and ft contains qNaN2, fd is going to contain qNaN1 (without this patch, it used to contain qNaN2). Fixes: a79f5f9ba508 ("MIPS: math-emu: Add support for the MIPS R6 MAX{, A} FPU instruction") Fixes: 4e9561b20e2f ("MIPS: math-emu: Add support for the MIPS R6 MIN{, A} FPU instruction") Signed-off-by: Miodrag Dinic Signed-off-by: Goran Ferenc Signed-off-by: Aleksandar Markovic Reviewed-by: James Hogan Cc: Bo Hu Cc: Douglas Leung Cc: Jin Qian Cc: Paul Burton Cc: Petar Jovanovic Cc: Raghu Gandham Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/16880/ Signed-off-by: Ralf Baechle Signed-off-by: Greg Kroah-Hartman --- arch/mips/math-emu/dp_fmax.c | 32 ++++++++++++++++++++++++++++---- arch/mips/math-emu/dp_fmin.c | 32 ++++++++++++++++++++++++++++---- arch/mips/math-emu/sp_fmax.c | 32 ++++++++++++++++++++++++++++---- arch/mips/math-emu/sp_fmin.c | 32 ++++++++++++++++++++++++++++---- 4 files changed, 112 insertions(+), 16 deletions(-) diff --git a/arch/mips/math-emu/dp_fmax.c b/arch/mips/math-emu/dp_fmax.c index fd71b8daaaf2..41bd6ed852b9 100644 --- a/arch/mips/math-emu/dp_fmax.c +++ b/arch/mips/math-emu/dp_fmax.c @@ -47,14 +47,26 @@ union ieee754dp ieee754dp_fmax(union ieee754dp x, union ieee754dp y) case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF): return ieee754dp_nanxcpt(x); - /* numbers are preferred to NaNs */ + /* + * Quiet NaN handling + */ + + /* + * The case of both inputs quiet NaNs + */ + case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): + return x; + + /* + * The cases of exactly one input quiet NaN (numbers + * are here preferred as returned values to NaNs) + */ case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN): return x; - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM): @@ -147,14 +159,26 @@ union ieee754dp ieee754dp_fmaxa(union ieee754dp x, union ieee754dp y) case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF): return ieee754dp_nanxcpt(x); - /* numbers are preferred to NaNs */ + /* + * Quiet NaN handling + */ + + /* + * The case of both inputs quiet NaNs + */ + case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): + return x; + + /* + * The cases of exactly one input quiet NaN (numbers + * are here preferred as returned values to NaNs) + */ case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN): return x; - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM): diff --git a/arch/mips/math-emu/dp_fmin.c b/arch/mips/math-emu/dp_fmin.c index c1072b0dfb95..53fb8c904e32 100644 --- a/arch/mips/math-emu/dp_fmin.c +++ b/arch/mips/math-emu/dp_fmin.c @@ -47,14 +47,26 @@ union ieee754dp ieee754dp_fmin(union ieee754dp x, union ieee754dp y) case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF): return ieee754dp_nanxcpt(x); - /* numbers are preferred to NaNs */ + /* + * Quiet NaN handling + */ + + /* + * The case of both inputs quiet NaNs + */ + case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): + return x; + + /* + * The cases of exactly one input quiet NaN (numbers + * are here preferred as returned values to NaNs) + */ case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN): return x; - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM): @@ -147,14 +159,26 @@ union ieee754dp ieee754dp_fmina(union ieee754dp x, union ieee754dp y) case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF): return ieee754dp_nanxcpt(x); - /* numbers are preferred to NaNs */ + /* + * Quiet NaN handling + */ + + /* + * The case of both inputs quiet NaNs + */ + case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): + return x; + + /* + * The cases of exactly one input quiet NaN (numbers + * are here preferred as returned values to NaNs) + */ case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN): return x; - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM): diff --git a/arch/mips/math-emu/sp_fmax.c b/arch/mips/math-emu/sp_fmax.c index 4d000844e48e..d0d73c3226dc 100644 --- a/arch/mips/math-emu/sp_fmax.c +++ b/arch/mips/math-emu/sp_fmax.c @@ -47,14 +47,26 @@ union ieee754sp ieee754sp_fmax(union ieee754sp x, union ieee754sp y) case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF): return ieee754sp_nanxcpt(x); - /* numbers are preferred to NaNs */ + /* + * Quiet NaN handling + */ + + /* + * The case of both inputs quiet NaNs + */ + case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): + return x; + + /* + * The cases of exactly one input quiet NaN (numbers + * are here preferred as returned values to NaNs) + */ case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN): return x; - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM): @@ -147,14 +159,26 @@ union ieee754sp ieee754sp_fmaxa(union ieee754sp x, union ieee754sp y) case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF): return ieee754sp_nanxcpt(x); - /* numbers are preferred to NaNs */ + /* + * Quiet NaN handling + */ + + /* + * The case of both inputs quiet NaNs + */ + case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): + return x; + + /* + * The cases of exactly one input quiet NaN (numbers + * are here preferred as returned values to NaNs) + */ case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN): return x; - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM): diff --git a/arch/mips/math-emu/sp_fmin.c b/arch/mips/math-emu/sp_fmin.c index 4eb1bb9e9dec..011692e326af 100644 --- a/arch/mips/math-emu/sp_fmin.c +++ b/arch/mips/math-emu/sp_fmin.c @@ -47,14 +47,26 @@ union ieee754sp ieee754sp_fmin(union ieee754sp x, union ieee754sp y) case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF): return ieee754sp_nanxcpt(x); - /* numbers are preferred to NaNs */ + /* + * Quiet NaN handling + */ + + /* + * The case of both inputs quiet NaNs + */ + case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): + return x; + + /* + * The cases of exactly one input quiet NaN (numbers + * are here preferred as returned values to NaNs) + */ case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN): return x; - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM): @@ -147,14 +159,26 @@ union ieee754sp ieee754sp_fmina(union ieee754sp x, union ieee754sp y) case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF): return ieee754sp_nanxcpt(x); - /* numbers are preferred to NaNs */ + /* + * Quiet NaN handling + */ + + /* + * The case of both inputs quiet NaNs + */ + case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): + return x; + + /* + * The cases of exactly one input quiet NaN (numbers + * are here preferred as returned values to NaNs) + */ case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN): return x; - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM): From 694f6ea0a4e2edd69c1278d4fba77c670276b416 Mon Sep 17 00:00:00 2001 From: Aleksandar Markovic Date: Thu, 27 Jul 2017 18:08:49 +0200 Subject: [PATCH 018/555] MIPS: math-emu: .: Fix cases of both inputs zero commit 15560a58bfd4ff82cdd16b2270d4ef9b06d2cc4d upstream. Fix the value returned by ., if both inputs are zeros. The right behavior in such cases is stated in instruction reference manual and is as follows: fs ft MAX MIN MAXA MINA --------------------------------------------- 0 0 0 0 0 0 0 -0 0 -0 0 -0 -0 0 0 -0 0 -0 -0 -0 -0 -0 -0 -0 Prior to this patch, some of the above cases were yielding correct results. However, for the sake of code consistency, all such cases are rewritten in this patch. A relevant example: MAX.S fd,fs,ft: If fs contains +0.0, and ft contains -0.0, fd is going to contain +0.0 (without this patch, it used to contain -0.0). Fixes: a79f5f9ba508 ("MIPS: math-emu: Add support for the MIPS R6 MAX{, A} FPU instruction") Fixes: 4e9561b20e2f ("MIPS: math-emu: Add support for the MIPS R6 MIN{, A} FPU instruction") Signed-off-by: Miodrag Dinic Signed-off-by: Goran Ferenc Signed-off-by: Aleksandar Markovic Reviewed-by: James Hogan Cc: Bo Hu Cc: Douglas Leung Cc: Jin Qian Cc: Paul Burton Cc: Petar Jovanovic Cc: Raghu Gandham Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/16881/ Signed-off-by: Ralf Baechle Signed-off-by: Greg Kroah-Hartman --- arch/mips/math-emu/dp_fmax.c | 8 ++------ arch/mips/math-emu/dp_fmin.c | 8 ++------ arch/mips/math-emu/sp_fmax.c | 8 ++------ arch/mips/math-emu/sp_fmin.c | 8 ++------ 4 files changed, 8 insertions(+), 24 deletions(-) diff --git a/arch/mips/math-emu/dp_fmax.c b/arch/mips/math-emu/dp_fmax.c index 41bd6ed852b9..31f091a7819b 100644 --- a/arch/mips/math-emu/dp_fmax.c +++ b/arch/mips/math-emu/dp_fmax.c @@ -92,9 +92,7 @@ union ieee754dp ieee754dp_fmax(union ieee754dp x, union ieee754dp y) return ys ? x : y; case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO): - if (xs == ys) - return x; - return ieee754dp_zero(1); + return ieee754dp_zero(xs & ys); case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM): DPDNORMX; @@ -204,9 +202,7 @@ union ieee754dp ieee754dp_fmaxa(union ieee754dp x, union ieee754dp y) return y; case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO): - if (xs == ys) - return x; - return ieee754dp_zero(1); + return ieee754dp_zero(xs & ys); case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM): DPDNORMX; diff --git a/arch/mips/math-emu/dp_fmin.c b/arch/mips/math-emu/dp_fmin.c index 53fb8c904e32..e607d55208ad 100644 --- a/arch/mips/math-emu/dp_fmin.c +++ b/arch/mips/math-emu/dp_fmin.c @@ -92,9 +92,7 @@ union ieee754dp ieee754dp_fmin(union ieee754dp x, union ieee754dp y) return ys ? y : x; case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO): - if (xs == ys) - return x; - return ieee754dp_zero(1); + return ieee754dp_zero(xs | ys); case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM): DPDNORMX; @@ -204,9 +202,7 @@ union ieee754dp ieee754dp_fmina(union ieee754dp x, union ieee754dp y) return y; case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO): - if (xs == ys) - return x; - return ieee754dp_zero(1); + return ieee754dp_zero(xs | ys); case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM): DPDNORMX; diff --git a/arch/mips/math-emu/sp_fmax.c b/arch/mips/math-emu/sp_fmax.c index d0d73c3226dc..3ca5b204e9d0 100644 --- a/arch/mips/math-emu/sp_fmax.c +++ b/arch/mips/math-emu/sp_fmax.c @@ -92,9 +92,7 @@ union ieee754sp ieee754sp_fmax(union ieee754sp x, union ieee754sp y) return ys ? x : y; case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO): - if (xs == ys) - return x; - return ieee754sp_zero(1); + return ieee754sp_zero(xs & ys); case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM): SPDNORMX; @@ -204,9 +202,7 @@ union ieee754sp ieee754sp_fmaxa(union ieee754sp x, union ieee754sp y) return y; case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO): - if (xs == ys) - return x; - return ieee754sp_zero(1); + return ieee754sp_zero(xs & ys); case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM): SPDNORMX; diff --git a/arch/mips/math-emu/sp_fmin.c b/arch/mips/math-emu/sp_fmin.c index 011692e326af..c982647df39a 100644 --- a/arch/mips/math-emu/sp_fmin.c +++ b/arch/mips/math-emu/sp_fmin.c @@ -92,9 +92,7 @@ union ieee754sp ieee754sp_fmin(union ieee754sp x, union ieee754sp y) return ys ? y : x; case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO): - if (xs == ys) - return x; - return ieee754sp_zero(1); + return ieee754sp_zero(xs | ys); case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM): SPDNORMX; @@ -204,9 +202,7 @@ union ieee754sp ieee754sp_fmina(union ieee754sp x, union ieee754sp y) return y; case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO): - if (xs == ys) - return x; - return ieee754sp_zero(1); + return ieee754sp_zero(xs | ys); case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM): SPDNORMX; From d2b6fcb0b6de3911de9b010b67237b19e4cb21ec Mon Sep 17 00:00:00 2001 From: Aleksandar Markovic Date: Thu, 27 Jul 2017 18:08:50 +0200 Subject: [PATCH 019/555] MIPS: math-emu: .: Fix cases of both inputs negative commit aabf5cf02e22ebc4e541adf835910f388b6c3e65 upstream. Fix the value returned by ., if both inputs are negative normal fp numbers. The previous logic did not take into account that if both inputs have the same sign, there should be separate treatment of the cases when both inputs are negative and when both inputs are positive. A relevant example: MAX.S fd,fs,ft: If fs contains -5.0, and ft contains -7.0, fd is going to contain -5.0 (without this patch, it used to contain -7.0). Fixes: a79f5f9ba508 ("MIPS: math-emu: Add support for the MIPS R6 MAX{, A} FPU instruction") Fixes: 4e9561b20e2f ("MIPS: math-emu: Add support for the MIPS R6 MIN{, A} FPU instruction") Signed-off-by: Miodrag Dinic Signed-off-by: Goran Ferenc Signed-off-by: Aleksandar Markovic Reviewed-by: James Hogan Cc: Bo Hu Cc: Douglas Leung Cc: Jin Qian Cc: Paul Burton Cc: Petar Jovanovic Cc: Raghu Gandham Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/16882/ Signed-off-by: Ralf Baechle Signed-off-by: Greg Kroah-Hartman --- arch/mips/math-emu/dp_fmax.c | 32 ++++++++++++++++++++++++-------- arch/mips/math-emu/dp_fmin.c | 32 ++++++++++++++++++++++++-------- arch/mips/math-emu/sp_fmax.c | 32 ++++++++++++++++++++++++-------- arch/mips/math-emu/sp_fmin.c | 32 ++++++++++++++++++++++++-------- 4 files changed, 96 insertions(+), 32 deletions(-) diff --git a/arch/mips/math-emu/dp_fmax.c b/arch/mips/math-emu/dp_fmax.c index 31f091a7819b..0b53c7861101 100644 --- a/arch/mips/math-emu/dp_fmax.c +++ b/arch/mips/math-emu/dp_fmax.c @@ -116,16 +116,32 @@ union ieee754dp ieee754dp_fmax(union ieee754dp x, union ieee754dp y) else if (xs < ys) return x; - /* Compare exponent */ - if (xe > ye) - return x; - else if (xe < ye) - return y; + /* Signs of inputs are equal, let's compare exponents */ + if (xs == 0) { + /* Inputs are both positive */ + if (xe > ye) + return x; + else if (xe < ye) + return y; + } else { + /* Inputs are both negative */ + if (xe > ye) + return y; + else if (xe < ye) + return x; + } - /* Compare mantissa */ + /* Signs and exponents of inputs are equal, let's compare mantissas */ + if (xs == 0) { + /* Inputs are both positive, with equal signs and exponents */ + if (xm <= ym) + return y; + return x; + } + /* Inputs are both negative, with equal signs and exponents */ if (xm <= ym) - return y; - return x; + return x; + return y; } union ieee754dp ieee754dp_fmaxa(union ieee754dp x, union ieee754dp y) diff --git a/arch/mips/math-emu/dp_fmin.c b/arch/mips/math-emu/dp_fmin.c index e607d55208ad..099e6bd55353 100644 --- a/arch/mips/math-emu/dp_fmin.c +++ b/arch/mips/math-emu/dp_fmin.c @@ -116,16 +116,32 @@ union ieee754dp ieee754dp_fmin(union ieee754dp x, union ieee754dp y) else if (xs < ys) return y; - /* Compare exponent */ - if (xe > ye) - return y; - else if (xe < ye) - return x; + /* Signs of inputs are the same, let's compare exponents */ + if (xs == 0) { + /* Inputs are both positive */ + if (xe > ye) + return y; + else if (xe < ye) + return x; + } else { + /* Inputs are both negative */ + if (xe > ye) + return x; + else if (xe < ye) + return y; + } - /* Compare mantissa */ + /* Signs and exponents of inputs are equal, let's compare mantissas */ + if (xs == 0) { + /* Inputs are both positive, with equal signs and exponents */ + if (xm <= ym) + return x; + return y; + } + /* Inputs are both negative, with equal signs and exponents */ if (xm <= ym) - return x; - return y; + return y; + return x; } union ieee754dp ieee754dp_fmina(union ieee754dp x, union ieee754dp y) diff --git a/arch/mips/math-emu/sp_fmax.c b/arch/mips/math-emu/sp_fmax.c index 3ca5b204e9d0..7efa7729bd85 100644 --- a/arch/mips/math-emu/sp_fmax.c +++ b/arch/mips/math-emu/sp_fmax.c @@ -116,16 +116,32 @@ union ieee754sp ieee754sp_fmax(union ieee754sp x, union ieee754sp y) else if (xs < ys) return x; - /* Compare exponent */ - if (xe > ye) - return x; - else if (xe < ye) - return y; + /* Signs of inputs are equal, let's compare exponents */ + if (xs == 0) { + /* Inputs are both positive */ + if (xe > ye) + return x; + else if (xe < ye) + return y; + } else { + /* Inputs are both negative */ + if (xe > ye) + return y; + else if (xe < ye) + return x; + } - /* Compare mantissa */ + /* Signs and exponents of inputs are equal, let's compare mantissas */ + if (xs == 0) { + /* Inputs are both positive, with equal signs and exponents */ + if (xm <= ym) + return y; + return x; + } + /* Inputs are both negative, with equal signs and exponents */ if (xm <= ym) - return y; - return x; + return x; + return y; } union ieee754sp ieee754sp_fmaxa(union ieee754sp x, union ieee754sp y) diff --git a/arch/mips/math-emu/sp_fmin.c b/arch/mips/math-emu/sp_fmin.c index c982647df39a..e2c554359f7b 100644 --- a/arch/mips/math-emu/sp_fmin.c +++ b/arch/mips/math-emu/sp_fmin.c @@ -116,16 +116,32 @@ union ieee754sp ieee754sp_fmin(union ieee754sp x, union ieee754sp y) else if (xs < ys) return y; - /* Compare exponent */ - if (xe > ye) - return y; - else if (xe < ye) - return x; + /* Signs of inputs are the same, let's compare exponents */ + if (xs == 0) { + /* Inputs are both positive */ + if (xe > ye) + return y; + else if (xe < ye) + return x; + } else { + /* Inputs are both negative */ + if (xe > ye) + return x; + else if (xe < ye) + return y; + } - /* Compare mantissa */ + /* Signs and exponents of inputs are equal, let's compare mantissas */ + if (xs == 0) { + /* Inputs are both positive, with equal signs and exponents */ + if (xm <= ym) + return x; + return y; + } + /* Inputs are both negative, with equal signs and exponents */ if (xm <= ym) - return x; - return y; + return y; + return x; } union ieee754sp ieee754sp_fmina(union ieee754sp x, union ieee754sp y) From a04d53797fca646e038c4d66381877ebf018cf60 Mon Sep 17 00:00:00 2001 From: Aleksandar Markovic Date: Thu, 27 Jul 2017 18:08:51 +0200 Subject: [PATCH 020/555] MIPS: math-emu: .: Fix cases of input values with opposite signs commit 1a41b3b441508ae63b1a9ec699ec94065739eb60 upstream. Fix the value returned by ., if the inputs are normal fp numbers of the same absolute value, but opposite signs. A relevant example: MAXA.S fd,fs,ft: If fs contains -3.0, and ft contains +3.0, fd is going to contain +3.0 (without this patch, it used to contain -3.0). Fixes: a79f5f9ba508 ("MIPS: math-emu: Add support for the MIPS R6 MAX{, A} FPU instruction") Fixes: 4e9561b20e2f ("MIPS: math-emu: Add support for the MIPS R6 MIN{, A} FPU instruction") Signed-off-by: Miodrag Dinic Signed-off-by: Goran Ferenc Signed-off-by: Aleksandar Markovic Reviewed-by: James Hogan Cc: Bo Hu Cc: Douglas Leung Cc: Jin Qian Cc: Paul Burton Cc: Petar Jovanovic Cc: Raghu Gandham Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/16883/ Signed-off-by: Ralf Baechle Signed-off-by: Greg Kroah-Hartman --- arch/mips/math-emu/dp_fmax.c | 8 ++++++-- arch/mips/math-emu/dp_fmin.c | 6 +++++- arch/mips/math-emu/sp_fmax.c | 8 ++++++-- arch/mips/math-emu/sp_fmin.c | 6 +++++- 4 files changed, 22 insertions(+), 6 deletions(-) diff --git a/arch/mips/math-emu/dp_fmax.c b/arch/mips/math-emu/dp_fmax.c index 0b53c7861101..81d12bfa6d5e 100644 --- a/arch/mips/math-emu/dp_fmax.c +++ b/arch/mips/math-emu/dp_fmax.c @@ -243,7 +243,11 @@ union ieee754dp ieee754dp_fmaxa(union ieee754dp x, union ieee754dp y) return y; /* Compare mantissa */ - if (xm <= ym) + if (xm < ym) return y; - return x; + else if (xm > ym) + return x; + else if (xs == 0) + return x; + return y; } diff --git a/arch/mips/math-emu/dp_fmin.c b/arch/mips/math-emu/dp_fmin.c index 099e6bd55353..4574f04b44ce 100644 --- a/arch/mips/math-emu/dp_fmin.c +++ b/arch/mips/math-emu/dp_fmin.c @@ -243,7 +243,11 @@ union ieee754dp ieee754dp_fmina(union ieee754dp x, union ieee754dp y) return x; /* Compare mantissa */ - if (xm <= ym) + if (xm < ym) + return x; + else if (xm > ym) + return y; + else if (xs == 1) return x; return y; } diff --git a/arch/mips/math-emu/sp_fmax.c b/arch/mips/math-emu/sp_fmax.c index 7efa7729bd85..fb4149762101 100644 --- a/arch/mips/math-emu/sp_fmax.c +++ b/arch/mips/math-emu/sp_fmax.c @@ -243,7 +243,11 @@ union ieee754sp ieee754sp_fmaxa(union ieee754sp x, union ieee754sp y) return y; /* Compare mantissa */ - if (xm <= ym) + if (xm < ym) return y; - return x; + else if (xm > ym) + return x; + else if (xs == 0) + return x; + return y; } diff --git a/arch/mips/math-emu/sp_fmin.c b/arch/mips/math-emu/sp_fmin.c index e2c554359f7b..7915b9430f68 100644 --- a/arch/mips/math-emu/sp_fmin.c +++ b/arch/mips/math-emu/sp_fmin.c @@ -243,7 +243,11 @@ union ieee754sp ieee754sp_fmina(union ieee754sp x, union ieee754sp y) return x; /* Compare mantissa */ - if (xm <= ym) + if (xm < ym) + return x; + else if (xm > ym) + return y; + else if (xs == 1) return x; return y; } From f7d36f6594b82bd2e26a6ca3254bc0bbcbe167b1 Mon Sep 17 00:00:00 2001 From: Aleksandar Markovic Date: Thu, 27 Jul 2017 18:08:52 +0200 Subject: [PATCH 021/555] MIPS: math-emu: .: Fix cases of both infinite inputs commit 3444c4eb534c20e44f0d6670b34263efaf8b531f upstream. Fix the value returned by . fd,fs,ft, if both inputs are infinite. The previous implementation returned always the value contained in ft in such cases. The correct behavior is specified in Mips instruction set manual and is as follows: fs ft MAXA MINA --------------------------------- inf inf inf inf inf -inf inf -inf -inf inf inf -inf -inf -inf -inf -inf A relevant example: MAXA.S fd,fs,ft: If fs contains +inf, and ft contains -inf, fd is going to contain +inf (without this patch, it used to contain -inf). Fixes: a79f5f9ba508 ("MIPS: math-emu: Add support for the MIPS R6 MAX{, A} FPU instruction") Fixes: 4e9561b20e2f ("MIPS: math-emu: Add support for the MIPS R6 MIN{, A} FPU instruction") Signed-off-by: Miodrag Dinic Signed-off-by: Goran Ferenc Signed-off-by: Aleksandar Markovic Reviewed-by: James Hogan Cc: Bo Hu Cc: Douglas Leung Cc: Jin Qian Cc: Paul Burton Cc: Petar Jovanovic Cc: Raghu Gandham Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/16884/ Signed-off-by: Ralf Baechle Signed-off-by: Greg Kroah-Hartman --- arch/mips/math-emu/dp_fmax.c | 4 +++- arch/mips/math-emu/dp_fmin.c | 4 +++- arch/mips/math-emu/sp_fmax.c | 4 +++- arch/mips/math-emu/sp_fmin.c | 4 +++- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/arch/mips/math-emu/dp_fmax.c b/arch/mips/math-emu/dp_fmax.c index 81d12bfa6d5e..5bec64f2884e 100644 --- a/arch/mips/math-emu/dp_fmax.c +++ b/arch/mips/math-emu/dp_fmax.c @@ -202,6 +202,9 @@ union ieee754dp ieee754dp_fmaxa(union ieee754dp x, union ieee754dp y) /* * Infinity and zero handling */ + case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF): + return ieee754dp_inf(xs & ys); + case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_DNORM): @@ -209,7 +212,6 @@ union ieee754dp ieee754dp_fmaxa(union ieee754dp x, union ieee754dp y) case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_ZERO): return x; - case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_INF): diff --git a/arch/mips/math-emu/dp_fmin.c b/arch/mips/math-emu/dp_fmin.c index 4574f04b44ce..2495bd7333f5 100644 --- a/arch/mips/math-emu/dp_fmin.c +++ b/arch/mips/math-emu/dp_fmin.c @@ -202,6 +202,9 @@ union ieee754dp ieee754dp_fmina(union ieee754dp x, union ieee754dp y) /* * Infinity and zero handling */ + case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF): + return ieee754dp_inf(xs | ys); + case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_DNORM): @@ -209,7 +212,6 @@ union ieee754dp ieee754dp_fmina(union ieee754dp x, union ieee754dp y) case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_ZERO): return x; - case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_INF): diff --git a/arch/mips/math-emu/sp_fmax.c b/arch/mips/math-emu/sp_fmax.c index fb4149762101..74a5a00d2f22 100644 --- a/arch/mips/math-emu/sp_fmax.c +++ b/arch/mips/math-emu/sp_fmax.c @@ -202,6 +202,9 @@ union ieee754sp ieee754sp_fmaxa(union ieee754sp x, union ieee754sp y) /* * Infinity and zero handling */ + case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF): + return ieee754sp_inf(xs & ys); + case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_DNORM): @@ -209,7 +212,6 @@ union ieee754sp ieee754sp_fmaxa(union ieee754sp x, union ieee754sp y) case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_ZERO): return x; - case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_INF): diff --git a/arch/mips/math-emu/sp_fmin.c b/arch/mips/math-emu/sp_fmin.c index 7915b9430f68..42ec4315e66c 100644 --- a/arch/mips/math-emu/sp_fmin.c +++ b/arch/mips/math-emu/sp_fmin.c @@ -202,6 +202,9 @@ union ieee754sp ieee754sp_fmina(union ieee754sp x, union ieee754sp y) /* * Infinity and zero handling */ + case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF): + return ieee754sp_inf(xs | ys); + case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_DNORM): @@ -209,7 +212,6 @@ union ieee754sp ieee754sp_fmina(union ieee754sp x, union ieee754sp y) case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_ZERO): return x; - case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_INF): From 9381a991a36ad5fa7073f87172a859e9635e88e4 Mon Sep 17 00:00:00 2001 From: Aleksandar Markovic Date: Thu, 27 Jul 2017 18:08:53 +0200 Subject: [PATCH 022/555] MIPS: math-emu: MINA.: Fix some cases of infinity and zero inputs commit 304bfe473e70523e591fb1c9223289d355e0bdcb upstream. Fix following special cases for MINA>.: - if one of the inputs is zero, and the other is subnormal, normal, or infinity, the value of the former should be returned (that is, a zero). - if one of the inputs is infinity, and the other input is normal, or subnormal, the value of the latter should be returned. The previous implementation's logic for such cases was incorrect - it appears as if it implements MAXA, and not MINA instruction. A relevant example: MINA.S fd,fs,ft: If fs contains 100.0, and ft contains 0.0, fd is going to contain 0.0 (without this patch, it used to contain 100.0). Fixes: a79f5f9ba508 ("MIPS: math-emu: Add support for the MIPS R6 MAX{, A} FPU instruction") Fixes: 4e9561b20e2f ("MIPS: math-emu: Add support for the MIPS R6 MIN{, A} FPU instruction") Signed-off-by: Miodrag Dinic Signed-off-by: Goran Ferenc Signed-off-by: Aleksandar Markovic Reviewed-by: James Hogan Cc: Bo Hu Cc: Douglas Leung Cc: Jin Qian Cc: Paul Burton Cc: Petar Jovanovic Cc: Raghu Gandham Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/16885/ Signed-off-by: Ralf Baechle Signed-off-by: Greg Kroah-Hartman --- arch/mips/math-emu/dp_fmin.c | 4 ++-- arch/mips/math-emu/sp_fmin.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/mips/math-emu/dp_fmin.c b/arch/mips/math-emu/dp_fmin.c index 2495bd7333f5..a287b23818d8 100644 --- a/arch/mips/math-emu/dp_fmin.c +++ b/arch/mips/math-emu/dp_fmin.c @@ -210,14 +210,14 @@ union ieee754dp ieee754dp_fmina(union ieee754dp x, union ieee754dp y) case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_DNORM): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_ZERO): - return x; + return y; case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_DNORM): - return y; + return x; case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO): return ieee754dp_zero(xs | ys); diff --git a/arch/mips/math-emu/sp_fmin.c b/arch/mips/math-emu/sp_fmin.c index 42ec4315e66c..c51385f46b09 100644 --- a/arch/mips/math-emu/sp_fmin.c +++ b/arch/mips/math-emu/sp_fmin.c @@ -210,14 +210,14 @@ union ieee754sp ieee754sp_fmina(union ieee754sp x, union ieee754sp y) case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_DNORM): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_ZERO): - return x; + return y; case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_DNORM): - return y; + return x; case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO): return ieee754sp_zero(xs | ys); From 4e0694a6411b2a6e3cc1265ed6726c353ea73181 Mon Sep 17 00:00:00 2001 From: Aleksandar Markovic Date: Mon, 19 Jun 2017 17:50:12 +0200 Subject: [PATCH 023/555] MIPS: math-emu: Handle zero accumulator case in MADDF and MSUBF separately commit ddbfff7429a75d954bf5bdff9f2222bceb4c236a upstream. If accumulator value is zero, just return the value of previously calculated product. This brings logic in MADDF/MSUBF implementation closer to the logic in ADD/SUB case. Signed-off-by: Miodrag Dinic Signed-off-by: Goran Ferenc Signed-off-by: Aleksandar Markovic Cc: James.Hogan@imgtec.com Cc: Paul.Burton@imgtec.com Cc: Raghu.Gandham@imgtec.com Cc: Leonid.Yegoshin@imgtec.com Cc: Douglas.Leung@imgtec.com Cc: Petar.Jovanovic@imgtec.com Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/16512/ Signed-off-by: Ralf Baechle Signed-off-by: Greg Kroah-Hartman --- arch/mips/math-emu/dp_maddf.c | 5 ++++- arch/mips/math-emu/sp_maddf.c | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/mips/math-emu/dp_maddf.c b/arch/mips/math-emu/dp_maddf.c index 4a2d03c72959..caa62f20a888 100644 --- a/arch/mips/math-emu/dp_maddf.c +++ b/arch/mips/math-emu/dp_maddf.c @@ -54,7 +54,7 @@ static union ieee754dp _dp_maddf(union ieee754dp z, union ieee754dp x, return ieee754dp_nanxcpt(z); case IEEE754_CLASS_DNORM: DPDNORMZ; - /* QNAN is handled separately below */ + /* QNAN and ZERO cases are handled separately below */ } switch (CLPAIR(xc, yc)) { @@ -210,6 +210,9 @@ static union ieee754dp _dp_maddf(union ieee754dp z, union ieee754dp x, } assert(rm & (DP_HIDDEN_BIT << 3)); + if (zc == IEEE754_CLASS_ZERO) + return ieee754dp_format(rs, re, rm); + /* And now the addition */ assert(zm & DP_HIDDEN_BIT); diff --git a/arch/mips/math-emu/sp_maddf.c b/arch/mips/math-emu/sp_maddf.c index a8cd8b4f235e..c91d5e5d9b5f 100644 --- a/arch/mips/math-emu/sp_maddf.c +++ b/arch/mips/math-emu/sp_maddf.c @@ -54,7 +54,7 @@ static union ieee754sp _sp_maddf(union ieee754sp z, union ieee754sp x, return ieee754sp_nanxcpt(z); case IEEE754_CLASS_DNORM: SPDNORMZ; - /* QNAN is handled separately below */ + /* QNAN and ZERO cases are handled separately below */ } switch (CLPAIR(xc, yc)) { @@ -203,6 +203,9 @@ static union ieee754sp _sp_maddf(union ieee754sp z, union ieee754sp x, } assert(rm & (SP_HIDDEN_BIT << 3)); + if (zc == IEEE754_CLASS_ZERO) + return ieee754sp_format(rs, re, rm); + /* And now the addition */ assert(zm & SP_HIDDEN_BIT); From 4f8479c933a7f5ccdc1355f3c44fcf5d83dab776 Mon Sep 17 00:00:00 2001 From: Aleksandar Markovic Date: Thu, 27 Jul 2017 18:08:54 +0200 Subject: [PATCH 024/555] MIPS: math-emu: .: Fix NaN propagation commit e840be6e7057757befc3581e1699e30fe7f0dd51 upstream. Fix the cases of . when any of three inputs is any NaN. Correct behavior of . fd, fs, ft is following: - if any of inputs is sNaN, return a sNaN using following rules: if only one input is sNaN, return that one; if more than one input is sNaN, order of precedence for return value is fd, fs, ft - if no input is sNaN, but at least one of inputs is qNaN, return a qNaN using following rules: if only one input is qNaN, return that one; if more than one input is qNaN, order of precedence for return value is fd, fs, ft The previous code contained correct handling of some above cases, but not all. Also, such handling was scattered into various cases of "switch (CLPAIR(xc, yc))" statement, and elsewhere. With this patch, this logic is placed in one place, and "switch (CLPAIR(xc, yc))" is significantly simplified. A relevant example: MADDF.S fd,fs,ft: If fs contains qNaN1, ft contains qNaN2, and fd contains qNaN3, fd is going to contain qNaN3 (without this patch, it used to contain qNaN1). Fixes: e24c3bec3e8e ("MIPS: math-emu: Add support for the MIPS R6 MADDF FPU instruction") Fixes: 83d43305a1df ("MIPS: math-emu: Add support for the MIPS R6 MSUBF FPU instruction") Signed-off-by: Miodrag Dinic Signed-off-by: Goran Ferenc Signed-off-by: Aleksandar Markovic Reviewed-by: James Hogan Cc: Bo Hu Cc: Douglas Leung Cc: Jin Qian Cc: Paul Burton Cc: Petar Jovanovic Cc: Raghu Gandham Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/16886/ Signed-off-by: Ralf Baechle Signed-off-by: Greg Kroah-Hartman --- arch/mips/math-emu/dp_maddf.c | 66 +++++++++++----------------------- arch/mips/math-emu/sp_maddf.c | 68 ++++++++++++----------------------- 2 files changed, 42 insertions(+), 92 deletions(-) diff --git a/arch/mips/math-emu/dp_maddf.c b/arch/mips/math-emu/dp_maddf.c index caa62f20a888..8b1bd42aad3c 100644 --- a/arch/mips/math-emu/dp_maddf.c +++ b/arch/mips/math-emu/dp_maddf.c @@ -48,52 +48,34 @@ static union ieee754dp _dp_maddf(union ieee754dp z, union ieee754dp x, ieee754_clearcx(); - switch (zc) { - case IEEE754_CLASS_SNAN: - ieee754_setcx(IEEE754_INVALID_OPERATION); + /* + * Handle the cases when at least one of x, y or z is a NaN. + * Order of precedence is sNaN, qNaN and z, x, y. + */ + if (zc == IEEE754_CLASS_SNAN) return ieee754dp_nanxcpt(z); - case IEEE754_CLASS_DNORM: - DPDNORMZ; - /* QNAN and ZERO cases are handled separately below */ - } - - switch (CLPAIR(xc, yc)) { - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_SNAN): - case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_SNAN): - case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_SNAN): - case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_SNAN): - case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_SNAN): - return ieee754dp_nanxcpt(y); - - case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_SNAN): - case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_QNAN): - case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_ZERO): - case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_NORM): - case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_DNORM): - case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF): + if (xc == IEEE754_CLASS_SNAN) return ieee754dp_nanxcpt(x); - - case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN): - case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN): - case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN): - case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN): + if (yc == IEEE754_CLASS_SNAN) + return ieee754dp_nanxcpt(y); + if (zc == IEEE754_CLASS_QNAN) + return z; + if (xc == IEEE754_CLASS_QNAN) + return x; + if (yc == IEEE754_CLASS_QNAN) return y; - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO): - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM): - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM): - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_INF): - return x; + if (zc == IEEE754_CLASS_DNORM) + DPDNORMZ; + /* ZERO z cases are handled separately below */ + switch (CLPAIR(xc, yc)) { /* * Infinity handling */ case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_INF): - if (zc == IEEE754_CLASS_QNAN) - return z; ieee754_setcx(IEEE754_INVALID_OPERATION); return ieee754dp_indef(); @@ -102,8 +84,6 @@ static union ieee754dp _dp_maddf(union ieee754dp z, union ieee754dp x, case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_DNORM): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF): - if (zc == IEEE754_CLASS_QNAN) - return z; return ieee754dp_inf(xs ^ ys); case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO): @@ -120,25 +100,19 @@ static union ieee754dp _dp_maddf(union ieee754dp z, union ieee754dp x, DPDNORMX; case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_DNORM): - if (zc == IEEE754_CLASS_QNAN) - return z; - else if (zc == IEEE754_CLASS_INF) + if (zc == IEEE754_CLASS_INF) return ieee754dp_inf(zs); DPDNORMY; break; case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_NORM): - if (zc == IEEE754_CLASS_QNAN) - return z; - else if (zc == IEEE754_CLASS_INF) + if (zc == IEEE754_CLASS_INF) return ieee754dp_inf(zs); DPDNORMX; break; case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_NORM): - if (zc == IEEE754_CLASS_QNAN) - return z; - else if (zc == IEEE754_CLASS_INF) + if (zc == IEEE754_CLASS_INF) return ieee754dp_inf(zs); /* fall through to real computations */ } diff --git a/arch/mips/math-emu/sp_maddf.c b/arch/mips/math-emu/sp_maddf.c index c91d5e5d9b5f..6cdaa2a85c9d 100644 --- a/arch/mips/math-emu/sp_maddf.c +++ b/arch/mips/math-emu/sp_maddf.c @@ -48,51 +48,35 @@ static union ieee754sp _sp_maddf(union ieee754sp z, union ieee754sp x, ieee754_clearcx(); - switch (zc) { - case IEEE754_CLASS_SNAN: - ieee754_setcx(IEEE754_INVALID_OPERATION); + /* + * Handle the cases when at least one of x, y or z is a NaN. + * Order of precedence is sNaN, qNaN and z, x, y. + */ + if (zc == IEEE754_CLASS_SNAN) return ieee754sp_nanxcpt(z); - case IEEE754_CLASS_DNORM: - SPDNORMZ; - /* QNAN and ZERO cases are handled separately below */ - } - - switch (CLPAIR(xc, yc)) { - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_SNAN): - case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_SNAN): - case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_SNAN): - case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_SNAN): - case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_SNAN): - return ieee754sp_nanxcpt(y); - - case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_SNAN): - case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_QNAN): - case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_ZERO): - case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_NORM): - case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_DNORM): - case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF): + if (xc == IEEE754_CLASS_SNAN) return ieee754sp_nanxcpt(x); - - case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN): - case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN): - case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN): - case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN): + if (yc == IEEE754_CLASS_SNAN) + return ieee754sp_nanxcpt(y); + if (zc == IEEE754_CLASS_QNAN) + return z; + if (xc == IEEE754_CLASS_QNAN) + return x; + if (yc == IEEE754_CLASS_QNAN) return y; - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO): - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM): - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM): - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_INF): - return x; + if (zc == IEEE754_CLASS_DNORM) + SPDNORMZ; + /* ZERO z cases are handled separately below */ + + switch (CLPAIR(xc, yc)) { + /* * Infinity handling */ case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_INF): - if (zc == IEEE754_CLASS_QNAN) - return z; ieee754_setcx(IEEE754_INVALID_OPERATION); return ieee754sp_indef(); @@ -101,8 +85,6 @@ static union ieee754sp _sp_maddf(union ieee754sp z, union ieee754sp x, case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_DNORM): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF): - if (zc == IEEE754_CLASS_QNAN) - return z; return ieee754sp_inf(xs ^ ys); case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO): @@ -119,25 +101,19 @@ static union ieee754sp _sp_maddf(union ieee754sp z, union ieee754sp x, SPDNORMX; case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_DNORM): - if (zc == IEEE754_CLASS_QNAN) - return z; - else if (zc == IEEE754_CLASS_INF) + if (zc == IEEE754_CLASS_INF) return ieee754sp_inf(zs); SPDNORMY; break; case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_NORM): - if (zc == IEEE754_CLASS_QNAN) - return z; - else if (zc == IEEE754_CLASS_INF) + if (zc == IEEE754_CLASS_INF) return ieee754sp_inf(zs); SPDNORMX; break; case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_NORM): - if (zc == IEEE754_CLASS_QNAN) - return z; - else if (zc == IEEE754_CLASS_INF) + if (zc == IEEE754_CLASS_INF) return ieee754sp_inf(zs); /* fall through to real computations */ } From 8981bcaf9a2da24baeb76cd39d43cf2bc2569509 Mon Sep 17 00:00:00 2001 From: Aleksandar Markovic Date: Thu, 27 Jul 2017 18:08:55 +0200 Subject: [PATCH 025/555] MIPS: math-emu: .: Fix some cases of infinite inputs commit 0c64fe6348687f0e1cea9a608eae9d351124a73a upstream. Fix the cases of . when any of two multiplicands is infinity. The correct behavior in such cases is affected by the nature of third input. Cases of addition of infinities with opposite signs and subtraction of infinities with same signs may arise and must be handles separately. Also, the value od flags argument (that determines whether the instruction is MADDF or MSUBF) affects the outcome. Relevant examples: MADDF.S fd,fs,ft: If fs contains +inf, ft contains +inf, and fd contains -inf, fd is going to contain indef (without this patch, it used to contain -inf). MSUBF.S fd,fs,ft: If fs contains +inf, ft contains 1.0, and fd contains +0.0, fd is going to contain -inf (without this patch, it used to contain +inf). Fixes: e24c3bec3e8e ("MIPS: math-emu: Add support for the MIPS R6 MADDF FPU instruction") Fixes: 83d43305a1df ("MIPS: math-emu: Add support for the MIPS R6 MSUBF FPU instruction") Signed-off-by: Douglas Leung Signed-off-by: Miodrag Dinic Signed-off-by: Goran Ferenc Signed-off-by: Aleksandar Markovic Reviewed-by: James Hogan Cc: Douglas Leung Cc: Bo Hu Cc: Jin Qian Cc: Paul Burton Cc: Petar Jovanovic Cc: Raghu Gandham Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/16887/ Signed-off-by: Ralf Baechle Signed-off-by: Greg Kroah-Hartman --- arch/mips/math-emu/dp_maddf.c | 22 +++++++++++++++++++++- arch/mips/math-emu/sp_maddf.c | 22 +++++++++++++++++++++- 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/arch/mips/math-emu/dp_maddf.c b/arch/mips/math-emu/dp_maddf.c index 8b1bd42aad3c..557a0a118827 100644 --- a/arch/mips/math-emu/dp_maddf.c +++ b/arch/mips/math-emu/dp_maddf.c @@ -84,7 +84,27 @@ static union ieee754dp _dp_maddf(union ieee754dp z, union ieee754dp x, case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_DNORM): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF): - return ieee754dp_inf(xs ^ ys); + if ((zc == IEEE754_CLASS_INF) && + ((!(flags & maddf_negate_product) && (zs != (xs ^ ys))) || + ((flags & maddf_negate_product) && (zs == (xs ^ ys))))) { + /* + * Cases of addition of infinities with opposite signs + * or subtraction of infinities with same signs. + */ + ieee754_setcx(IEEE754_INVALID_OPERATION); + return ieee754dp_indef(); + } + /* + * z is here either not an infinity, or an infinity having the + * same sign as product (x*y) (in case of MADDF.D instruction) + * or product -(x*y) (in MSUBF.D case). The result must be an + * infinity, and its sign is determined only by the value of + * (flags & maddf_negate_product) and the signs of x and y. + */ + if (flags & maddf_negate_product) + return ieee754dp_inf(1 ^ (xs ^ ys)); + else + return ieee754dp_inf(xs ^ ys); case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_NORM): diff --git a/arch/mips/math-emu/sp_maddf.c b/arch/mips/math-emu/sp_maddf.c index 6cdaa2a85c9d..0d8d25f4b737 100644 --- a/arch/mips/math-emu/sp_maddf.c +++ b/arch/mips/math-emu/sp_maddf.c @@ -85,7 +85,27 @@ static union ieee754sp _sp_maddf(union ieee754sp z, union ieee754sp x, case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_DNORM): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF): - return ieee754sp_inf(xs ^ ys); + if ((zc == IEEE754_CLASS_INF) && + ((!(flags & maddf_negate_product) && (zs != (xs ^ ys))) || + ((flags & maddf_negate_product) && (zs == (xs ^ ys))))) { + /* + * Cases of addition of infinities with opposite signs + * or subtraction of infinities with same signs. + */ + ieee754_setcx(IEEE754_INVALID_OPERATION); + return ieee754sp_indef(); + } + /* + * z is here either not an infinity, or an infinity having the + * same sign as product (x*y) (in case of MADDF.D instruction) + * or product -(x*y) (in MSUBF.D case). The result must be an + * infinity, and its sign is determined only by the value of + * (flags & maddf_negate_product) and the signs of x and y. + */ + if (flags & maddf_negate_product) + return ieee754sp_inf(1 ^ (xs ^ ys)); + else + return ieee754sp_inf(xs ^ ys); case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_NORM): From d56a9caf6d836d22868404818a652fc85744774a Mon Sep 17 00:00:00 2001 From: Aleksandar Markovic Date: Thu, 27 Jul 2017 18:08:56 +0200 Subject: [PATCH 026/555] MIPS: math-emu: .: Fix some cases of zero inputs commit 7cf64ce4d37f1b4f44365fcf77f565d523819dcd upstream. Fix the cases of . when any of two multiplicands is +0 or -0, and the third input is also +0 or -0. Depending on the signs of inputs, certain special cases must be handled. A relevant example: MADDF.S fd,fs,ft: If fs contains +0.0, ft contains -0.0, and fd contains 0.0, fd is going to contain +0.0 (without this patch, it used to contain -0.0). Fixes: e24c3bec3e8e ("MIPS: math-emu: Add support for the MIPS R6 MADDF FPU instruction") Fixes: 83d43305a1df ("MIPS: math-emu: Add support for the MIPS R6 MSUBF FPU instruction") Signed-off-by: Miodrag Dinic Signed-off-by: Goran Ferenc Signed-off-by: Aleksandar Markovic Reviewed-by: James Hogan Cc: Bo Hu Cc: Douglas Leung Cc: Jin Qian Cc: Paul Burton Cc: Petar Jovanovic Cc: Raghu Gandham Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/16888/ Signed-off-by: Ralf Baechle Signed-off-by: Greg Kroah-Hartman --- arch/mips/math-emu/dp_maddf.c | 18 +++++++++++++++++- arch/mips/math-emu/sp_maddf.c | 18 +++++++++++++++++- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/arch/mips/math-emu/dp_maddf.c b/arch/mips/math-emu/dp_maddf.c index 557a0a118827..c38fe1bde875 100644 --- a/arch/mips/math-emu/dp_maddf.c +++ b/arch/mips/math-emu/dp_maddf.c @@ -113,7 +113,23 @@ static union ieee754dp _dp_maddf(union ieee754dp z, union ieee754dp x, case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_ZERO): if (zc == IEEE754_CLASS_INF) return ieee754dp_inf(zs); - /* Multiplication is 0 so just return z */ + if (zc == IEEE754_CLASS_ZERO) { + /* Handle cases +0 + (-0) and similar ones. */ + if ((!(flags & maddf_negate_product) + && (zs == (xs ^ ys))) || + ((flags & maddf_negate_product) + && (zs != (xs ^ ys)))) + /* + * Cases of addition of zeros of equal signs + * or subtraction of zeroes of opposite signs. + * The sign of the resulting zero is in any + * such case determined only by the sign of z. + */ + return z; + + return ieee754dp_zero(ieee754_csr.rm == FPU_CSR_RD); + } + /* x*y is here 0, and z is not 0, so just return z */ return z; case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM): diff --git a/arch/mips/math-emu/sp_maddf.c b/arch/mips/math-emu/sp_maddf.c index 0d8d25f4b737..4241ec19df12 100644 --- a/arch/mips/math-emu/sp_maddf.c +++ b/arch/mips/math-emu/sp_maddf.c @@ -114,7 +114,23 @@ static union ieee754sp _sp_maddf(union ieee754sp z, union ieee754sp x, case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_ZERO): if (zc == IEEE754_CLASS_INF) return ieee754sp_inf(zs); - /* Multiplication is 0 so just return z */ + if (zc == IEEE754_CLASS_ZERO) { + /* Handle cases +0 + (-0) and similar ones. */ + if ((!(flags & maddf_negate_product) + && (zs == (xs ^ ys))) || + ((flags & maddf_negate_product) + && (zs != (xs ^ ys)))) + /* + * Cases of addition of zeros of equal signs + * or subtraction of zeroes of opposite signs. + * The sign of the resulting zero is in any + * such case determined only by the sign of z. + */ + return z; + + return ieee754sp_zero(ieee754_csr.rm == FPU_CSR_RD); + } + /* x*y is here 0, and z is not 0, so just return z */ return z; case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM): From 5cabf999fdb74cc5eddd463f63c5a559f0f784c0 Mon Sep 17 00:00:00 2001 From: Aleksandar Markovic Date: Thu, 27 Jul 2017 18:08:57 +0200 Subject: [PATCH 027/555] MIPS: math-emu: .: Clean up "maddf_flags" enumeration commit ae11c0619973ffd73a496308d8a1cb5e1a353737 upstream. Fix definition and usage of "maddf_flags" enumeration. Avoid duplicate definition and apply more common capitalization. This patch does not change any scenario. It just makes MADDF and MSUBF emulation code more readable and easier to maintain, and hopefully prevents future bugs as well. Signed-off-by: Miodrag Dinic Signed-off-by: Goran Ferenc Signed-off-by: Aleksandar Markovic Reviewed-by: James Hogan Cc: Bo Hu Cc: Douglas Leung Cc: Jin Qian Cc: Paul Burton Cc: Petar Jovanovic Cc: Raghu Gandham Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/16889/ Signed-off-by: Ralf Baechle Signed-off-by: Greg Kroah-Hartman --- arch/mips/math-emu/dp_maddf.c | 19 ++++++++----------- arch/mips/math-emu/ieee754int.h | 4 ++++ arch/mips/math-emu/sp_maddf.c | 19 ++++++++----------- 3 files changed, 20 insertions(+), 22 deletions(-) diff --git a/arch/mips/math-emu/dp_maddf.c b/arch/mips/math-emu/dp_maddf.c index c38fe1bde875..e799fc826b0c 100644 --- a/arch/mips/math-emu/dp_maddf.c +++ b/arch/mips/math-emu/dp_maddf.c @@ -14,9 +14,6 @@ #include "ieee754dp.h" -enum maddf_flags { - maddf_negate_product = 1 << 0, -}; static union ieee754dp _dp_maddf(union ieee754dp z, union ieee754dp x, union ieee754dp y, enum maddf_flags flags) @@ -85,8 +82,8 @@ static union ieee754dp _dp_maddf(union ieee754dp z, union ieee754dp x, case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_DNORM): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF): if ((zc == IEEE754_CLASS_INF) && - ((!(flags & maddf_negate_product) && (zs != (xs ^ ys))) || - ((flags & maddf_negate_product) && (zs == (xs ^ ys))))) { + ((!(flags & MADDF_NEGATE_PRODUCT) && (zs != (xs ^ ys))) || + ((flags & MADDF_NEGATE_PRODUCT) && (zs == (xs ^ ys))))) { /* * Cases of addition of infinities with opposite signs * or subtraction of infinities with same signs. @@ -99,9 +96,9 @@ static union ieee754dp _dp_maddf(union ieee754dp z, union ieee754dp x, * same sign as product (x*y) (in case of MADDF.D instruction) * or product -(x*y) (in MSUBF.D case). The result must be an * infinity, and its sign is determined only by the value of - * (flags & maddf_negate_product) and the signs of x and y. + * (flags & MADDF_NEGATE_PRODUCT) and the signs of x and y. */ - if (flags & maddf_negate_product) + if (flags & MADDF_NEGATE_PRODUCT) return ieee754dp_inf(1 ^ (xs ^ ys)); else return ieee754dp_inf(xs ^ ys); @@ -115,9 +112,9 @@ static union ieee754dp _dp_maddf(union ieee754dp z, union ieee754dp x, return ieee754dp_inf(zs); if (zc == IEEE754_CLASS_ZERO) { /* Handle cases +0 + (-0) and similar ones. */ - if ((!(flags & maddf_negate_product) + if ((!(flags & MADDF_NEGATE_PRODUCT) && (zs == (xs ^ ys))) || - ((flags & maddf_negate_product) + ((flags & MADDF_NEGATE_PRODUCT) && (zs != (xs ^ ys)))) /* * Cases of addition of zeros of equal signs @@ -167,7 +164,7 @@ static union ieee754dp _dp_maddf(union ieee754dp z, union ieee754dp x, re = xe + ye; rs = xs ^ ys; - if (flags & maddf_negate_product) + if (flags & MADDF_NEGATE_PRODUCT) rs ^= 1; /* shunt to top of word */ @@ -291,5 +288,5 @@ union ieee754dp ieee754dp_maddf(union ieee754dp z, union ieee754dp x, union ieee754dp ieee754dp_msubf(union ieee754dp z, union ieee754dp x, union ieee754dp y) { - return _dp_maddf(z, x, y, maddf_negate_product); + return _dp_maddf(z, x, y, MADDF_NEGATE_PRODUCT); } diff --git a/arch/mips/math-emu/ieee754int.h b/arch/mips/math-emu/ieee754int.h index 8bc2f6963324..dd2071f430e0 100644 --- a/arch/mips/math-emu/ieee754int.h +++ b/arch/mips/math-emu/ieee754int.h @@ -26,6 +26,10 @@ #define CLPAIR(x, y) ((x)*6+(y)) +enum maddf_flags { + MADDF_NEGATE_PRODUCT = 1 << 0, +}; + static inline void ieee754_clearcx(void) { ieee754_csr.cx = 0; diff --git a/arch/mips/math-emu/sp_maddf.c b/arch/mips/math-emu/sp_maddf.c index 4241ec19df12..07f5a9bb1312 100644 --- a/arch/mips/math-emu/sp_maddf.c +++ b/arch/mips/math-emu/sp_maddf.c @@ -14,9 +14,6 @@ #include "ieee754sp.h" -enum maddf_flags { - maddf_negate_product = 1 << 0, -}; static union ieee754sp _sp_maddf(union ieee754sp z, union ieee754sp x, union ieee754sp y, enum maddf_flags flags) @@ -86,8 +83,8 @@ static union ieee754sp _sp_maddf(union ieee754sp z, union ieee754sp x, case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_DNORM): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF): if ((zc == IEEE754_CLASS_INF) && - ((!(flags & maddf_negate_product) && (zs != (xs ^ ys))) || - ((flags & maddf_negate_product) && (zs == (xs ^ ys))))) { + ((!(flags & MADDF_NEGATE_PRODUCT) && (zs != (xs ^ ys))) || + ((flags & MADDF_NEGATE_PRODUCT) && (zs == (xs ^ ys))))) { /* * Cases of addition of infinities with opposite signs * or subtraction of infinities with same signs. @@ -100,9 +97,9 @@ static union ieee754sp _sp_maddf(union ieee754sp z, union ieee754sp x, * same sign as product (x*y) (in case of MADDF.D instruction) * or product -(x*y) (in MSUBF.D case). The result must be an * infinity, and its sign is determined only by the value of - * (flags & maddf_negate_product) and the signs of x and y. + * (flags & MADDF_NEGATE_PRODUCT) and the signs of x and y. */ - if (flags & maddf_negate_product) + if (flags & MADDF_NEGATE_PRODUCT) return ieee754sp_inf(1 ^ (xs ^ ys)); else return ieee754sp_inf(xs ^ ys); @@ -116,9 +113,9 @@ static union ieee754sp _sp_maddf(union ieee754sp z, union ieee754sp x, return ieee754sp_inf(zs); if (zc == IEEE754_CLASS_ZERO) { /* Handle cases +0 + (-0) and similar ones. */ - if ((!(flags & maddf_negate_product) + if ((!(flags & MADDF_NEGATE_PRODUCT) && (zs == (xs ^ ys))) || - ((flags & maddf_negate_product) + ((flags & MADDF_NEGATE_PRODUCT) && (zs != (xs ^ ys)))) /* * Cases of addition of zeros of equal signs @@ -170,7 +167,7 @@ static union ieee754sp _sp_maddf(union ieee754sp z, union ieee754sp x, re = xe + ye; rs = xs ^ ys; - if (flags & maddf_negate_product) + if (flags & MADDF_NEGATE_PRODUCT) rs ^= 1; /* shunt to top of word */ @@ -287,5 +284,5 @@ union ieee754sp ieee754sp_maddf(union ieee754sp z, union ieee754sp x, union ieee754sp ieee754sp_msubf(union ieee754sp z, union ieee754sp x, union ieee754sp y) { - return _sp_maddf(z, x, y, maddf_negate_product); + return _sp_maddf(z, x, y, MADDF_NEGATE_PRODUCT); } From d2b488ee6f63d82bc39fa3199d6a5c71bec5ee4a Mon Sep 17 00:00:00 2001 From: Douglas Leung Date: Thu, 27 Jul 2017 18:08:58 +0200 Subject: [PATCH 028/555] MIPS: math-emu: .S: Fix accuracy (32-bit case) commit b3b8e1eb27c523e32b6a8aa7ec8ac4754456af57 upstream. Implement fused multiply-add with correct accuracy. Fused multiply-add operation has better accuracy than respective sequential execution of multiply and add operations applied on the same inputs. This is because accuracy errors accumulate in latter case. This patch implements fused multiply-add with the same accuracy as it is implemented in hardware, using 64-bit intermediate calculations. One test case example (raw bits) that this patch fixes: MADDF.S fd,fs,ft: fd = 0x22575225 fs = ft = 0x3727c5ac Fixes: e24c3bec3e8e ("MIPS: math-emu: Add support for the MIPS R6 MADDF FPU instruction") Fixes: 83d43305a1df ("MIPS: math-emu: Add support for the MIPS R6 MSUBF FPU instruction") Signed-off-by: Douglas Leung Signed-off-by: Miodrag Dinic Signed-off-by: Goran Ferenc Signed-off-by: Aleksandar Markovic Cc: Douglas Leung Cc: Bo Hu Cc: James Hogan Cc: Jin Qian Cc: Paul Burton Cc: Petar Jovanovic Cc: Raghu Gandham Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/16890/ Signed-off-by: Ralf Baechle Signed-off-by: Greg Kroah-Hartman --- arch/mips/math-emu/ieee754sp.h | 4 ++ arch/mips/math-emu/sp_maddf.c | 116 +++++++++++++-------------------- 2 files changed, 50 insertions(+), 70 deletions(-) diff --git a/arch/mips/math-emu/ieee754sp.h b/arch/mips/math-emu/ieee754sp.h index 8476067075fe..0f63e4202cff 100644 --- a/arch/mips/math-emu/ieee754sp.h +++ b/arch/mips/math-emu/ieee754sp.h @@ -45,6 +45,10 @@ static inline int ieee754sp_finite(union ieee754sp x) return SPBEXP(x) != SP_EMAX + 1 + SP_EBIAS; } +/* 64 bit right shift with rounding */ +#define XSPSRS64(v, rs) \ + (((rs) >= 64) ? ((v) != 0) : ((v) >> (rs)) | ((v) << (64-(rs)) != 0)) + /* 3bit extended single precision sticky right shift */ #define XSPSRS(v, rs) \ ((rs > (SP_FBITS+3))?1:((v) >> (rs)) | ((v) << (32-(rs)) != 0)) diff --git a/arch/mips/math-emu/sp_maddf.c b/arch/mips/math-emu/sp_maddf.c index 07f5a9bb1312..7195fe785d81 100644 --- a/arch/mips/math-emu/sp_maddf.c +++ b/arch/mips/math-emu/sp_maddf.c @@ -21,14 +21,8 @@ static union ieee754sp _sp_maddf(union ieee754sp z, union ieee754sp x, int re; int rs; unsigned rm; - unsigned short lxm; - unsigned short hxm; - unsigned short lym; - unsigned short hym; - unsigned lrm; - unsigned hrm; - unsigned t; - unsigned at; + uint64_t rm64; + uint64_t zm64; int s; COMPXSP; @@ -170,108 +164,90 @@ static union ieee754sp _sp_maddf(union ieee754sp z, union ieee754sp x, if (flags & MADDF_NEGATE_PRODUCT) rs ^= 1; - /* shunt to top of word */ - xm <<= 32 - (SP_FBITS + 1); - ym <<= 32 - (SP_FBITS + 1); + /* Multiple 24 bit xm and ym to give 48 bit results */ + rm64 = (uint64_t)xm * ym; - /* - * Multiply 32 bits xm, ym to give high 32 bits rm with stickness. - */ - lxm = xm & 0xffff; - hxm = xm >> 16; - lym = ym & 0xffff; - hym = ym >> 16; + /* Shunt to top of word */ + rm64 = rm64 << 16; - lrm = lxm * lym; /* 16 * 16 => 32 */ - hrm = hxm * hym; /* 16 * 16 => 32 */ - - t = lxm * hym; /* 16 * 16 => 32 */ - at = lrm + (t << 16); - hrm += at < lrm; - lrm = at; - hrm = hrm + (t >> 16); - - t = hxm * lym; /* 16 * 16 => 32 */ - at = lrm + (t << 16); - hrm += at < lrm; - lrm = at; - hrm = hrm + (t >> 16); - - rm = hrm | (lrm != 0); - - /* - * Sticky shift down to normal rounding precision. - */ - if ((int) rm < 0) { - rm = (rm >> (32 - (SP_FBITS + 1 + 3))) | - ((rm << (SP_FBITS + 1 + 3)) != 0); + /* Put explicit bit at bit 62 if necessary */ + if ((int64_t) rm64 < 0) { + rm64 = rm64 >> 1; re++; - } else { - rm = (rm >> (32 - (SP_FBITS + 1 + 3 + 1))) | - ((rm << (SP_FBITS + 1 + 3 + 1)) != 0); } - assert(rm & (SP_HIDDEN_BIT << 3)); - if (zc == IEEE754_CLASS_ZERO) + assert(rm64 & (1 << 62)); + + if (zc == IEEE754_CLASS_ZERO) { + /* + * Move explicit bit from bit 62 to bit 26 since the + * ieee754sp_format code expects the mantissa to be + * 27 bits wide (24 + 3 rounding bits). + */ + rm = XSPSRS64(rm64, (62 - 26)); return ieee754sp_format(rs, re, rm); + } - /* And now the addition */ - - assert(zm & SP_HIDDEN_BIT); - - /* - * Provide guard,round and stick bit space. - */ - zm <<= 3; + /* Move explicit bit from bit 23 to bit 62 */ + zm64 = (uint64_t)zm << (62 - 23); + assert(zm64 & (1 << 62)); + /* Make the exponents the same */ if (ze > re) { /* * Have to shift r fraction right to align. */ s = ze - re; - rm = XSPSRS(rm, s); + rm64 = XSPSRS64(rm64, s); re += s; } else if (re > ze) { /* * Have to shift z fraction right to align. */ s = re - ze; - zm = XSPSRS(zm, s); + zm64 = XSPSRS64(zm64, s); ze += s; } assert(ze == re); assert(ze <= SP_EMAX); + /* Do the addition */ if (zs == rs) { /* - * Generate 28 bit result of adding two 27 bit numbers - * leaving result in zm, zs and ze. + * Generate 64 bit result by adding two 63 bit numbers + * leaving result in zm64, zs and ze. */ - zm = zm + rm; - - if (zm >> (SP_FBITS + 1 + 3)) { /* carry out */ - zm = XSPSRS1(zm); + zm64 = zm64 + rm64; + if ((int64_t)zm64 < 0) { /* carry out */ + zm64 = XSPSRS1(zm64); ze++; } } else { - if (zm >= rm) { - zm = zm - rm; + if (zm64 >= rm64) { + zm64 = zm64 - rm64; } else { - zm = rm - zm; + zm64 = rm64 - zm64; zs = rs; } - if (zm == 0) + if (zm64 == 0) return ieee754sp_zero(ieee754_csr.rm == FPU_CSR_RD); /* - * Normalize in extended single precision + * Put explicit bit at bit 62 if necessary. */ - while ((zm >> (SP_MBITS + 3)) == 0) { - zm <<= 1; + while ((zm64 >> 62) == 0) { + zm64 <<= 1; ze--; } - } + + /* + * Move explicit bit from bit 62 to bit 26 since the + * ieee754sp_format code expects the mantissa to be + * 27 bits wide (24 + 3 rounding bits). + */ + zm = XSPSRS64(zm64, (62 - 26)); + return ieee754sp_format(zs, ze, zm); } From 1f143ba19a8f1956735195feffbbecfa0c7db597 Mon Sep 17 00:00:00 2001 From: Douglas Leung Date: Thu, 27 Jul 2017 18:08:59 +0200 Subject: [PATCH 029/555] MIPS: math-emu: .D: Fix accuracy (64-bit case) commit 2cfa58259f4b65b33ebe8f167019a1f89c6c3289 upstream. Implement fused multiply-add with correct accuracy. Fused multiply-add operation has better accuracy than respective sequential execution of multiply and add operations applied on the same inputs. This is because accuracy errors accumulate in latter case. This patch implements fused multiply-add with the same accuracy as it is implemented in hardware, using 128-bit intermediate calculations. One test case example (raw bits) that this patch fixes: MADDF.D fd,fs,ft: fd = 0x00000ca000000000 fs = ft = 0x3f40624dd2f1a9fc Fixes: e24c3bec3e8e ("MIPS: math-emu: Add support for the MIPS R6 MADDF FPU instruction") Fixes: 83d43305a1df ("MIPS: math-emu: Add support for the MIPS R6 MSUBF FPU instruction") Signed-off-by: Douglas Leung Signed-off-by: Miodrag Dinic Signed-off-by: Goran Ferenc Signed-off-by: Aleksandar Markovic Cc: Douglas Leung Cc: Bo Hu Cc: James Hogan Cc: Jin Qian Cc: Paul Burton Cc: Petar Jovanovic Cc: Raghu Gandham Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/16891/ Signed-off-by: Ralf Baechle Signed-off-by: Greg Kroah-Hartman --- arch/mips/math-emu/dp_maddf.c | 133 ++++++++++++++++++++++++---------- 1 file changed, 94 insertions(+), 39 deletions(-) diff --git a/arch/mips/math-emu/dp_maddf.c b/arch/mips/math-emu/dp_maddf.c index e799fc826b0c..e0d9be5fbf4c 100644 --- a/arch/mips/math-emu/dp_maddf.c +++ b/arch/mips/math-emu/dp_maddf.c @@ -15,18 +15,44 @@ #include "ieee754dp.h" +/* 128 bits shift right logical with rounding. */ +void srl128(u64 *hptr, u64 *lptr, int count) +{ + u64 low; + + if (count >= 128) { + *lptr = *hptr != 0 || *lptr != 0; + *hptr = 0; + } else if (count >= 64) { + if (count == 64) { + *lptr = *hptr | (*lptr != 0); + } else { + low = *lptr; + *lptr = *hptr >> (count - 64); + *lptr |= (*hptr << (128 - count)) != 0 || low != 0; + } + *hptr = 0; + } else { + low = *lptr; + *lptr = low >> count | *hptr << (64 - count); + *lptr |= (low << (64 - count)) != 0; + *hptr = *hptr >> count; + } +} + static union ieee754dp _dp_maddf(union ieee754dp z, union ieee754dp x, union ieee754dp y, enum maddf_flags flags) { int re; int rs; - u64 rm; unsigned lxm; unsigned hxm; unsigned lym; unsigned hym; u64 lrm; u64 hrm; + u64 lzm; + u64 hzm; u64 t; u64 at; int s; @@ -172,7 +198,7 @@ static union ieee754dp _dp_maddf(union ieee754dp z, union ieee754dp x, ym <<= 64 - (DP_FBITS + 1); /* - * Multiply 64 bits xm, ym to give high 64 bits rm with stickness. + * Multiply 64 bits xm and ym to give 128 bits result in hrm:lrm. */ /* 32 * 32 => 64 */ @@ -202,81 +228,110 @@ static union ieee754dp _dp_maddf(union ieee754dp z, union ieee754dp x, hrm = hrm + (t >> 32); - rm = hrm | (lrm != 0); - - /* - * Sticky shift down to normal rounding precision. - */ - if ((s64) rm < 0) { - rm = (rm >> (64 - (DP_FBITS + 1 + 3))) | - ((rm << (DP_FBITS + 1 + 3)) != 0); + /* Put explicit bit at bit 126 if necessary */ + if ((int64_t)hrm < 0) { + lrm = (hrm << 63) | (lrm >> 1); + hrm = hrm >> 1; re++; - } else { - rm = (rm >> (64 - (DP_FBITS + 1 + 3 + 1))) | - ((rm << (DP_FBITS + 1 + 3 + 1)) != 0); } - assert(rm & (DP_HIDDEN_BIT << 3)); - if (zc == IEEE754_CLASS_ZERO) - return ieee754dp_format(rs, re, rm); + assert(hrm & (1 << 62)); - /* And now the addition */ - assert(zm & DP_HIDDEN_BIT); + if (zc == IEEE754_CLASS_ZERO) { + /* + * Move explicit bit from bit 126 to bit 55 since the + * ieee754dp_format code expects the mantissa to be + * 56 bits wide (53 + 3 rounding bits). + */ + srl128(&hrm, &lrm, (126 - 55)); + return ieee754dp_format(rs, re, lrm); + } - /* - * Provide guard,round and stick bit space. - */ - zm <<= 3; + /* Move explicit bit from bit 52 to bit 126 */ + lzm = 0; + hzm = zm << 10; + assert(hzm & (1 << 62)); + /* Make the exponents the same */ if (ze > re) { /* * Have to shift y fraction right to align. */ s = ze - re; - rm = XDPSRS(rm, s); + srl128(&hrm, &lrm, s); re += s; } else if (re > ze) { /* * Have to shift x fraction right to align. */ s = re - ze; - zm = XDPSRS(zm, s); + srl128(&hzm, &lzm, s); ze += s; } assert(ze == re); assert(ze <= DP_EMAX); + /* Do the addition */ if (zs == rs) { /* - * Generate 28 bit result of adding two 27 bit numbers - * leaving result in xm, xs and xe. + * Generate 128 bit result by adding two 127 bit numbers + * leaving result in hzm:lzm, zs and ze. */ - zm = zm + rm; - - if (zm >> (DP_FBITS + 1 + 3)) { /* carry out */ - zm = XDPSRS1(zm); + hzm = hzm + hrm + (lzm > (lzm + lrm)); + lzm = lzm + lrm; + if ((int64_t)hzm < 0) { /* carry out */ + srl128(&hzm, &lzm, 1); ze++; } } else { - if (zm >= rm) { - zm = zm - rm; + if (hzm > hrm || (hzm == hrm && lzm >= lrm)) { + hzm = hzm - hrm - (lzm < lrm); + lzm = lzm - lrm; } else { - zm = rm - zm; + hzm = hrm - hzm - (lrm < lzm); + lzm = lrm - lzm; zs = rs; } - if (zm == 0) + if (lzm == 0 && hzm == 0) return ieee754dp_zero(ieee754_csr.rm == FPU_CSR_RD); /* - * Normalize to rounding precision. + * Put explicit bit at bit 126 if necessary. */ - while ((zm >> (DP_FBITS + 3)) == 0) { - zm <<= 1; - ze--; + if (hzm == 0) { + /* left shift by 63 or 64 bits */ + if ((int64_t)lzm < 0) { + /* MSB of lzm is the explicit bit */ + hzm = lzm >> 1; + lzm = lzm << 63; + ze -= 63; + } else { + hzm = lzm; + lzm = 0; + ze -= 64; + } + } + + t = 0; + while ((hzm >> (62 - t)) == 0) + t++; + + assert(t <= 62); + if (t) { + hzm = hzm << t | lzm >> (64 - t); + lzm = lzm << t; + ze -= t; } } - return ieee754dp_format(zs, ze, zm); + /* + * Move explicit bit from bit 126 to bit 55 since the + * ieee754dp_format code expects the mantissa to be + * 56 bits wide (53 + 3 rounding bits). + */ + srl128(&hzm, &lzm, (126 - 55)); + + return ieee754dp_format(zs, ze, lzm); } union ieee754dp ieee754dp_maddf(union ieee754dp z, union ieee754dp x, From dcb3a4b8d7768cab59546aa2ce5dd214f7b526fe Mon Sep 17 00:00:00 2001 From: Gary R Hook Date: Tue, 25 Jul 2017 14:12:11 -0500 Subject: [PATCH 030/555] crypto: ccp - Fix XTS-AES-128 support on v5 CCPs commit e652399edba99a5497f0d80f240c9075d3b43493 upstream. Version 5 CCPs have some new requirements for XTS-AES: the type field must be specified, and the key requires 512 bits, with each part occupying 256 bits and padded with zeroes. Signed-off-by: Gary R Hook Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- drivers/crypto/ccp/ccp-crypto-aes-xts.c | 4 ++- drivers/crypto/ccp/ccp-dev-v5.c | 2 ++ drivers/crypto/ccp/ccp-dev.h | 2 ++ drivers/crypto/ccp/ccp-ops.c | 43 +++++++++++++++++++------ include/linux/ccp.h | 3 +- 5 files changed, 43 insertions(+), 11 deletions(-) diff --git a/drivers/crypto/ccp/ccp-crypto-aes-xts.c b/drivers/crypto/ccp/ccp-crypto-aes-xts.c index 58a4244b4752..3f26a415ef44 100644 --- a/drivers/crypto/ccp/ccp-crypto-aes-xts.c +++ b/drivers/crypto/ccp/ccp-crypto-aes-xts.c @@ -1,8 +1,9 @@ /* * AMD Cryptographic Coprocessor (CCP) AES XTS crypto API support * - * Copyright (C) 2013 Advanced Micro Devices, Inc. + * Copyright (C) 2013,2017 Advanced Micro Devices, Inc. * + * Author: Gary R Hook * Author: Tom Lendacky * * This program is free software; you can redistribute it and/or modify @@ -164,6 +165,7 @@ static int ccp_aes_xts_crypt(struct ablkcipher_request *req, memset(&rctx->cmd, 0, sizeof(rctx->cmd)); INIT_LIST_HEAD(&rctx->cmd.entry); rctx->cmd.engine = CCP_ENGINE_XTS_AES_128; + rctx->cmd.u.xts.type = CCP_AES_TYPE_128; rctx->cmd.u.xts.action = (encrypt) ? CCP_AES_ACTION_ENCRYPT : CCP_AES_ACTION_DECRYPT; rctx->cmd.u.xts.unit_size = unit_size; diff --git a/drivers/crypto/ccp/ccp-dev-v5.c b/drivers/crypto/ccp/ccp-dev-v5.c index 2c0ce5f605b3..17b19a68e269 100644 --- a/drivers/crypto/ccp/ccp-dev-v5.c +++ b/drivers/crypto/ccp/ccp-dev-v5.c @@ -131,6 +131,7 @@ union ccp_function { #define CCP_AES_MODE(p) ((p)->aes.mode) #define CCP_AES_TYPE(p) ((p)->aes.type) #define CCP_XTS_SIZE(p) ((p)->aes_xts.size) +#define CCP_XTS_TYPE(p) ((p)->aes_xts.type) #define CCP_XTS_ENCRYPT(p) ((p)->aes_xts.encrypt) #define CCP_SHA_TYPE(p) ((p)->sha.type) #define CCP_RSA_SIZE(p) ((p)->rsa.size) @@ -318,6 +319,7 @@ static int ccp5_perform_xts_aes(struct ccp_op *op) CCP5_CMD_PROT(&desc) = 0; function.raw = 0; + CCP_XTS_TYPE(&function) = op->u.xts.type; CCP_XTS_ENCRYPT(&function) = op->u.xts.action; CCP_XTS_SIZE(&function) = op->u.xts.unit_size; CCP5_CMD_FUNCTION(&desc) = function.raw; diff --git a/drivers/crypto/ccp/ccp-dev.h b/drivers/crypto/ccp/ccp-dev.h index 8ac7ae17e1f4..e23c36c7691c 100644 --- a/drivers/crypto/ccp/ccp-dev.h +++ b/drivers/crypto/ccp/ccp-dev.h @@ -187,6 +187,7 @@ #define CCP_AES_CTX_SB_COUNT 1 #define CCP_XTS_AES_KEY_SB_COUNT 1 +#define CCP5_XTS_AES_KEY_SB_COUNT 2 #define CCP_XTS_AES_CTX_SB_COUNT 1 #define CCP_SHA_SB_COUNT 1 @@ -472,6 +473,7 @@ struct ccp_aes_op { }; struct ccp_xts_aes_op { + enum ccp_aes_type type; enum ccp_aes_action action; enum ccp_xts_aes_unit_size unit_size; }; diff --git a/drivers/crypto/ccp/ccp-ops.c b/drivers/crypto/ccp/ccp-ops.c index 50fae4442801..64deb006c3be 100644 --- a/drivers/crypto/ccp/ccp-ops.c +++ b/drivers/crypto/ccp/ccp-ops.c @@ -779,6 +779,8 @@ static int ccp_run_xts_aes_cmd(struct ccp_cmd_queue *cmd_q, struct ccp_op op; unsigned int unit_size, dm_offset; bool in_place = false; + unsigned int sb_count; + enum ccp_aes_type aestype; int ret; switch (xts->unit_size) { @@ -802,7 +804,9 @@ static int ccp_run_xts_aes_cmd(struct ccp_cmd_queue *cmd_q, return -EINVAL; } - if (xts->key_len != AES_KEYSIZE_128) + if (xts->key_len == AES_KEYSIZE_128) + aestype = CCP_AES_TYPE_128; + else return -EINVAL; if (!xts->final && (xts->src_len & (AES_BLOCK_SIZE - 1))) @@ -824,23 +828,44 @@ static int ccp_run_xts_aes_cmd(struct ccp_cmd_queue *cmd_q, op.sb_key = cmd_q->sb_key; op.sb_ctx = cmd_q->sb_ctx; op.init = 1; + op.u.xts.type = aestype; op.u.xts.action = xts->action; op.u.xts.unit_size = xts->unit_size; - /* All supported key sizes fit in a single (32-byte) SB entry - * and must be in little endian format. Use the 256-bit byte - * swap passthru option to convert from big endian to little - * endian. + /* A version 3 device only supports 128-bit keys, which fits into a + * single SB entry. A version 5 device uses a 512-bit vector, so two + * SB entries. */ + if (cmd_q->ccp->vdata->version == CCP_VERSION(3, 0)) + sb_count = CCP_XTS_AES_KEY_SB_COUNT; + else + sb_count = CCP5_XTS_AES_KEY_SB_COUNT; ret = ccp_init_dm_workarea(&key, cmd_q, - CCP_XTS_AES_KEY_SB_COUNT * CCP_SB_BYTES, + sb_count * CCP_SB_BYTES, DMA_TO_DEVICE); if (ret) return ret; - dm_offset = CCP_SB_BYTES - AES_KEYSIZE_128; - ccp_set_dm_area(&key, dm_offset, xts->key, 0, xts->key_len); - ccp_set_dm_area(&key, 0, xts->key, dm_offset, xts->key_len); + if (cmd_q->ccp->vdata->version == CCP_VERSION(3, 0)) { + /* All supported key sizes must be in little endian format. + * Use the 256-bit byte swap passthru option to convert from + * big endian to little endian. + */ + dm_offset = CCP_SB_BYTES - AES_KEYSIZE_128; + ccp_set_dm_area(&key, dm_offset, xts->key, 0, xts->key_len); + ccp_set_dm_area(&key, 0, xts->key, xts->key_len, xts->key_len); + } else { + /* Version 5 CCPs use a 512-bit space for the key: each portion + * occupies 256 bits, or one entire slot, and is zero-padded. + */ + unsigned int pad; + + dm_offset = CCP_SB_BYTES; + pad = dm_offset - xts->key_len; + ccp_set_dm_area(&key, pad, xts->key, 0, xts->key_len); + ccp_set_dm_area(&key, dm_offset + pad, xts->key, xts->key_len, + xts->key_len); + } ret = ccp_copy_to_sb(cmd_q, &key, op.jobid, op.sb_key, CCP_PASSTHRU_BYTESWAP_256BIT); if (ret) { diff --git a/include/linux/ccp.h b/include/linux/ccp.h index edc5d04b9632..1cfe5ef3060b 100644 --- a/include/linux/ccp.h +++ b/include/linux/ccp.h @@ -1,7 +1,7 @@ /* * AMD Cryptographic Coprocessor (CCP) driver * - * Copyright (C) 2013,2016 Advanced Micro Devices, Inc. + * Copyright (C) 2013,2017 Advanced Micro Devices, Inc. * * Author: Tom Lendacky * Author: Gary R Hook @@ -222,6 +222,7 @@ enum ccp_xts_aes_unit_size { * AES operation the new IV overwrites the old IV. */ struct ccp_xts_aes_engine { + enum ccp_aes_type type; enum ccp_aes_action action; enum ccp_xts_aes_unit_size unit_size; From e684db9a7cea3f2cca6545b456e1bcaea836f994 Mon Sep 17 00:00:00 2001 From: Stephan Mueller Date: Thu, 21 Sep 2017 10:16:53 +0200 Subject: [PATCH 031/555] crypto: AF_ALG - remove SGL terminator indicator when chaining Fixed differently upstream as commit 2d97591ef43d ("crypto: af_alg - consolidation of duplicate code") The SGL is MAX_SGL_ENTS + 1 in size. The last SG entry is used for the chaining and is properly updated with the sg_chain invocation. During the filling-in of the initial SG entries, sg_mark_end is called for each SG entry. This is appropriate as long as no additional SGL is chained with the current SGL. However, when a new SGL is chained and the last SG entry is updated with sg_chain, the last but one entry still contains the end marker from the sg_mark_end. This end marker must be removed as otherwise a walk of the chained SGLs will cause a NULL pointer dereference at the last but one SG entry, because sg_next will return NULL. The patch only applies to all kernels up to and including 4.13. The patch 2d97591ef43d0587be22ad1b0d758d6df4999a0b added to 4.14-rc1 introduced a complete new code base which addresses this bug in a different way. Yet, that patch is too invasive for stable kernels and was therefore not marked for stable. Fixes: 8ff590903d5fc ("crypto: algif_skcipher - User-space interface for skcipher operations") Signed-off-by: Stephan Mueller Acked-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- crypto/algif_skcipher.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c index 45af0fe00f33..aaf2f810d170 100644 --- a/crypto/algif_skcipher.c +++ b/crypto/algif_skcipher.c @@ -143,8 +143,10 @@ static int skcipher_alloc_sgl(struct sock *sk) sg_init_table(sgl->sg, MAX_SGL_ENTS + 1); sgl->cur = 0; - if (sg) + if (sg) { sg_chain(sg, MAX_SGL_ENTS + 1, sgl->sg); + sg_unmark_end(sg + (MAX_SGL_ENTS - 1)); + } list_add_tail(&sgl->list, &ctx->tsgl); } From 18d27cb70373896d618bb5dd9f124d7a994a0c3c Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Thu, 24 Aug 2017 15:19:39 -0400 Subject: [PATCH 032/555] ext4: fix incorrect quotaoff if the quota feature is enabled commit b0a5a9589decd07db755d6a8d9c0910d96ff7992 upstream. Current ext4 quota should always "usage enabled" if the quota feautre is enabled. But in ext4_orphan_cleanup(), it turn quotas off directly (used for the older journaled quota), so we cannot turn it on again via "quotaon" unless umount and remount ext4. Simple reproduce: mkfs.ext4 -O project,quota /dev/vdb1 mount -o prjquota /dev/vdb1 /mnt chattr -p 123 /mnt chattr +P /mnt touch /mnt/aa /mnt/bb exec 100<>/mnt/aa rm -f /mnt/aa sync echo c > /proc/sysrq-trigger #reboot and mount mount -o prjquota /dev/vdb1 /mnt #query status quotaon -Ppv /dev/vdb1 #output quotaon: Cannot find mountpoint for device /dev/vdb1 quotaon: No correct mountpoint specified. This patch add check for journaled quotas to avoid incorrect quotaoff when ext4 has quota feautre. Signed-off-by: zhangyi (F) Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/ext4/super.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5fa9ba1de429..3e0b4ff6e7e1 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2372,7 +2372,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, #ifdef CONFIG_QUOTA /* Needed for iput() to work correctly and not trash data */ sb->s_flags |= MS_ACTIVE; - /* Turn on quotas so that they are updated correctly */ + /* Turn on journaled quotas so that they are updated correctly */ for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (EXT4_SB(sb)->s_qf_names[i]) { int ret = ext4_quota_on_mount(sb, i); @@ -2438,9 +2438,9 @@ static void ext4_orphan_cleanup(struct super_block *sb, ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up", PLURAL(nr_truncates)); #ifdef CONFIG_QUOTA - /* Turn quotas off */ + /* Turn off journaled quotas if they were enabled for orphan cleanup */ for (i = 0; i < EXT4_MAXQUOTAS; i++) { - if (sb_dqopt(sb)->files[i]) + if (EXT4_SB(sb)->s_qf_names[i] && sb_dqopt(sb)->files[i]) dquot_quota_off(sb, i); } #endif From 3806cea5c1c509ef66f2cc8a183021ca4f871923 Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Thu, 24 Aug 2017 15:21:50 -0400 Subject: [PATCH 033/555] ext4: fix quota inconsistency during orphan cleanup for read-only mounts commit 95f1fda47c9d8738f858c3861add7bf0a36a7c0b upstream. Quota does not get enabled for read-only mounts if filesystem has quota feature, so that quotas cannot updated during orphan cleanup, which will lead to quota inconsistency. This patch turn on quotas during orphan cleanup for this case, make sure quotas can be updated correctly. Reported-by: Jan Kara Signed-off-by: zhangyi (F) Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/ext4/super.c | 38 +++++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3e0b4ff6e7e1..f72535e1898f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2334,6 +2334,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, unsigned int s_flags = sb->s_flags; int nr_orphans = 0, nr_truncates = 0; #ifdef CONFIG_QUOTA + int quota_update = 0; int i; #endif if (!es->s_last_orphan) { @@ -2372,14 +2373,32 @@ static void ext4_orphan_cleanup(struct super_block *sb, #ifdef CONFIG_QUOTA /* Needed for iput() to work correctly and not trash data */ sb->s_flags |= MS_ACTIVE; - /* Turn on journaled quotas so that they are updated correctly */ + + /* + * Turn on quotas which were not enabled for read-only mounts if + * filesystem has quota feature, so that they are updated correctly. + */ + if (ext4_has_feature_quota(sb) && (s_flags & MS_RDONLY)) { + int ret = ext4_enable_quotas(sb); + + if (!ret) + quota_update = 1; + else + ext4_msg(sb, KERN_ERR, + "Cannot turn on quotas: error %d", ret); + } + + /* Turn on journaled quotas used for old sytle */ for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (EXT4_SB(sb)->s_qf_names[i]) { int ret = ext4_quota_on_mount(sb, i); - if (ret < 0) + + if (!ret) + quota_update = 1; + else ext4_msg(sb, KERN_ERR, "Cannot turn on journaled " - "quota: error %d", ret); + "quota: type %d: error %d", i, ret); } } #endif @@ -2438,10 +2457,12 @@ static void ext4_orphan_cleanup(struct super_block *sb, ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up", PLURAL(nr_truncates)); #ifdef CONFIG_QUOTA - /* Turn off journaled quotas if they were enabled for orphan cleanup */ - for (i = 0; i < EXT4_MAXQUOTAS; i++) { - if (EXT4_SB(sb)->s_qf_names[i] && sb_dqopt(sb)->files[i]) - dquot_quota_off(sb, i); + /* Turn off quotas if they were enabled for orphan cleanup */ + if (quota_update) { + for (i = 0; i < EXT4_MAXQUOTAS; i++) { + if (sb_dqopt(sb)->files[i]) + dquot_quota_off(sb, i); + } } #endif sb->s_flags = s_flags; /* Restore MS_RDONLY status */ @@ -5365,6 +5386,9 @@ static int ext4_enable_quotas(struct super_block *sb) DQUOT_USAGE_ENABLED | (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0)); if (err) { + for (type--; type >= 0; type--) + dquot_quota_off(sb, type); + ext4_warning(sb, "Failed to enable quota tracking " "(type=%d, err=%d). Please run " From 48564b51ac75d81f3f3b584fab8c3be44c7248a8 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 24 Aug 2017 20:49:57 +1000 Subject: [PATCH 034/555] powerpc: Fix DAR reporting when alignment handler faults commit f9effe925039cf54489b5c04e0d40073bb3a123d upstream. Anton noticed that if we fault part way through emulating an unaligned instruction, we don't update the DAR to reflect that. The DAR value is eventually reported back to userspace as the address in the SEGV signal, and if userspace is using that value to demand fault then it can be confused by us not setting the value correctly. This patch is ugly as hell, but is intended to be the minimal fix and back ports easily. Signed-off-by: Michael Ellerman Reviewed-by: Paul Mackerras Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kernel/align.c | 119 ++++++++++++++++++++++-------------- 1 file changed, 74 insertions(+), 45 deletions(-) diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c index b2da7c8baed7..292458b694fb 100644 --- a/arch/powerpc/kernel/align.c +++ b/arch/powerpc/kernel/align.c @@ -235,6 +235,28 @@ static int emulate_dcbz(struct pt_regs *regs, unsigned char __user *addr) #define SWIZ_PTR(p) ((unsigned char __user *)((p) ^ swiz)) +#define __get_user_or_set_dar(_regs, _dest, _addr) \ + ({ \ + int rc = 0; \ + typeof(_addr) __addr = (_addr); \ + if (__get_user_inatomic(_dest, __addr)) { \ + _regs->dar = (unsigned long)__addr; \ + rc = -EFAULT; \ + } \ + rc; \ + }) + +#define __put_user_or_set_dar(_regs, _src, _addr) \ + ({ \ + int rc = 0; \ + typeof(_addr) __addr = (_addr); \ + if (__put_user_inatomic(_src, __addr)) { \ + _regs->dar = (unsigned long)__addr; \ + rc = -EFAULT; \ + } \ + rc; \ + }) + static int emulate_multiple(struct pt_regs *regs, unsigned char __user *addr, unsigned int reg, unsigned int nb, unsigned int flags, unsigned int instr, @@ -263,9 +285,10 @@ static int emulate_multiple(struct pt_regs *regs, unsigned char __user *addr, } else { unsigned long pc = regs->nip ^ (swiz & 4); - if (__get_user_inatomic(instr, - (unsigned int __user *)pc)) + if (__get_user_or_set_dar(regs, instr, + (unsigned int __user *)pc)) return -EFAULT; + if (swiz == 0 && (flags & SW)) instr = cpu_to_le32(instr); nb = (instr >> 11) & 0x1f; @@ -309,31 +332,31 @@ static int emulate_multiple(struct pt_regs *regs, unsigned char __user *addr, ((nb0 + 3) / 4) * sizeof(unsigned long)); for (i = 0; i < nb; ++i, ++p) - if (__get_user_inatomic(REG_BYTE(rptr, i ^ bswiz), - SWIZ_PTR(p))) + if (__get_user_or_set_dar(regs, REG_BYTE(rptr, i ^ bswiz), + SWIZ_PTR(p))) return -EFAULT; if (nb0 > 0) { rptr = ®s->gpr[0]; addr += nb; for (i = 0; i < nb0; ++i, ++p) - if (__get_user_inatomic(REG_BYTE(rptr, - i ^ bswiz), - SWIZ_PTR(p))) + if (__get_user_or_set_dar(regs, + REG_BYTE(rptr, i ^ bswiz), + SWIZ_PTR(p))) return -EFAULT; } } else { for (i = 0; i < nb; ++i, ++p) - if (__put_user_inatomic(REG_BYTE(rptr, i ^ bswiz), - SWIZ_PTR(p))) + if (__put_user_or_set_dar(regs, REG_BYTE(rptr, i ^ bswiz), + SWIZ_PTR(p))) return -EFAULT; if (nb0 > 0) { rptr = ®s->gpr[0]; addr += nb; for (i = 0; i < nb0; ++i, ++p) - if (__put_user_inatomic(REG_BYTE(rptr, - i ^ bswiz), - SWIZ_PTR(p))) + if (__put_user_or_set_dar(regs, + REG_BYTE(rptr, i ^ bswiz), + SWIZ_PTR(p))) return -EFAULT; } } @@ -345,29 +368,32 @@ static int emulate_multiple(struct pt_regs *regs, unsigned char __user *addr, * Only POWER6 has these instructions, and it does true little-endian, * so we don't need the address swizzling. */ -static int emulate_fp_pair(unsigned char __user *addr, unsigned int reg, - unsigned int flags) +static int emulate_fp_pair(struct pt_regs *regs, unsigned char __user *addr, + unsigned int reg, unsigned int flags) { char *ptr0 = (char *) ¤t->thread.TS_FPR(reg); char *ptr1 = (char *) ¤t->thread.TS_FPR(reg+1); - int i, ret, sw = 0; + int i, sw = 0; if (reg & 1) return 0; /* invalid form: FRS/FRT must be even */ if (flags & SW) sw = 7; - ret = 0; + for (i = 0; i < 8; ++i) { if (!(flags & ST)) { - ret |= __get_user(ptr0[i^sw], addr + i); - ret |= __get_user(ptr1[i^sw], addr + i + 8); + if (__get_user_or_set_dar(regs, ptr0[i^sw], addr + i)) + return -EFAULT; + if (__get_user_or_set_dar(regs, ptr1[i^sw], addr + i + 8)) + return -EFAULT; } else { - ret |= __put_user(ptr0[i^sw], addr + i); - ret |= __put_user(ptr1[i^sw], addr + i + 8); + if (__put_user_or_set_dar(regs, ptr0[i^sw], addr + i)) + return -EFAULT; + if (__put_user_or_set_dar(regs, ptr1[i^sw], addr + i + 8)) + return -EFAULT; } } - if (ret) - return -EFAULT; + return 1; /* exception handled and fixed up */ } @@ -377,24 +403,27 @@ static int emulate_lq_stq(struct pt_regs *regs, unsigned char __user *addr, { char *ptr0 = (char *)®s->gpr[reg]; char *ptr1 = (char *)®s->gpr[reg+1]; - int i, ret, sw = 0; + int i, sw = 0; if (reg & 1) return 0; /* invalid form: GPR must be even */ if (flags & SW) sw = 7; - ret = 0; + for (i = 0; i < 8; ++i) { if (!(flags & ST)) { - ret |= __get_user(ptr0[i^sw], addr + i); - ret |= __get_user(ptr1[i^sw], addr + i + 8); + if (__get_user_or_set_dar(regs, ptr0[i^sw], addr + i)) + return -EFAULT; + if (__get_user_or_set_dar(regs, ptr1[i^sw], addr + i + 8)) + return -EFAULT; } else { - ret |= __put_user(ptr0[i^sw], addr + i); - ret |= __put_user(ptr1[i^sw], addr + i + 8); + if (__put_user_or_set_dar(regs, ptr0[i^sw], addr + i)) + return -EFAULT; + if (__put_user_or_set_dar(regs, ptr1[i^sw], addr + i + 8)) + return -EFAULT; } } - if (ret) - return -EFAULT; + return 1; /* exception handled and fixed up */ } #endif /* CONFIG_PPC64 */ @@ -687,9 +716,14 @@ static int emulate_vsx(unsigned char __user *addr, unsigned int reg, for (j = 0; j < length; j += elsize) { for (i = 0; i < elsize; ++i) { if (flags & ST) - ret |= __put_user(ptr[i^sw], addr + i); + ret = __put_user_or_set_dar(regs, ptr[i^sw], + addr + i); else - ret |= __get_user(ptr[i^sw], addr + i); + ret = __get_user_or_set_dar(regs, ptr[i^sw], + addr + i); + + if (ret) + return ret; } ptr += elsize; #ifdef __LITTLE_ENDIAN__ @@ -739,7 +773,7 @@ int fix_alignment(struct pt_regs *regs) unsigned int dsisr; unsigned char __user *addr; unsigned long p, swiz; - int ret, i; + int i; union data { u64 ll; double dd; @@ -936,7 +970,7 @@ int fix_alignment(struct pt_regs *regs) if (flags & F) { /* Special case for 16-byte FP loads and stores */ PPC_WARN_ALIGNMENT(fp_pair, regs); - return emulate_fp_pair(addr, reg, flags); + return emulate_fp_pair(regs, addr, reg, flags); } else { #ifdef CONFIG_PPC64 /* Special case for 16-byte loads and stores */ @@ -966,15 +1000,12 @@ int fix_alignment(struct pt_regs *regs) } data.ll = 0; - ret = 0; p = (unsigned long)addr; for (i = 0; i < nb; i++) - ret |= __get_user_inatomic(data.v[start + i], - SWIZ_PTR(p++)); - - if (unlikely(ret)) - return -EFAULT; + if (__get_user_or_set_dar(regs, data.v[start + i], + SWIZ_PTR(p++))) + return -EFAULT; } else if (flags & F) { data.ll = current->thread.TS_FPR(reg); @@ -1046,15 +1077,13 @@ int fix_alignment(struct pt_regs *regs) break; } - ret = 0; p = (unsigned long)addr; for (i = 0; i < nb; i++) - ret |= __put_user_inatomic(data.v[start + i], - SWIZ_PTR(p++)); + if (__put_user_or_set_dar(regs, data.v[start + i], + SWIZ_PTR(p++))) + return -EFAULT; - if (unlikely(ret)) - return -EFAULT; } else if (flags & F) current->thread.TS_FPR(reg) = data.ll; else From 120ec1e4cdddfc16c31581c2c30511c6c16fe0fd Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:12:44 -0700 Subject: [PATCH 035/555] block: Relax a check in blk_start_queue() commit 4ddd56b003f251091a67c15ae3fe4a5c5c5e390a upstream. Calling blk_start_queue() from interrupt context with the queue lock held and without disabling IRQs, as the skd driver does, is safe. This patch avoids that loading the skd driver triggers the following warning: WARNING: CPU: 11 PID: 1348 at block/blk-core.c:283 blk_start_queue+0x84/0xa0 RIP: 0010:blk_start_queue+0x84/0xa0 Call Trace: skd_unquiesce_dev+0x12a/0x1d0 [skd] skd_complete_internal+0x1e7/0x5a0 [skd] skd_complete_other+0xc2/0xd0 [skd] skd_isr_completion_posted.isra.30+0x2a5/0x470 [skd] skd_isr+0x14f/0x180 [skd] irq_forced_thread_fn+0x2a/0x70 irq_thread+0x144/0x1a0 kthread+0x125/0x140 ret_from_fork+0x2a/0x40 Fixes: commit a038e2536472 ("[PATCH] blk_start_queue() must be called with irq disabled - add warning") Signed-off-by: Bart Van Assche Cc: Paolo 'Blaisorblade' Giarrusso Cc: Andrew Morton Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- block/blk-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-core.c b/block/blk-core.c index d1f2801ce836..95379fc83805 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -233,7 +233,7 @@ EXPORT_SYMBOL(blk_start_queue_async); **/ void blk_start_queue(struct request_queue *q) { - WARN_ON(!irqs_disabled()); + WARN_ON(!in_interrupt() && !irqs_disabled()); queue_flag_clear(QUEUE_FLAG_STOPPED, q); __blk_run_queue(q); From 2cee78081b97366c42bb6349be2a57f89c36df5a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 31 Aug 2017 10:23:25 +1000 Subject: [PATCH 036/555] md/bitmap: disable bitmap_resize for file-backed bitmaps. commit e8a27f836f165c26f867ece7f31eb5c811692319 upstream. bitmap_resize() does not work for file-backed bitmaps. The buffer_heads are allocated and initialized when the bitmap is read from the file, but resize doesn't read from the file, it loads from the internal bitmap. When it comes time to write the new bitmap, the bh is non-existent and we crash. The common case when growing an array involves making the array larger, and that normally means making the bitmap larger. Doing that inside the kernel is possible, but would need more code. It is probably easier to require people who use file-backed bitmaps to remove them and re-add after a reshape. So this patch disables the resizing of arrays which have file-backed bitmaps. This is better than crashing. Reported-by: Zhilong Liu Fixes: d60b479d177a ("md/bitmap: add bitmap_resize function to allow bitmap resizing.") Signed-off-by: NeilBrown Signed-off-by: Shaohua Li Signed-off-by: Greg Kroah-Hartman --- drivers/md/bitmap.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 2d826927a3bf..fb02c3979bf4 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1992,6 +1992,11 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, long pages; struct bitmap_page *new_bp; + if (bitmap->storage.file && !init) { + pr_info("md: cannot resize file-based bitmap\n"); + return -EINVAL; + } + if (chunksize == 0) { /* If there is enough space, leave the chunk size unchanged, * else increase by factor of two until there is enough space. From cb1441bca9bf23c6cf715d16843f71f8e516993d Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:12:45 -0700 Subject: [PATCH 037/555] skd: Avoid that module unloading triggers a use-after-free commit 7277cc67b3916eed47558c64f9c9c0de00a35cda upstream. Since put_disk() triggers a disk_release() call and since that last function calls blk_put_queue() if disk->queue != NULL, clear the disk->queue pointer before calling put_disk(). This avoids that unloading the skd kernel module triggers the following use-after-free: WARNING: CPU: 8 PID: 297 at lib/refcount.c:128 refcount_sub_and_test+0x70/0x80 refcount_t: underflow; use-after-free. CPU: 8 PID: 297 Comm: kworker/8:1 Not tainted 4.11.10-300.fc26.x86_64 #1 Workqueue: events work_for_cpu_fn Call Trace: dump_stack+0x63/0x84 __warn+0xcb/0xf0 warn_slowpath_fmt+0x5a/0x80 refcount_sub_and_test+0x70/0x80 refcount_dec_and_test+0x11/0x20 kobject_put+0x1f/0x50 blk_put_queue+0x15/0x20 disk_release+0xae/0xf0 device_release+0x32/0x90 kobject_release+0x67/0x170 kobject_put+0x2b/0x50 put_disk+0x17/0x20 skd_destruct+0x5c/0x890 [skd] skd_pci_probe+0x124d/0x13a0 [skd] local_pci_probe+0x42/0xa0 work_for_cpu_fn+0x14/0x20 process_one_work+0x19e/0x470 worker_thread+0x1dc/0x4a0 kthread+0x125/0x140 ret_from_fork+0x25/0x30 Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- drivers/block/skd_main.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 3822eae102db..54ffd27b2620 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -4622,15 +4622,16 @@ static void skd_free_disk(struct skd_device *skdev) { struct gendisk *disk = skdev->disk; - if (disk != NULL) { - struct request_queue *q = disk->queue; + if (disk && (disk->flags & GENHD_FL_UP)) + del_gendisk(disk); - if (disk->flags & GENHD_FL_UP) - del_gendisk(disk); - if (q) - blk_cleanup_queue(q); - put_disk(disk); + if (skdev->queue) { + blk_cleanup_queue(skdev->queue); + skdev->queue = NULL; + disk->queue = NULL; } + + put_disk(disk); skdev->disk = NULL; } From 63e606bd9551852fff026c92af4026a6e3b0d245 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:12:46 -0700 Subject: [PATCH 038/555] skd: Submit requests to firmware before triggering the doorbell commit 5fbd545cd3fd311ea1d6e8be4cedddd0ee5684c7 upstream. Ensure that the members of struct skd_msg_buf have been transferred to the PCIe adapter before the doorbell is triggered. This patch avoids that I/O fails sporadically and that the following error message is reported: (skd0:STM000196603:[0000:00:09.0]): Completion mismatch comp_id=0x0000 skreq=0x0400 new=0x0000 Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- drivers/block/skd_main.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 54ffd27b2620..6f78cea75103 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -2163,6 +2163,9 @@ static void skd_send_fitmsg(struct skd_device *skdev, */ qcmd |= FIT_QCMD_MSGSIZE_64; + /* Make sure skd_msg_buf is written before the doorbell is triggered. */ + smp_wmb(); + SKD_WRITEQ(skdev, qcmd, FIT_Q_COMMAND); } @@ -2209,6 +2212,9 @@ static void skd_send_special_fitmsg(struct skd_device *skdev, qcmd = skspcl->mb_dma_address; qcmd |= FIT_QCMD_QID_NORMAL + FIT_QCMD_MSGSIZE_128; + /* Make sure skd_msg_buf is written before the doorbell is triggered. */ + smp_wmb(); + SKD_WRITEQ(skdev, qcmd, FIT_Q_COMMAND); } From 83245cd18775adaedc3badcf1ff9ba9e91cbd3ab Mon Sep 17 00:00:00 2001 From: Steffen Maier Date: Fri, 28 Jul 2017 12:30:51 +0200 Subject: [PATCH 039/555] scsi: zfcp: fix queuecommand for scsi_eh commands when DIX enabled commit 71b8e45da51a7b64a23378221c0a5868bd79da4f upstream. Since commit db007fc5e20c ("[SCSI] Command protection operation"), scsi_eh_prep_cmnd() saves scmd->prot_op and temporarily resets it to SCSI_PROT_NORMAL. Other FCP LLDDs such as qla2xxx and lpfc shield their queuecommand() to only access any of scsi_prot_sg...() if (scsi_get_prot_op(cmd) != SCSI_PROT_NORMAL). Do the same thing for zfcp, which introduced DIX support with commit ef3eb71d8ba4 ("[SCSI] zfcp: Introduce experimental support for DIF/DIX"). Otherwise, TUR SCSI commands as part of scsi_eh likely fail in zfcp, because the regular SCSI command with DIX protection data, that scsi_eh re-uses in scsi_send_eh_cmnd(), of course still has (scsi_prot_sg_count() != 0) and so zfcp sends down bogus requests to the FCP channel hardware. This causes scsi_eh_test_devices() to have (finish_cmds == 0) [not SCSI device is online or not scsi_eh_tur() failed] so regular SCSI commands, that caused / were affected by scsi_eh, are moved to work_q and scsi_eh_test_devices() itself returns false. In turn, it unnecessarily escalates in our case in scsi_eh_ready_devs() beyond host reset to finally scsi_eh_offline_sdevs() which sets affected SCSI devices offline with the following kernel message: "kernel: sd H:0:T:L: Device offlined - not ready after error recovery" Signed-off-by: Steffen Maier Fixes: ef3eb71d8ba4 ("[SCSI] zfcp: Introduce experimental support for DIF/DIX") Reviewed-by: Benjamin Block Signed-off-by: Benjamin Block Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/s390/scsi/zfcp_fsf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/s390/scsi/zfcp_fsf.c b/drivers/s390/scsi/zfcp_fsf.c index 27ff38f839fc..4efdb7415b03 100644 --- a/drivers/s390/scsi/zfcp_fsf.c +++ b/drivers/s390/scsi/zfcp_fsf.c @@ -2258,7 +2258,8 @@ int zfcp_fsf_fcp_cmnd(struct scsi_cmnd *scsi_cmnd) fcp_cmnd = (struct fcp_cmnd *) &req->qtcb->bottom.io.fcp_cmnd; zfcp_fc_scsi_to_fcp(fcp_cmnd, scsi_cmnd, 0); - if (scsi_prot_sg_count(scsi_cmnd)) { + if ((scsi_get_prot_op(scsi_cmnd) != SCSI_PROT_NORMAL) && + scsi_prot_sg_count(scsi_cmnd)) { zfcp_qdio_set_data_div(qdio, &req->qdio_req, scsi_prot_sg_count(scsi_cmnd)); retval = zfcp_qdio_sbals_from_sg(qdio, &req->qdio_req, From 88187de0e934df343db3193122f75d17e822bd28 Mon Sep 17 00:00:00 2001 From: Benjamin Block Date: Fri, 28 Jul 2017 12:30:52 +0200 Subject: [PATCH 040/555] scsi: zfcp: add handling for FCP_RESID_OVER to the fcp ingress path commit a099b7b1fc1f0418ab8d79ecf98153e1e134656e upstream. Up until now zfcp would just ignore the FCP_RESID_OVER flag in the FCP response IU. When this flag is set, it is possible, in regards to the FCP standard, that the storage-server processes the command normally, up to the point where data is missing and simply ignores those. In this case no CHECK CONDITION would be set, and because we ignored the FCP_RESID_OVER flag we resulted in at least a data loss or even -corruption as a follow-up error, depending on how the applications/layers on top behave. To prevent this, we now set the host-byte of the corresponding scsi_cmnd to DID_ERROR. Other storage-behaviors, where the same condition results in a CHECK CONDITION set in the answer, don't need to be changed as they are handled in the mid-layer already. Following is an example trace record decoded with zfcpdbf from the s390-tools package. We forcefully injected a fc_dl which is one byte too small: Timestamp : ... Area : SCSI Subarea : 00 Level : 3 Exception : - CPU ID : .. Caller : 0x... Record ID : 1 Tag : rsl_err Request ID : 0x... SCSI ID : 0x... SCSI LUN : 0x... SCSI result : 0x00070000 ^^DID_ERROR SCSI retries : 0x.. SCSI allowed : 0x.. SCSI scribble : 0x... SCSI opcode : 2a000000 00000000 08000000 00000000 FCP rsp inf cod: 0x00 FCP rsp IU : 00000000 00000000 00000400 00000001 ^^fr_flags==FCP_RESID_OVER ^^fr_status==SAM_STAT_GOOD ^^^^^^^^fr_resid 00000000 00000000 As of now, we don't actively handle to possibility that a response IU has both flags - FCP_RESID_OVER and FCP_RESID_UNDER - set at once. Reported-by: Luke M. Hopkins Reviewed-by: Steffen Maier Fixes: 553448f6c483 ("[SCSI] zfcp: Message cleanup") Fixes: ea127f975424 ("[PATCH] s390 (7/7): zfcp host adapter.") (tglx/history.git) Signed-off-by: Benjamin Block Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/s390/scsi/zfcp_fc.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/s390/scsi/zfcp_fc.h b/drivers/s390/scsi/zfcp_fc.h index df2b541c8287..a2275825186f 100644 --- a/drivers/s390/scsi/zfcp_fc.h +++ b/drivers/s390/scsi/zfcp_fc.h @@ -4,7 +4,7 @@ * Fibre Channel related definitions and inline functions for the zfcp * device driver * - * Copyright IBM Corp. 2009 + * Copyright IBM Corp. 2009, 2017 */ #ifndef ZFCP_FC_H @@ -279,6 +279,10 @@ void zfcp_fc_eval_fcp_rsp(struct fcp_resp_with_ext *fcp_rsp, !(rsp_flags & FCP_SNS_LEN_VAL) && fcp_rsp->resp.fr_status == SAM_STAT_GOOD) set_host_byte(scsi, DID_ERROR); + } else if (unlikely(rsp_flags & FCP_RESID_OVER)) { + /* FCP_DL was not sufficient for SCSI data length */ + if (fcp_rsp->resp.fr_status == SAM_STAT_GOOD) + set_host_byte(scsi, DID_ERROR); } } From 0cbb7431a7628e4eb15808a4178c4cc46ee07526 Mon Sep 17 00:00:00 2001 From: Steffen Maier Date: Fri, 28 Jul 2017 12:30:53 +0200 Subject: [PATCH 041/555] scsi: zfcp: fix capping of unsuccessful GPN_FT SAN response trace records commit 975171b4461be296a35e83ebd748946b81cf0635 upstream. v4.9 commit aceeffbb59bb ("zfcp: trace full payload of all SAN records (req,resp,iels)") fixed trace data loss of 2.6.38 commit 2c55b750a884 ("[SCSI] zfcp: Redesign of the debug tracing for SAN records.") necessary for problem determination, e.g. to see the currently active zone set during automatic port scan. While it already saves space by not dumping any empty residual entries of the large successful GPN_FT response (4 pages), there are seldom cases where the GPN_FT response is unsuccessful and likely does not have FC_NS_FID_LAST set in fp_flags so we did not cap the trace record. We typically see such case for an initiator WWPN, which is not in any zone. Cap unsuccessful responses to at least the actual basic CT_IU response plus whatever fits the SAN trace record built-in "payload" buffer just in case there's trailing information of which we would at least see the existence and its beginning. In order not to erroneously cap successful responses, we need to swap calling the trace function and setting the CT / ELS status to success (0). Example trace record pair formatted with zfcpdbf: Timestamp : ... Area : SAN Subarea : 00 Level : 1 Exception : - CPU ID : .. Caller : 0x... Record ID : 1 Tag : fssct_1 Request ID : 0x Destination ID : 0x00fffffc SAN req short : 01000000 fc020000 01720ffc 00000000 00000008 SAN req length : 20 | Timestamp : ... Area : SAN Subarea : 00 Level : 1 Exception : - CPU ID : .. Caller : 0x... Record ID : 2 Tag : fsscth2 Request ID : 0x Destination ID : 0x00fffffc SAN resp short : 01000000 fc020000 80010000 00090700 00000000 00000000 00000000 00000000 [trailing info] 00000000 00000000 00000000 00000000 [trailing info] SAN resp length: 16384 San resp info : 01000000 fc020000 80010000 00090700 00000000 00000000 00000000 00000000 [trailing info] 00000000 00000000 00000000 00000000 [trailing info] 00000000 00000000 00000000 00000000 [trailing info] 00000000 00000000 00000000 00000000 [trailing info] 00000000 00000000 00000000 00000000 [trailing info] 00000000 00000000 00000000 00000000 [trailing info] 00000000 00000000 00000000 00000000 [trailing info] 00000000 00000000 00000000 00000000 [trailing info] 00000000 00000000 00000000 00000000 [trailing info] 00000000 00000000 00000000 00000000 [trailing info] 00000000 00000000 00000000 00000000 [trailing info] 00000000 00000000 00000000 00000000 [trailing info] 00000000 00000000 00000000 00000000 [trailing info] 00000000 00000000 00000000 00000000 [trailing info] 00000000 00000000 00000000 00000000 [trailing info] The fix saves all but one of the previously associated 64 PAYload trace record chunks of size 256 bytes each. Signed-off-by: Steffen Maier Fixes: aceeffbb59bb ("zfcp: trace full payload of all SAN records (req,resp,iels)") Fixes: 2c55b750a884 ("[SCSI] zfcp: Redesign of the debug tracing for SAN records.") Reviewed-by: Benjamin Block Signed-off-by: Benjamin Block Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/s390/scsi/zfcp_dbf.c | 10 +++++++++- drivers/s390/scsi/zfcp_fsf.c | 4 ++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/s390/scsi/zfcp_dbf.c b/drivers/s390/scsi/zfcp_dbf.c index d5bf36ec8a75..31d62ea5fdcd 100644 --- a/drivers/s390/scsi/zfcp_dbf.c +++ b/drivers/s390/scsi/zfcp_dbf.c @@ -3,7 +3,7 @@ * * Debug traces for zfcp. * - * Copyright IBM Corp. 2002, 2016 + * Copyright IBM Corp. 2002, 2017 */ #define KMSG_COMPONENT "zfcp" @@ -447,6 +447,7 @@ static u16 zfcp_dbf_san_res_cap_len_if_gpn_ft(char *tag, struct fc_ct_hdr *reqh = sg_virt(ct_els->req); struct fc_ns_gid_ft *reqn = (struct fc_ns_gid_ft *)(reqh + 1); struct scatterlist *resp_entry = ct_els->resp; + struct fc_ct_hdr *resph; struct fc_gpn_ft_resp *acc; int max_entries, x, last = 0; @@ -473,6 +474,13 @@ static u16 zfcp_dbf_san_res_cap_len_if_gpn_ft(char *tag, return len; /* not GPN_FT response so do not cap */ acc = sg_virt(resp_entry); + + /* cap all but accept CT responses to at least the CT header */ + resph = (struct fc_ct_hdr *)acc; + if ((ct_els->status) || + (resph->ct_cmd != cpu_to_be16(FC_FS_ACC))) + return max(FC_CT_HDR_LEN, ZFCP_DBF_SAN_MAX_PAYLOAD); + max_entries = (reqh->ct_mr_size * 4 / sizeof(struct fc_gpn_ft_resp)) + 1 /* zfcp_fc_scan_ports: bytes correct, entries off-by-one * to account for header as 1st pseudo "entry" */; diff --git a/drivers/s390/scsi/zfcp_fsf.c b/drivers/s390/scsi/zfcp_fsf.c index 4efdb7415b03..1964391db904 100644 --- a/drivers/s390/scsi/zfcp_fsf.c +++ b/drivers/s390/scsi/zfcp_fsf.c @@ -928,8 +928,8 @@ static void zfcp_fsf_send_ct_handler(struct zfcp_fsf_req *req) switch (header->fsf_status) { case FSF_GOOD: - zfcp_dbf_san_res("fsscth2", req); ct->status = 0; + zfcp_dbf_san_res("fsscth2", req); break; case FSF_SERVICE_CLASS_NOT_SUPPORTED: zfcp_fsf_class_not_supp(req); @@ -1109,8 +1109,8 @@ static void zfcp_fsf_send_els_handler(struct zfcp_fsf_req *req) switch (header->fsf_status) { case FSF_GOOD: - zfcp_dbf_san_res("fsselh1", req); send_els->status = 0; + zfcp_dbf_san_res("fsselh1", req); break; case FSF_SERVICE_CLASS_NOT_SUPPORTED: zfcp_fsf_class_not_supp(req); From 424a20b09617d293ab85a5d89afc510892c14ae3 Mon Sep 17 00:00:00 2001 From: Steffen Maier Date: Fri, 28 Jul 2017 12:30:54 +0200 Subject: [PATCH 042/555] scsi: zfcp: fix passing fsf_req to SCSI trace on TMF to correlate with HBA commit 9fe5d2b2fd30aa8c7827ec62cbbe6d30df4fe3e3 upstream. Without this fix we get SCSI trace records on task management functions which cannot be correlated to HBA trace records because all fields related to the FSF request are empty (zero). Also, the FCP_RSP_IU is missing as well as any sense data if available. This was caused by v2.6.14 commit 8a36e4532ea1 ("[SCSI] zfcp: enhancement of zfcp debug features") introducing trace records for TMFs but hard coding NULL for a possibly existing TMF FSF request. The scsi_cmnd scribble is also zero or unrelated for the TMF request so it also could not lookup a suitable FSF request from there. A broken example trace record formatted with zfcpdbf from the s390-tools package: Timestamp : ... Area : SCSI Subarea : 00 Level : 1 Exception : - CPU ID : .. Caller : 0x... Record ID : 1 Tag : lr_fail Request ID : 0x0000000000000000 ^^^^^^^^^^^^^^^^ no correlation to HBA record SCSI ID : 0x SCSI LUN : 0x SCSI result : 0x000e0000 SCSI retries : 0x00 SCSI allowed : 0x05 SCSI scribble : 0x0000000000000000 SCSI opcode : 2a000017 3bb80000 08000000 00000000 FCP rsp inf cod: 0x00 ^^ no TMF response FCP rsp IU : 00000000 00000000 00000000 00000000 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 00000000 00000000 ^^^^^^^^^^^^^^^^^ no interesting FCP_RSP_IU Sense len : ... ^^^^^^^^^^^^^^^^^^^^ no sense data length Sense info : ... ^^^^^^^^^^^^^^^^^^^^ no sense data content, even if present There are some true cases where we really do not have an FSF request: "rsl_fai" from zfcp_dbf_scsi_fail_send() called for early returns / completions in zfcp_scsi_queuecommand(), "abrt_or", "abrt_bl", "abrt_ru", "abrt_ar" from zfcp_scsi_eh_abort_handler() where we did not get as far, "lr_nres", "tr_nres" from zfcp_task_mgmt_function() where we're successful and do not need to do anything because adapter stopped. For these cases it's correct to pass NULL for fsf_req to _zfcp_dbf_scsi(). Signed-off-by: Steffen Maier Fixes: 8a36e4532ea1 ("[SCSI] zfcp: enhancement of zfcp debug features") Reviewed-by: Benjamin Block Signed-off-by: Benjamin Block Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/s390/scsi/zfcp_dbf.h | 7 ++++--- drivers/s390/scsi/zfcp_scsi.c | 8 ++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/s390/scsi/zfcp_dbf.h b/drivers/s390/scsi/zfcp_dbf.h index db186d44cfaf..776d1ac125ff 100644 --- a/drivers/s390/scsi/zfcp_dbf.h +++ b/drivers/s390/scsi/zfcp_dbf.h @@ -2,7 +2,7 @@ * zfcp device driver * debug feature declarations * - * Copyright IBM Corp. 2008, 2016 + * Copyright IBM Corp. 2008, 2017 */ #ifndef ZFCP_DBF_H @@ -401,7 +401,8 @@ void zfcp_dbf_scsi_abort(char *tag, struct scsi_cmnd *scmd, * @flag: indicates type of reset (Target Reset, Logical Unit Reset) */ static inline -void zfcp_dbf_scsi_devreset(char *tag, struct scsi_cmnd *scmnd, u8 flag) +void zfcp_dbf_scsi_devreset(char *tag, struct scsi_cmnd *scmnd, u8 flag, + struct zfcp_fsf_req *fsf_req) { char tmp_tag[ZFCP_DBF_TAG_LEN]; @@ -411,7 +412,7 @@ void zfcp_dbf_scsi_devreset(char *tag, struct scsi_cmnd *scmnd, u8 flag) memcpy(tmp_tag, "lr_", 3); memcpy(&tmp_tag[3], tag, 4); - _zfcp_dbf_scsi(tmp_tag, 1, scmnd, NULL); + _zfcp_dbf_scsi(tmp_tag, 1, scmnd, fsf_req); } /** diff --git a/drivers/s390/scsi/zfcp_scsi.c b/drivers/s390/scsi/zfcp_scsi.c index 07ffdbb5107f..ecce7e858af9 100644 --- a/drivers/s390/scsi/zfcp_scsi.c +++ b/drivers/s390/scsi/zfcp_scsi.c @@ -3,7 +3,7 @@ * * Interface to Linux SCSI midlayer. * - * Copyright IBM Corp. 2002, 2016 + * Copyright IBM Corp. 2002, 2017 */ #define KMSG_COMPONENT "zfcp" @@ -278,7 +278,7 @@ static int zfcp_task_mgmt_function(struct scsi_cmnd *scpnt, u8 tm_flags) if (!(atomic_read(&adapter->status) & ZFCP_STATUS_COMMON_RUNNING)) { - zfcp_dbf_scsi_devreset("nres", scpnt, tm_flags); + zfcp_dbf_scsi_devreset("nres", scpnt, tm_flags, NULL); return SUCCESS; } } @@ -288,10 +288,10 @@ static int zfcp_task_mgmt_function(struct scsi_cmnd *scpnt, u8 tm_flags) wait_for_completion(&fsf_req->completion); if (fsf_req->status & ZFCP_STATUS_FSFREQ_TMFUNCFAILED) { - zfcp_dbf_scsi_devreset("fail", scpnt, tm_flags); + zfcp_dbf_scsi_devreset("fail", scpnt, tm_flags, fsf_req); retval = FAILED; } else { - zfcp_dbf_scsi_devreset("okay", scpnt, tm_flags); + zfcp_dbf_scsi_devreset("okay", scpnt, tm_flags, fsf_req); zfcp_scsi_forget_cmnds(zfcp_sdev, tm_flags); } From 8d706e3dd8ab496161b2b0e9367a7307df416205 Mon Sep 17 00:00:00 2001 From: Steffen Maier Date: Fri, 28 Jul 2017 12:30:55 +0200 Subject: [PATCH 043/555] scsi: zfcp: fix missing trace records for early returns in TMF eh handlers commit 1a5d999ebfc7bfe28deb48931bb57faa8e4102b6 upstream. For problem determination we need to see that we were in scsi_eh as well as whether and why we were successful or not. The following commits introduced new early returns without adding a trace record: v2.6.35 commit a1dbfddd02d2 ("[SCSI] zfcp: Pass return code from fc_block_scsi_eh to scsi eh") on fc_block_scsi_eh() returning != 0 which is FAST_IO_FAIL, v2.6.30 commit 63caf367e1c9 ("[SCSI] zfcp: Improve reliability of SCSI eh handlers in zfcp") on not having gotten an FSF request after the maximum number of retry attempts and thus could not issue a TMF and has to return FAILED. Signed-off-by: Steffen Maier Fixes: a1dbfddd02d2 ("[SCSI] zfcp: Pass return code from fc_block_scsi_eh to scsi eh") Fixes: 63caf367e1c9 ("[SCSI] zfcp: Improve reliability of SCSI eh handlers in zfcp") Reviewed-by: Benjamin Block Signed-off-by: Benjamin Block Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/s390/scsi/zfcp_scsi.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/s390/scsi/zfcp_scsi.c b/drivers/s390/scsi/zfcp_scsi.c index ecce7e858af9..9bd9b9a29dfc 100644 --- a/drivers/s390/scsi/zfcp_scsi.c +++ b/drivers/s390/scsi/zfcp_scsi.c @@ -273,8 +273,10 @@ static int zfcp_task_mgmt_function(struct scsi_cmnd *scpnt, u8 tm_flags) zfcp_erp_wait(adapter); ret = fc_block_scsi_eh(scpnt); - if (ret) + if (ret) { + zfcp_dbf_scsi_devreset("fiof", scpnt, tm_flags, NULL); return ret; + } if (!(atomic_read(&adapter->status) & ZFCP_STATUS_COMMON_RUNNING)) { @@ -282,8 +284,10 @@ static int zfcp_task_mgmt_function(struct scsi_cmnd *scpnt, u8 tm_flags) return SUCCESS; } } - if (!fsf_req) + if (!fsf_req) { + zfcp_dbf_scsi_devreset("reqf", scpnt, tm_flags, NULL); return FAILED; + } wait_for_completion(&fsf_req->completion); From 5283787709f8abd59bb3386337772e565db536cc Mon Sep 17 00:00:00 2001 From: Steffen Maier Date: Fri, 28 Jul 2017 12:30:56 +0200 Subject: [PATCH 044/555] scsi: zfcp: fix payload with full FCP_RSP IU in SCSI trace records commit 12c3e5754c8022a4f2fd1e9f00d19e99ee0d3cc1 upstream. If the FCP_RSP UI has optional parts (FCP_SNS_INFO or FCP_RSP_INFO) and thus does not fit into the fsp_rsp field built into a SCSI trace record, trace the full FCP_RSP UI with all optional parts as payload record instead of just FCP_SNS_INFO as payload and a 1 byte RSP_INFO_CODE part of FCP_RSP_INFO built into the SCSI record. That way we would also get the full FCP_SNS_INFO in case a target would ever send more than min(SCSI_SENSE_BUFFERSIZE==96, ZFCP_DBF_PAY_MAX_REC==256)==96. The mandatory part of FCP_RSP IU is only 24 bytes. PAYload costs at least one full PAY record of 256 bytes anyway. We cap to the hardware response size which is only FSF_FCP_RSP_SIZE==128. So we can just put the whole FCP_RSP IU with any optional parts into PAYload similarly as we do for SAN PAY since v4.9 commit aceeffbb59bb ("zfcp: trace full payload of all SAN records (req,resp,iels)"). This does not cause any additional trace records wasting memory. Decoded trace records were confusing because they showed a hard-coded sense data length of 96 even if the FCP_RSP_IU field FCP_SNS_LEN showed actually less. Since the same commit, we set pl_len for SAN traces to the full length of a request/response even if we cap the corresponding trace. In contrast, here for SCSI traces we set pl_len to the pre-computed length of FCP_RSP IU considering SNS_LEN or RSP_LEN if valid. Nonetheless we trace a hardcoded payload of length FSF_FCP_RSP_SIZE==128 if there were optional parts. This makes it easier for the zfcpdbf tool to format only the relevant part of the long FCP_RSP UI buffer. And any trailing information is still available in the payload trace record just in case. Rename the payload record tag from "fcp_sns" to "fcp_riu" to make the new content explicit to zfcpdbf which can then pick a suitable field name such as "FCP rsp IU all:" instead of "Sense info :" Also, the same zfcpdbf can still be backwards compatible with "fcp_sns". Old example trace record before this fix, formatted with the tool zfcpdbf from s390-tools: Timestamp : ... Area : SCSI Subarea : 00 Level : 3 Exception : - CPU id : .. Caller : 0x... Record id : 1 Tag : rsl_err Request id : 0x SCSI ID : 0x... SCSI LUN : 0x... SCSI result : 0x00000002 SCSI retries : 0x00 SCSI allowed : 0x05 SCSI scribble : 0x SCSI opcode : 00000000 00000000 00000000 00000000 FCP rsp inf cod: 0x00 FCP rsp IU : 00000000 00000000 00000202 00000000 ^^==FCP_SNS_LEN_VALID 00000020 00000000 ^^^^^^^^==FCP_SNS_LEN==32 Sense len : 96 <==min(SCSI_SENSE_BUFFERSIZE,ZFCP_DBF_PAY_MAX_REC) Sense info : 70000600 00000018 00000000 29000000 00000400 00000000 00000000 00000000 00000000 00000000 00000000 00000000<==superfluous 00000000 00000000 00000000 00000000<==superfluous 00000000 00000000 00000000 00000000<==superfluous 00000000 00000000 00000000 00000000<==superfluous New example trace records with this fix: Timestamp : ... Area : SCSI Subarea : 00 Level : 3 Exception : - CPU ID : .. Caller : 0x... Record ID : 1 Tag : rsl_err Request ID : 0x SCSI ID : 0x... SCSI LUN : 0x... SCSI result : 0x00000002 SCSI retries : 0x00 SCSI allowed : 0x03 SCSI scribble : 0x SCSI opcode : a30c0112 00000000 02000000 00000000 FCP rsp inf cod: 0x00 FCP rsp IU : 00000000 00000000 00000a02 00000200 00000020 00000000 FCP rsp IU len : 56 FCP rsp IU all : 00000000 00000000 00000a02 00000200 ^^=FCP_RESID_UNDER|FCP_SNS_LEN_VALID 00000020 00000000 70000500 00000018 ^^^^^^^^==FCP_SNS_LEN ^^^^^^^^^^^^^^^^^ 00000000 240000cb 00011100 00000000 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 00000000 00000000 ^^^^^^^^^^^^^^^^^==FCP_SNS_INFO Timestamp : ... Area : SCSI Subarea : 00 Level : 1 Exception : - CPU ID : .. Caller : 0x... Record ID : 1 Tag : lr_okay Request ID : 0x SCSI ID : 0x... SCSI LUN : 0x... SCSI result : 0x00000000 SCSI retries : 0x00 SCSI allowed : 0x05 SCSI scribble : 0x SCSI opcode : FCP rsp inf cod: 0x00 FCP rsp IU : 00000000 00000000 00000100 00000000 00000000 00000008 FCP rsp IU len : 32 FCP rsp IU all : 00000000 00000000 00000100 00000000 ^^==FCP_RSP_LEN_VALID 00000000 00000008 00000000 00000000 ^^^^^^^^==FCP_RSP_LEN ^^^^^^^^^^^^^^^^^==FCP_RSP_INFO Signed-off-by: Steffen Maier Fixes: 250a1352b95e ("[SCSI] zfcp: Redesign of the debug tracing for SCSI records.") Reviewed-by: Benjamin Block Signed-off-by: Benjamin Block Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/s390/scsi/zfcp_dbf.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/drivers/s390/scsi/zfcp_dbf.c b/drivers/s390/scsi/zfcp_dbf.c index 31d62ea5fdcd..c801f9782cb2 100644 --- a/drivers/s390/scsi/zfcp_dbf.c +++ b/drivers/s390/scsi/zfcp_dbf.c @@ -572,19 +572,32 @@ void zfcp_dbf_scsi(char *tag, int level, struct scsi_cmnd *sc, if (fsf) { rec->fsf_req_id = fsf->req_id; + rec->pl_len = FCP_RESP_WITH_EXT; fcp_rsp = (struct fcp_resp_with_ext *) &(fsf->qtcb->bottom.io.fcp_rsp); + /* mandatory parts of FCP_RSP IU in this SCSI record */ memcpy(&rec->fcp_rsp, fcp_rsp, FCP_RESP_WITH_EXT); if (fcp_rsp->resp.fr_flags & FCP_RSP_LEN_VAL) { fcp_rsp_info = (struct fcp_resp_rsp_info *) &fcp_rsp[1]; rec->fcp_rsp_info = fcp_rsp_info->rsp_code; + rec->pl_len += be32_to_cpu(fcp_rsp->ext.fr_rsp_len); } if (fcp_rsp->resp.fr_flags & FCP_SNS_LEN_VAL) { - rec->pl_len = min((u16)SCSI_SENSE_BUFFERSIZE, - (u16)ZFCP_DBF_PAY_MAX_REC); - zfcp_dbf_pl_write(dbf, sc->sense_buffer, rec->pl_len, - "fcp_sns", fsf->req_id); + rec->pl_len += be32_to_cpu(fcp_rsp->ext.fr_sns_len); } + /* complete FCP_RSP IU in associated PAYload record + * but only if there are optional parts + */ + if (fcp_rsp->resp.fr_flags != 0) + zfcp_dbf_pl_write( + dbf, fcp_rsp, + /* at least one full PAY record + * but not beyond hardware response field + */ + min_t(u16, max_t(u16, rec->pl_len, + ZFCP_DBF_PAY_MAX_REC), + FSF_FCP_RSP_SIZE), + "fcp_riu", fsf->req_id); } debug_event(dbf->scsi, level, rec, sizeof(*rec)); From adbbbd349e800921689eccf4fdadea4450849b4e Mon Sep 17 00:00:00 2001 From: Steffen Maier Date: Fri, 28 Jul 2017 12:30:57 +0200 Subject: [PATCH 045/555] scsi: zfcp: trace HBA FSF response by default on dismiss or timedout late response commit fdb7cee3b9e3c561502e58137a837341f10cbf8b upstream. At the default trace level, we only trace unsuccessful events including FSF responses. zfcp_dbf_hba_fsf_response() only used protocol status and FSF status to decide on an unsuccessful response. However, this is only one of multiple possible sources determining a failed struct zfcp_fsf_req. An FSF request can also "fail" if its response runs into an ERP timeout or if it gets dismissed because a higher level recovery was triggered [trace tags "erscf_1" or "erscf_2" in zfcp_erp_strategy_check_fsfreq()]. FSF requests with ERP timeout are: FSF_QTCB_EXCHANGE_CONFIG_DATA, FSF_QTCB_EXCHANGE_PORT_DATA, FSF_QTCB_OPEN_PORT_WITH_DID or FSF_QTCB_CLOSE_PORT or FSF_QTCB_CLOSE_PHYSICAL_PORT for target ports, FSF_QTCB_OPEN_LUN, FSF_QTCB_CLOSE_LUN. One example is slow queue processing which can cause follow-on errors, e.g. FSF_PORT_ALREADY_OPEN after FSF_QTCB_OPEN_PORT_WITH_DID timed out. In order to see the root cause, we need to see late responses even if the channel presented them successfully with FSF_PROT_GOOD and FSF_GOOD. Example trace records formatted with zfcpdbf from the s390-tools package: Timestamp : ... Area : REC Subarea : 00 Level : 1 Exception : - CPU ID : .. Caller : ... Record ID : 1 Tag : fcegpf1 LUN : 0xffffffffffffffff WWPN : 0x D_ID : 0x00 Adapter status : 0x5400050b Port status : 0x41200000 LUN status : 0x00000000 Ready count : 0x00000001 Running count : 0x... ERP want : 0x02 ZFCP_ERP_ACTION_REOPEN_PORT ERP need : 0x02 ZFCP_ERP_ACTION_REOPEN_PORT | Timestamp : ... 30 seconds later Area : REC Subarea : 00 Level : 1 Exception : - CPU ID : .. Caller : ... Record ID : 2 Tag : erscf_2 LUN : 0xffffffffffffffff WWPN : 0x D_ID : 0x00 Adapter status : 0x5400050b Port status : 0x41200000 LUN status : 0x00000000 Request ID : 0x ERP status : 0x10000000 ZFCP_STATUS_ERP_TIMEDOUT ERP step : 0x0800 ZFCP_ERP_STEP_PORT_OPENING ERP action : 0x02 ZFCP_ERP_ACTION_REOPEN_PORT ERP count : 0x00 | Timestamp : ... later than previous record Area : HBA Subarea : 00 Level : 5 > default level => 3 <= default level Exception : - CPU ID : 00 Caller : ... Record ID : 1 Tag : fs_qtcb => fs_rerr Request ID : 0x Request status : 0x00001010 ZFCP_STATUS_FSFREQ_DISMISSED | ZFCP_STATUS_FSFREQ_CLEANUP FSF cmnd : 0x00000005 FSF sequence no: 0x... FSF issued : ... > 30 seconds ago FSF stat : 0x00000000 FSF_GOOD FSF stat qual : 00000000 00000000 00000000 00000000 Prot stat : 0x00000001 FSF_PROT_GOOD Prot stat qual : 00000000 00000000 00000000 00000000 Port handle : 0x... LUN handle : 0x00000000 QTCB log length: ... QTCB log info : ... In case of problems detecting that new responses are waiting on the input queue, we sooner or later trigger adapter recovery due to an FSF request timeout (trace tag "fsrth_1"). FSF requests with FSF request timeout are: typically FSF_QTCB_ABORT_FCP_CMND; but theoretically also FSF_QTCB_EXCHANGE_CONFIG_DATA or FSF_QTCB_EXCHANGE_PORT_DATA via sysfs, FSF_QTCB_OPEN_PORT_WITH_DID or FSF_QTCB_CLOSE_PORT for WKA ports, FSF_QTCB_FCP_CMND for task management function (LUN / target reset). One or more pending requests can meanwhile have FSF_PROT_GOOD and FSF_GOOD because the channel filled in the response via DMA into the request's QTCB. In a theroretical case, inject code can create an erroneous FSF request on purpose. If data router is enabled, it uses deferred error reporting. A READ SCSI command can succeed with FSF_PROT_GOOD, FSF_GOOD, and SAM_STAT_GOOD. But on writing the read data to host memory via DMA, it can still fail, e.g. if an intentionally wrong scatter list does not provide enough space. Rather than getting an unsuccessful response, we get a QDIO activate check which in turn triggers adapter recovery. One or more pending requests can meanwhile have FSF_PROT_GOOD and FSF_GOOD because the channel filled in the response via DMA into the request's QTCB. Example trace records formatted with zfcpdbf from the s390-tools package: Timestamp : ... Area : HBA Subarea : 00 Level : 6 > default level => 3 <= default level Exception : - CPU ID : .. Caller : ... Record ID : 1 Tag : fs_norm => fs_rerr Request ID : 0x Request status : 0x00001010 ZFCP_STATUS_FSFREQ_DISMISSED | ZFCP_STATUS_FSFREQ_CLEANUP FSF cmnd : 0x00000001 FSF sequence no: 0x... FSF issued : ... FSF stat : 0x00000000 FSF_GOOD FSF stat qual : 00000000 00000000 00000000 00000000 Prot stat : 0x00000001 FSF_PROT_GOOD Prot stat qual : ........ ........ 00000000 00000000 Port handle : 0x... LUN handle : 0x... | Timestamp : ... Area : SCSI Subarea : 00 Level : 3 Exception : - CPU ID : .. Caller : ... Record ID : 1 Tag : rsl_err Request ID : 0x SCSI ID : 0x... SCSI LUN : 0x... SCSI result : 0x000e0000 DID_TRANSPORT_DISRUPTED SCSI retries : 0x00 SCSI allowed : 0x05 SCSI scribble : 0x SCSI opcode : 28... Read(10) FCP rsp inf cod: 0x00 FCP rsp IU : 00000000 00000000 00000000 00000000 ^^ SAM_STAT_GOOD 00000000 00000000 Only with luck in both above cases, we could see a follow-on trace record of an unsuccesful event following a successful but late FSF response with FSF_PROT_GOOD and FSF_GOOD. Typically this was the case for I/O requests resulting in a SCSI trace record "rsl_err" with DID_TRANSPORT_DISRUPTED [On ZFCP_STATUS_FSFREQ_DISMISSED, zfcp_fsf_protstatus_eval() sets ZFCP_STATUS_FSFREQ_ERROR seen by the request handler functions as failure]. However, the reason for this follow-on trace was invisible because the corresponding HBA trace record was missing at the default trace level (by default hidden records with tags "fs_norm", "fs_qtcb", or "fs_open"). On adapter recovery, after we had shut down the QDIO queues, we perform unsuccessful pseudo completions with flag ZFCP_STATUS_FSFREQ_DISMISSED for each pending FSF request in zfcp_fsf_req_dismiss_all(). In order to find the root cause, we need to see all pseudo responses even if the channel presented them successfully with FSF_PROT_GOOD and FSF_GOOD. Therefore, check zfcp_fsf_req.status for ZFCP_STATUS_FSFREQ_DISMISSED or ZFCP_STATUS_FSFREQ_ERROR and trace with a new tag "fs_rerr". It does not matter that there are numerous places which set ZFCP_STATUS_FSFREQ_ERROR after the location where we trace an FSF response early. These cases are based on protocol status != FSF_PROT_GOOD or == FSF_PROT_FSF_STATUS_PRESENTED and are thus already traced by default as trace tag "fs_perr" or "fs_ferr" respectively. NB: The trace record with tag "fssrh_1" for status read buffers on dismiss all remains. zfcp_fsf_req_complete() handles this and returns early. All other FSF request types are handled separately and as described above. Signed-off-by: Steffen Maier Fixes: 8a36e4532ea1 ("[SCSI] zfcp: enhancement of zfcp debug features") Fixes: 2e261af84cdb ("[SCSI] zfcp: Only collect FSF/HBA debug data for matching trace levels") Reviewed-by: Benjamin Block Signed-off-by: Benjamin Block Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/s390/scsi/zfcp_dbf.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/s390/scsi/zfcp_dbf.h b/drivers/s390/scsi/zfcp_dbf.h index 776d1ac125ff..8e7f8e6037d2 100644 --- a/drivers/s390/scsi/zfcp_dbf.h +++ b/drivers/s390/scsi/zfcp_dbf.h @@ -323,7 +323,11 @@ void zfcp_dbf_hba_fsf_response(struct zfcp_fsf_req *req) { struct fsf_qtcb *qtcb = req->qtcb; - if ((qtcb->prefix.prot_status != FSF_PROT_GOOD) && + if (unlikely(req->status & (ZFCP_STATUS_FSFREQ_DISMISSED | + ZFCP_STATUS_FSFREQ_ERROR))) { + zfcp_dbf_hba_fsf_resp("fs_rerr", 3, req); + + } else if ((qtcb->prefix.prot_status != FSF_PROT_GOOD) && (qtcb->prefix.prot_status != FSF_PROT_FSF_STATUS_PRESENTED)) { zfcp_dbf_hba_fsf_resp("fs_perr", 1, req); From c24f722a82b1648caeb201811673d2330ae8df6f Mon Sep 17 00:00:00 2001 From: Steffen Maier Date: Fri, 28 Jul 2017 12:30:58 +0200 Subject: [PATCH 046/555] scsi: zfcp: trace high part of "new" 64 bit SCSI LUN commit 5d4a3d0a2ff23799b956e5962b886287614e7fad upstream. Complements debugging aspects of the otherwise functionally complete v3.17 commit 9cb78c16f5da ("scsi: use 64-bit LUNs"). While I don't have access to a target exporting 3 or 4 level LUNs, I did test it by explicitly attaching a non-existent fake 4 level LUN by means of zfcp sysfs attribute "unit_add". In order to see corresponding trace records of otherwise successful events, we had to increase the trace level of area SCSI and HBA to 6. $ echo 6 > /sys/kernel/debug/s390dbf/zfcp_0.0.1880_scsi/level $ echo 6 > /sys/kernel/debug/s390dbf/zfcp_0.0.1880_hba/level $ echo 0x4011402240334044 > \ /sys/bus/ccw/drivers/zfcp/0.0.1880/0x50050763031bd327/unit_add Example output formatted by an updated zfcpdbf from the s390-tools package interspersed with kernel messages at scsi_logging_level=4605: Timestamp : ... Area : REC Subarea : 00 Level : 1 Exception : - CPU ID : .. Caller : 0x... Record ID : 1 Tag : scsla_1 LUN : 0x4011402240334044 WWPN : 0x50050763031bd327 D_ID : 0x00...... Adapter status : 0x5400050b Port status : 0x54000001 LUN status : 0x41000000 Ready count : 0x00000001 Running count : 0x00000000 ERP want : 0x01 ERP need : 0x01 scsi 2:0:0:4630896905707208721: scsi scan: INQUIRY pass 1 length 36 scsi 2:0:0:4630896905707208721: scsi scan: INQUIRY successful with code 0x0 Timestamp : ... Area : HBA Subarea : 00 Level : 6 Exception : - CPU ID : .. Caller : 0x... Record ID : 1 Tag : fs_norm Request ID : 0x Request status : 0x00000010 FSF cmnd : 0x00000001 FSF sequence no: 0x... FSF issued : ... FSF stat : 0x00000000 FSF stat qual : 00000000 00000000 00000000 00000000 Prot stat : 0x00000001 Prot stat qual : ........ ........ 00000000 00000000 Port handle : 0x... LUN handle : 0x... | Timestamp : ... Area : SCSI Subarea : 00 Level : 6 Exception : - CPU ID : .. Caller : 0x... Record ID : 1 Tag : rsl_nor Request ID : 0x SCSI ID : 0x00000000 SCSI LUN : 0x40224011 SCSI LUN high : 0x40444033 <======================= SCSI result : 0x00000000 SCSI retries : 0x00 SCSI allowed : 0x03 SCSI scribble : 0x SCSI opcode : 12000000 a4000000 00000000 00000000 FCP rsp inf cod: 0x00 FCP rsp IU : 00000000 00000000 00000000 00000000 00000000 00000000 scsi 2:0:0:4630896905707208721: scsi scan: INQUIRY pass 2 length 164 scsi 2:0:0:4630896905707208721: scsi scan: INQUIRY successful with code 0x0 scsi 2:0:0:4630896905707208721: scsi scan: peripheral device type of 31, \ no device added Signed-off-by: Steffen Maier Fixes: 9cb78c16f5da ("scsi: use 64-bit LUNs") Reviewed-by: Benjamin Block Reviewed-by: Jens Remus Signed-off-by: Benjamin Block Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/s390/scsi/zfcp_dbf.c | 2 +- drivers/s390/scsi/zfcp_dbf.h | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/s390/scsi/zfcp_dbf.c b/drivers/s390/scsi/zfcp_dbf.c index c801f9782cb2..34367d172961 100644 --- a/drivers/s390/scsi/zfcp_dbf.c +++ b/drivers/s390/scsi/zfcp_dbf.c @@ -563,8 +563,8 @@ void zfcp_dbf_scsi(char *tag, int level, struct scsi_cmnd *sc, rec->scsi_retries = sc->retries; rec->scsi_allowed = sc->allowed; rec->scsi_id = sc->device->id; - /* struct zfcp_dbf_scsi needs to be updated to handle 64bit LUNs */ rec->scsi_lun = (u32)sc->device->lun; + rec->scsi_lun_64_hi = (u32)(sc->device->lun >> 32); rec->host_scribble = (unsigned long)sc->host_scribble; memcpy(rec->scsi_opcode, sc->cmnd, diff --git a/drivers/s390/scsi/zfcp_dbf.h b/drivers/s390/scsi/zfcp_dbf.h index 8e7f8e6037d2..b60667c145fd 100644 --- a/drivers/s390/scsi/zfcp_dbf.h +++ b/drivers/s390/scsi/zfcp_dbf.h @@ -204,7 +204,7 @@ enum zfcp_dbf_scsi_id { * @id: unique number of recovery record type * @tag: identifier string specifying the location of initiation * @scsi_id: scsi device id - * @scsi_lun: scsi device logical unit number + * @scsi_lun: scsi device logical unit number, low part of 64 bit, old 32 bit * @scsi_result: scsi result * @scsi_retries: current retry number of scsi request * @scsi_allowed: allowed retries @@ -214,6 +214,7 @@ enum zfcp_dbf_scsi_id { * @host_scribble: LLD specific data attached to SCSI request * @pl_len: length of paload stored as zfcp_dbf_pay * @fsf_rsp: response for fsf request + * @scsi_lun_64_hi: scsi device logical unit number, high part of 64 bit */ struct zfcp_dbf_scsi { u8 id; @@ -230,6 +231,7 @@ struct zfcp_dbf_scsi { u64 host_scribble; u16 pl_len; struct fcp_resp_with_ext fcp_rsp; + u32 scsi_lun_64_hi; } __packed; /** From 7efc41514a0190b6fa942b4cf740306d00c0e4c8 Mon Sep 17 00:00:00 2001 From: Shivasharan S Date: Wed, 23 Aug 2017 04:46:56 -0700 Subject: [PATCH 047/555] scsi: megaraid_sas: set minimum value of resetwaittime to be 1 secs commit e636a7a430f41efb0ff2727960ce61ef9f8f6769 upstream. Setting resetwaittime to 0 during a FW fault will result in driver not calling the OCR. Signed-off-by: Kashyap Desai Signed-off-by: Shivasharan S Reviewed-by: Hannes Reinecke Reviewed-by: Tomas Henzl Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/megaraid/megaraid_sas_base.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index d8b1fbd4c8aa..5c104545aa91 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -5290,7 +5290,8 @@ static int megasas_init_fw(struct megasas_instance *instance) instance->throttlequeuedepth = MEGASAS_THROTTLE_QUEUE_DEPTH; - if (resetwaittime > MEGASAS_RESET_WAIT_TIME) + if ((resetwaittime < 1) || + (resetwaittime > MEGASAS_RESET_WAIT_TIME)) resetwaittime = MEGASAS_RESET_WAIT_TIME; if ((scmd_timeout < 10) || (scmd_timeout > MEGASAS_DEFAULT_CMD_TIMEOUT)) From c62da79e1be59c8ab426ec0c0cb8ea2a9800a741 Mon Sep 17 00:00:00 2001 From: Shivasharan S Date: Wed, 23 Aug 2017 04:47:01 -0700 Subject: [PATCH 048/555] scsi: megaraid_sas: Check valid aen class range to avoid kernel panic commit 91b3d9f0069c8307d0b3a4c6843b65a439183318 upstream. Signed-off-by: Kashyap Desai Signed-off-by: Shivasharan S Reviewed-by: Hannes Reinecke Reviewed-by: Tomas Henzl Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/megaraid/megaraid_sas_base.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index 5c104545aa91..2f01cab48333 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -5460,6 +5460,14 @@ megasas_register_aen(struct megasas_instance *instance, u32 seq_num, prev_aen.word = le32_to_cpu(instance->aen_cmd->frame->dcmd.mbox.w[1]); + if ((curr_aen.members.class < MFI_EVT_CLASS_DEBUG) || + (curr_aen.members.class > MFI_EVT_CLASS_DEAD)) { + dev_info(&instance->pdev->dev, + "%s %d out of range class %d send by application\n", + __func__, __LINE__, curr_aen.members.class); + return 0; + } + /* * A class whose enum value is smaller is inclusive of all * higher values. If a PROGRESS (= -1) was previously From d8817f5f2937a95d7ad8f9fefbc22045b8cd2e38 Mon Sep 17 00:00:00 2001 From: Shivasharan S Date: Wed, 23 Aug 2017 04:47:04 -0700 Subject: [PATCH 049/555] scsi: megaraid_sas: Return pended IOCTLs with cmd_status MFI_STAT_WRONG_STATE in case adapter is dead commit eb3fe263a48b0d27b229c213929c4cb3b1b39a0f upstream. After a kill adapter, since the cmd_status is not set, the IOCTLs will be hung in driver resulting in application hang. Set cmd_status MFI_STAT_WRONG_STATE when completing pended IOCTLs. Signed-off-by: Kashyap Desai Signed-off-by: Shivasharan S Reviewed-by: Hannes Reinecke Reviewed-by: Tomas Henzl Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/megaraid/megaraid_sas_base.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index 2f01cab48333..35cbd36f8d3b 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -1901,9 +1901,12 @@ static void megasas_complete_outstanding_ioctls(struct megasas_instance *instanc if (cmd_fusion->sync_cmd_idx != (u32)ULONG_MAX) { cmd_mfi = instance->cmd_list[cmd_fusion->sync_cmd_idx]; if (cmd_mfi->sync_cmd && - cmd_mfi->frame->hdr.cmd != MFI_CMD_ABORT) + (cmd_mfi->frame->hdr.cmd != MFI_CMD_ABORT)) { + cmd_mfi->frame->hdr.cmd_status = + MFI_STAT_WRONG_STATE; megasas_complete_cmd(instance, cmd_mfi, DID_OK); + } } } } else { From 5b8f80d34abfd41fc6fd1c92c669138c482ce55e Mon Sep 17 00:00:00 2001 From: Long Li Date: Mon, 28 Aug 2017 17:43:59 -0700 Subject: [PATCH 050/555] scsi: storvsc: fix memory leak on ring buffer busy commit 0208eeaa650c5c866a3242201678a19e6dc4a14e upstream. When storvsc is sending I/O to Hyper-v, it may allocate a bigger buffer descriptor for large data payload that can't fit into a pre-allocated buffer descriptor. This bigger buffer is freed on return path. If I/O request to Hyper-v fails due to ring buffer busy, the storvsc allocated buffer descriptor should also be freed. [mkp: applied by hand] Fixes: be0cf6ca301c ("scsi: storvsc: Set the tablesize based on the information given by the host") Signed-off-by: Long Li Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/storvsc_drv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index c5ab1b0037fc..2bf96d33428a 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -1559,6 +1559,8 @@ static int storvsc_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scmnd) ret = storvsc_do_io(dev, cmd_request); if (ret == -EAGAIN) { + if (payload_sz > sizeof(cmd_request->mpb)) + kfree(payload); /* no more space */ return SCSI_MLQUEUE_DEVICE_BUSY; } From 91fb151822d0808b8e691d9e11ac44b0f020203e Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 7 Apr 2017 09:34:13 +0200 Subject: [PATCH 051/555] scsi: sg: remove 'save_scat_len' commit 136e57bf43dc4babbfb8783abbf707d483cacbe3 upstream. Unused. Signed-off-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Tested-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/sg.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 9236a13d5d2a..9aac35006ff7 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -146,7 +146,6 @@ typedef struct sg_fd { /* holds the state of a file descriptor */ int timeout; /* defaults to SG_DEFAULT_TIMEOUT */ int timeout_user; /* defaults to SG_DEFAULT_TIMEOUT_USER */ Sg_scatter_hold reserve; /* buffer held for this file descriptor */ - unsigned save_scat_len; /* original length of trunc. scat. element */ Sg_request *headrp; /* head of request slist, NULL->empty */ struct fasync_struct *async_qp; /* used by asynchronous notification */ Sg_request req_arr[SG_MAX_QUEUE]; /* used as singly-linked list */ @@ -2049,7 +2048,6 @@ sg_unlink_reserve(Sg_fd * sfp, Sg_request * srp) req_schp->pages = NULL; req_schp->page_order = 0; req_schp->sglist_len = 0; - sfp->save_scat_len = 0; srp->res_used = 0; /* Called without mutex lock to avoid deadlock */ sfp->res_in_use = 0; From 2b2d86b0d43dd46dce468de308899d488e91fb30 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 7 Apr 2017 09:34:16 +0200 Subject: [PATCH 052/555] scsi: sg: use standard lists for sg_requests commit 109bade9c625c89bb5ea753aaa1a0a97e6fbb548 upstream. 'Sg_request' is using a private list implementation; convert it to standard lists. Signed-off-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Tested-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/sg.c | 147 +++++++++++++++++++--------------------------- 1 file changed, 61 insertions(+), 86 deletions(-) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 9aac35006ff7..53e5efcc7857 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -122,7 +122,7 @@ struct sg_device; /* forward declarations */ struct sg_fd; typedef struct sg_request { /* SG_MAX_QUEUE requests outstanding per file */ - struct sg_request *nextrp; /* NULL -> tail request (slist) */ + struct list_head entry; /* list entry */ struct sg_fd *parentfp; /* NULL -> not in use */ Sg_scatter_hold data; /* hold buffer, perhaps scatter list */ sg_io_hdr_t header; /* scsi command+info, see */ @@ -146,7 +146,7 @@ typedef struct sg_fd { /* holds the state of a file descriptor */ int timeout; /* defaults to SG_DEFAULT_TIMEOUT */ int timeout_user; /* defaults to SG_DEFAULT_TIMEOUT_USER */ Sg_scatter_hold reserve; /* buffer held for this file descriptor */ - Sg_request *headrp; /* head of request slist, NULL->empty */ + struct list_head rq_list; /* head of request list */ struct fasync_struct *async_qp; /* used by asynchronous notification */ Sg_request req_arr[SG_MAX_QUEUE]; /* used as singly-linked list */ char low_dma; /* as in parent but possibly overridden to 1 */ @@ -940,7 +940,7 @@ sg_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg) if (!access_ok(VERIFY_WRITE, ip, sizeof (int))) return -EFAULT; read_lock_irqsave(&sfp->rq_list_lock, iflags); - for (srp = sfp->headrp; srp; srp = srp->nextrp) { + list_for_each_entry(srp, &sfp->rq_list, entry) { if ((1 == srp->done) && (!srp->sg_io_owned)) { read_unlock_irqrestore(&sfp->rq_list_lock, iflags); @@ -953,7 +953,8 @@ sg_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg) return 0; case SG_GET_NUM_WAITING: read_lock_irqsave(&sfp->rq_list_lock, iflags); - for (val = 0, srp = sfp->headrp; srp; srp = srp->nextrp) { + val = 0; + list_for_each_entry(srp, &sfp->rq_list, entry) { if ((1 == srp->done) && (!srp->sg_io_owned)) ++val; } @@ -1028,35 +1029,33 @@ sg_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg) if (!rinfo) return -ENOMEM; read_lock_irqsave(&sfp->rq_list_lock, iflags); - for (srp = sfp->headrp, val = 0; val < SG_MAX_QUEUE; - ++val, srp = srp ? srp->nextrp : srp) { + val = 0; + list_for_each_entry(srp, &sfp->rq_list, entry) { + if (val > SG_MAX_QUEUE) + break; memset(&rinfo[val], 0, SZ_SG_REQ_INFO); - if (srp) { - rinfo[val].req_state = srp->done + 1; - rinfo[val].problem = - srp->header.masked_status & - srp->header.host_status & - srp->header.driver_status; - if (srp->done) - rinfo[val].duration = - srp->header.duration; - else { - ms = jiffies_to_msecs(jiffies); - rinfo[val].duration = - (ms > srp->header.duration) ? - (ms - srp->header.duration) : 0; - } - rinfo[val].orphan = srp->orphan; - rinfo[val].sg_io_owned = - srp->sg_io_owned; - rinfo[val].pack_id = - srp->header.pack_id; - rinfo[val].usr_ptr = - srp->header.usr_ptr; + rinfo[val].req_state = srp->done + 1; + rinfo[val].problem = + srp->header.masked_status & + srp->header.host_status & + srp->header.driver_status; + if (srp->done) + rinfo[val].duration = + srp->header.duration; + else { + ms = jiffies_to_msecs(jiffies); + rinfo[val].duration = + (ms > srp->header.duration) ? + (ms - srp->header.duration) : 0; } + rinfo[val].orphan = srp->orphan; + rinfo[val].sg_io_owned = srp->sg_io_owned; + rinfo[val].pack_id = srp->header.pack_id; + rinfo[val].usr_ptr = srp->header.usr_ptr; + val++; } read_unlock_irqrestore(&sfp->rq_list_lock, iflags); - result = __copy_to_user(p, rinfo, + result = __copy_to_user(p, rinfo, SZ_SG_REQ_INFO * SG_MAX_QUEUE); result = result ? -EFAULT : 0; kfree(rinfo); @@ -1162,7 +1161,7 @@ sg_poll(struct file *filp, poll_table * wait) return POLLERR; poll_wait(filp, &sfp->read_wait, wait); read_lock_irqsave(&sfp->rq_list_lock, iflags); - for (srp = sfp->headrp; srp; srp = srp->nextrp) { + list_for_each_entry(srp, &sfp->rq_list, entry) { /* if any read waiting, flag it */ if ((0 == res) && (1 == srp->done) && (!srp->sg_io_owned)) res = POLLIN | POLLRDNORM; @@ -2060,7 +2059,7 @@ sg_get_rq_mark(Sg_fd * sfp, int pack_id) unsigned long iflags; write_lock_irqsave(&sfp->rq_list_lock, iflags); - for (resp = sfp->headrp; resp; resp = resp->nextrp) { + list_for_each_entry(resp, &sfp->rq_list, entry) { /* look for requests that are ready + not SG_IO owned */ if ((1 == resp->done) && (!resp->sg_io_owned) && ((-1 == pack_id) || (resp->header.pack_id == pack_id))) { @@ -2078,70 +2077,45 @@ sg_add_request(Sg_fd * sfp) { int k; unsigned long iflags; - Sg_request *resp; Sg_request *rp = sfp->req_arr; write_lock_irqsave(&sfp->rq_list_lock, iflags); - resp = sfp->headrp; - if (!resp) { - memset(rp, 0, sizeof (Sg_request)); - rp->parentfp = sfp; - resp = rp; - sfp->headrp = resp; - } else { - if (0 == sfp->cmd_q) - resp = NULL; /* command queuing disallowed */ - else { - for (k = 0; k < SG_MAX_QUEUE; ++k, ++rp) { - if (!rp->parentfp) - break; - } - if (k < SG_MAX_QUEUE) { - memset(rp, 0, sizeof (Sg_request)); - rp->parentfp = sfp; - while (resp->nextrp) - resp = resp->nextrp; - resp->nextrp = rp; - resp = rp; - } else - resp = NULL; + if (!list_empty(&sfp->rq_list)) { + if (!sfp->cmd_q) + goto out_unlock; + + for (k = 0; k < SG_MAX_QUEUE; ++k, ++rp) { + if (!rp->parentfp) + break; } + if (k >= SG_MAX_QUEUE) + goto out_unlock; } - if (resp) { - resp->nextrp = NULL; - resp->header.duration = jiffies_to_msecs(jiffies); - } + memset(rp, 0, sizeof (Sg_request)); + rp->parentfp = sfp; + rp->header.duration = jiffies_to_msecs(jiffies); + list_add_tail(&rp->entry, &sfp->rq_list); write_unlock_irqrestore(&sfp->rq_list_lock, iflags); - return resp; + return rp; +out_unlock: + write_unlock_irqrestore(&sfp->rq_list_lock, iflags); + return NULL; } /* Return of 1 for found; 0 for not found */ static int sg_remove_request(Sg_fd * sfp, Sg_request * srp) { - Sg_request *prev_rp; - Sg_request *rp; unsigned long iflags; int res = 0; - if ((!sfp) || (!srp) || (!sfp->headrp)) + if (!sfp || !srp || list_empty(&sfp->rq_list)) return res; write_lock_irqsave(&sfp->rq_list_lock, iflags); - prev_rp = sfp->headrp; - if (srp == prev_rp) { - sfp->headrp = prev_rp->nextrp; - prev_rp->parentfp = NULL; + if (!list_empty(&srp->entry)) { + list_del(&srp->entry); + srp->parentfp = NULL; res = 1; - } else { - while ((rp = prev_rp->nextrp)) { - if (srp == rp) { - prev_rp->nextrp = rp->nextrp; - rp->parentfp = NULL; - res = 1; - break; - } - prev_rp = rp; - } } write_unlock_irqrestore(&sfp->rq_list_lock, iflags); return res; @@ -2160,7 +2134,7 @@ sg_add_sfp(Sg_device * sdp) init_waitqueue_head(&sfp->read_wait); rwlock_init(&sfp->rq_list_lock); - + INIT_LIST_HEAD(&sfp->rq_list); kref_init(&sfp->f_ref); mutex_init(&sfp->f_mutex); sfp->timeout = SG_DEFAULT_TIMEOUT; @@ -2201,10 +2175,13 @@ sg_remove_sfp_usercontext(struct work_struct *work) { struct sg_fd *sfp = container_of(work, struct sg_fd, ew.work); struct sg_device *sdp = sfp->parentdp; + Sg_request *srp; /* Cleanup any responses which were never read(). */ - while (sfp->headrp) - sg_finish_rem_req(sfp->headrp); + while (!list_empty(&sfp->rq_list)) { + srp = list_first_entry(&sfp->rq_list, Sg_request, entry); + sg_finish_rem_req(srp); + } if (sfp->reserve.bufflen > 0) { SCSI_LOG_TIMEOUT(6, sg_printk(KERN_INFO, sdp, @@ -2607,7 +2584,7 @@ static int sg_proc_seq_show_devstrs(struct seq_file *s, void *v) /* must be called while holding sg_index_lock */ static void sg_proc_debug_helper(struct seq_file *s, Sg_device * sdp) { - int k, m, new_interface, blen, usg; + int k, new_interface, blen, usg; Sg_request *srp; Sg_fd *fp; const sg_io_hdr_t *hp; @@ -2627,13 +2604,11 @@ static void sg_proc_debug_helper(struct seq_file *s, Sg_device * sdp) seq_printf(s, " cmd_q=%d f_packid=%d k_orphan=%d closed=0\n", (int) fp->cmd_q, (int) fp->force_packid, (int) fp->keep_orphan); - for (m = 0, srp = fp->headrp; - srp != NULL; - ++m, srp = srp->nextrp) { + list_for_each_entry(srp, &fp->rq_list, entry) { hp = &srp->header; new_interface = (hp->interface_id == '\0') ? 0 : 1; if (srp->res_used) { - if (new_interface && + if (new_interface && (SG_FLAG_MMAP_IO & hp->flags)) cp = " mmap>> "; else @@ -2664,7 +2639,7 @@ static void sg_proc_debug_helper(struct seq_file *s, Sg_device * sdp) seq_printf(s, "ms sgat=%d op=0x%02x\n", usg, (int) srp->data.cmd_opcode); } - if (0 == m) + if (list_empty(&fp->rq_list)) seq_puts(s, " No requests active\n"); read_unlock(&fp->rq_list_lock); } From c6b9a2007c92cc0bcf2b947ed369fffe1652197e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 17 Aug 2017 10:09:54 +0300 Subject: [PATCH 053/555] scsi: sg: off by one in sg_ioctl() commit bd46fc406b30d1db1aff8dabaff8d18bb423fdcf upstream. If "val" is SG_MAX_QUEUE then we are one element beyond the end of the "rinfo" array so the > should be >=. Fixes: 109bade9c625 ("scsi: sg: use standard lists for sg_requests") Signed-off-by: Dan Carpenter Acked-by: Douglas Gilbert Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/sg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 53e5efcc7857..b7181f390876 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1031,7 +1031,7 @@ sg_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg) read_lock_irqsave(&sfp->rq_list_lock, iflags); val = 0; list_for_each_entry(srp, &sfp->rq_list, entry) { - if (val > SG_MAX_QUEUE) + if (val >= SG_MAX_QUEUE) break; memset(&rinfo[val], 0, SZ_SG_REQ_INFO); rinfo[val].req_state = srp->done + 1; From 25d5a8a2958f0eb708c36179f6728859a18e60f3 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 15 Sep 2017 14:05:15 +0200 Subject: [PATCH 054/555] scsi: sg: factor out sg_fill_request_table() commit 4759df905a474d245752c9dc94288e779b8734dd upstream. Factor out sg_fill_request_table() for better readability. [mkp: typos, applied by hand] Signed-off-by: Hannes Reinecke Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/sg.c | 61 +++++++++++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 26 deletions(-) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index b7181f390876..1807f22a32cc 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -828,6 +828,40 @@ static int max_sectors_bytes(struct request_queue *q) return max_sectors << 9; } +static void +sg_fill_request_table(Sg_fd *sfp, sg_req_info_t *rinfo) +{ + Sg_request *srp; + int val; + unsigned int ms; + + val = 0; + list_for_each_entry(srp, &sfp->rq_list, entry) { + if (val > SG_MAX_QUEUE) + break; + memset(&rinfo[val], 0, SZ_SG_REQ_INFO); + rinfo[val].req_state = srp->done + 1; + rinfo[val].problem = + srp->header.masked_status & + srp->header.host_status & + srp->header.driver_status; + if (srp->done) + rinfo[val].duration = + srp->header.duration; + else { + ms = jiffies_to_msecs(jiffies); + rinfo[val].duration = + (ms > srp->header.duration) ? + (ms - srp->header.duration) : 0; + } + rinfo[val].orphan = srp->orphan; + rinfo[val].sg_io_owned = srp->sg_io_owned; + rinfo[val].pack_id = srp->header.pack_id; + rinfo[val].usr_ptr = srp->header.usr_ptr; + val++; + } +} + static long sg_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg) { @@ -1022,38 +1056,13 @@ sg_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg) return -EFAULT; else { sg_req_info_t *rinfo; - unsigned int ms; rinfo = kmalloc(SZ_SG_REQ_INFO * SG_MAX_QUEUE, GFP_KERNEL); if (!rinfo) return -ENOMEM; read_lock_irqsave(&sfp->rq_list_lock, iflags); - val = 0; - list_for_each_entry(srp, &sfp->rq_list, entry) { - if (val >= SG_MAX_QUEUE) - break; - memset(&rinfo[val], 0, SZ_SG_REQ_INFO); - rinfo[val].req_state = srp->done + 1; - rinfo[val].problem = - srp->header.masked_status & - srp->header.host_status & - srp->header.driver_status; - if (srp->done) - rinfo[val].duration = - srp->header.duration; - else { - ms = jiffies_to_msecs(jiffies); - rinfo[val].duration = - (ms > srp->header.duration) ? - (ms - srp->header.duration) : 0; - } - rinfo[val].orphan = srp->orphan; - rinfo[val].sg_io_owned = srp->sg_io_owned; - rinfo[val].pack_id = srp->header.pack_id; - rinfo[val].usr_ptr = srp->header.usr_ptr; - val++; - } + sg_fill_request_table(sfp, rinfo); read_unlock_irqrestore(&sfp->rq_list_lock, iflags); result = __copy_to_user(p, rinfo, SZ_SG_REQ_INFO * SG_MAX_QUEUE); From 90cb12f6dc5ac45c51082721ec5bbe18850cf80f Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 15 Sep 2017 14:05:16 +0200 Subject: [PATCH 055/555] scsi: sg: fixup infoleak when using SG_GET_REQUEST_TABLE commit 3e0097499839e0fe3af380410eababe5a47c4cf9 upstream. When calling SG_GET_REQUEST_TABLE ioctl only a half-filled table is returned; the remaining part will then contain stale kernel memory information. This patch zeroes out the entire table to avoid this issue. Signed-off-by: Hannes Reinecke Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Reviewed-by: Eric Dumazet Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/sg.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 1807f22a32cc..02dfbc1373e3 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -839,7 +839,6 @@ sg_fill_request_table(Sg_fd *sfp, sg_req_info_t *rinfo) list_for_each_entry(srp, &sfp->rq_list, entry) { if (val > SG_MAX_QUEUE) break; - memset(&rinfo[val], 0, SZ_SG_REQ_INFO); rinfo[val].req_state = srp->done + 1; rinfo[val].problem = srp->header.masked_status & @@ -1057,8 +1056,8 @@ sg_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg) else { sg_req_info_t *rinfo; - rinfo = kmalloc(SZ_SG_REQ_INFO * SG_MAX_QUEUE, - GFP_KERNEL); + rinfo = kzalloc(SZ_SG_REQ_INFO * SG_MAX_QUEUE, + GFP_KERNEL); if (!rinfo) return -ENOMEM; read_lock_irqsave(&sfp->rq_list_lock, iflags); From 6e2a0259da7a6c772d5bd2e7475616055a39192d Mon Sep 17 00:00:00 2001 From: Joe Carnuccio Date: Wed, 23 Aug 2017 15:04:55 -0700 Subject: [PATCH 056/555] scsi: qla2xxx: Correction to vha->vref_count timeout commit 6e98095f8fb6d98da34c4e6c34e69e7c638d79c0 upstream. Fix incorrect second argument for wait_event_timeout() Fixes: c4a9b538ab2a ("qla2xxx: Allow vref count to timeout on vport delete.") Signed-off-by: Joe Carnuccio Signed-off-by: Himanshu Madhani Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/qla2xxx/qla_mid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/qla2xxx/qla_mid.c b/drivers/scsi/qla2xxx/qla_mid.c index 3dfb54abc874..f8ae70476b3a 100644 --- a/drivers/scsi/qla2xxx/qla_mid.c +++ b/drivers/scsi/qla2xxx/qla_mid.c @@ -74,7 +74,7 @@ qla24xx_deallocate_vp_id(scsi_qla_host_t *vha) * ensures no active vp_list traversal while the vport is removed * from the queue) */ - wait_event_timeout(vha->vref_waitq, atomic_read(&vha->vref_count), + wait_event_timeout(vha->vref_waitq, !atomic_read(&vha->vref_count), 10*HZ); spin_lock_irqsave(&ha->vport_slock, flags); From 2a913aecc4f746ce15eb1bec98b134aff4190ae2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 30 Aug 2017 16:30:35 +0300 Subject: [PATCH 057/555] scsi: qla2xxx: Fix an integer overflow in sysfs code commit e6f77540c067b48dee10f1e33678415bfcc89017 upstream. The value of "size" comes from the user. When we add "start + size" it could lead to an integer overflow bug. It means we vmalloc() a lot more memory than we had intended. I believe that on 64 bit systems vmalloc() can succeed even if we ask it to allocate huge 4GB buffers. So we would get memory corruption and likely a crash when we call ha->isp_ops->write_optrom() and ->read_optrom(). Only root can trigger this bug. Link: https://bugzilla.kernel.org/show_bug.cgi?id=194061 Fixes: b7cc176c9eb3 ("[SCSI] qla2xxx: Allow region-based flash-part accesses.") Reported-by: shqking Signed-off-by: Dan Carpenter Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/qla2xxx/qla_attr.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_attr.c b/drivers/scsi/qla2xxx/qla_attr.c index 8c4641b518b5..9a34afcb1c4c 100644 --- a/drivers/scsi/qla2xxx/qla_attr.c +++ b/drivers/scsi/qla2xxx/qla_attr.c @@ -318,6 +318,8 @@ qla2x00_sysfs_write_optrom_ctl(struct file *filp, struct kobject *kobj, return -EINVAL; if (start > ha->optrom_size) return -EINVAL; + if (size > ha->optrom_size - start) + size = ha->optrom_size - start; mutex_lock(&ha->optrom_mutex); switch (val) { @@ -343,8 +345,7 @@ qla2x00_sysfs_write_optrom_ctl(struct file *filp, struct kobject *kobj, } ha->optrom_region_start = start; - ha->optrom_region_size = start + size > ha->optrom_size ? - ha->optrom_size - start : size; + ha->optrom_region_size = start + size; ha->optrom_state = QLA_SREADING; ha->optrom_buffer = vmalloc(ha->optrom_region_size); @@ -417,8 +418,7 @@ qla2x00_sysfs_write_optrom_ctl(struct file *filp, struct kobject *kobj, } ha->optrom_region_start = start; - ha->optrom_region_size = start + size > ha->optrom_size ? - ha->optrom_size - start : size; + ha->optrom_region_size = start + size; ha->optrom_state = QLA_SWRITING; ha->optrom_buffer = vmalloc(ha->optrom_region_size); From df865f86b008c6b7ef592e8264f8eaabe371505b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 1 Sep 2017 12:04:09 -0400 Subject: [PATCH 058/555] ftrace: Fix selftest goto location on error commit 46320a6acc4fb58f04bcf78c4c942cc43b20f986 upstream. In the second iteration of trace_selftest_ops(), the error goto label is wrong in the case where trace_selftest_test_global_cnt is off. In the case of error, it leaks the dynamic ops that was allocated. Fixes: 95950c2e ("ftrace: Add self-tests for multiple function trace users") Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace_selftest.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index b0f86ea77881..ca70d11b8aa7 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -272,7 +272,7 @@ static int trace_selftest_ops(struct trace_array *tr, int cnt) goto out_free; if (cnt > 1) { if (trace_selftest_test_global_cnt == 0) - goto out; + goto out_free; } if (trace_selftest_test_dyn_cnt == 0) goto out_free; From 100553e197e2c41eccf9fa04b2be9cd11ae21215 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 1 Sep 2017 12:18:28 -0400 Subject: [PATCH 059/555] ftrace: Fix memleak when unregistering dynamic ops when tracing disabled commit edb096e00724f02db5f6ec7900f3bbd465c6c76f upstream. If function tracing is disabled by the user via the function-trace option or the proc sysctl file, and a ftrace_ops that was allocated on the heap is unregistered, then the shutdown code exits out without doing the proper clean up. This was found via kmemleak and running the ftrace selftests, as one of the tests unregisters with function tracing disabled. # cat kmemleak unreferenced object 0xffffffffa0020000 (size 4096): comm "swapper/0", pid 1, jiffies 4294668889 (age 569.209s) hex dump (first 32 bytes): 55 ff 74 24 10 55 48 89 e5 ff 74 24 18 55 48 89 U.t$.UH...t$.UH. e5 48 81 ec a8 00 00 00 48 89 44 24 50 48 89 4c .H......H.D$PH.L backtrace: [] kmemleak_vmalloc+0x85/0xf0 [] __vmalloc_node_range+0x281/0x3e0 [] module_alloc+0x4f/0x90 [] arch_ftrace_update_trampoline+0x160/0x420 [] ftrace_startup+0xe7/0x300 [] register_ftrace_function+0x72/0x90 [] trace_selftest_ops+0x204/0x397 [] trace_selftest_startup_function+0x394/0x624 [] run_tracer_selftest+0x15c/0x1d7 [] init_trace_selftests+0x75/0x192 [] do_one_initcall+0x90/0x1e2 [] kernel_init_freeable+0x350/0x3fe [] kernel_init+0x13/0x122 [] ret_from_fork+0x2a/0x40 [] 0xffffffffffffffff Fixes: 12cce594fa ("ftrace/x86: Allow !CONFIG_PREEMPT dynamic ops to use allocated trampolines") Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- kernel/trace/ftrace.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6e432ed7d0fe..53ed8ae5de1c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2747,13 +2747,14 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) if (!command || !ftrace_enabled) { /* - * If these are per_cpu ops, they still need their - * per_cpu field freed. Since, function tracing is + * If these are dynamic or per_cpu ops, they still + * need their data freed. Since, function tracing is * not currently active, we can just free them * without synchronizing all CPUs. */ - if (ops->flags & FTRACE_OPS_FL_PER_CPU) - per_cpu_ops_free(ops); + if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) + goto free_ops; + return 0; } @@ -2808,6 +2809,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) { schedule_on_each_cpu(ftrace_sync); + free_ops: arch_ftrace_trampoline_free(ops); if (ops->flags & FTRACE_OPS_FL_PER_CPU) From 96cf918df428c16986cf88b3ebac465e04c3f5f6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 5 Sep 2017 11:32:01 -0400 Subject: [PATCH 060/555] tracing: Add barrier to trace_printk() buffer nesting modification commit 3d9622c12c8873911f4cc0ccdabd0362c2fca06b upstream. trace_printk() uses 4 buffers, one for each context (normal, softirq, irq and NMI), such that it does not need to worry about one context preempting the other. There's a nesting counter that gets incremented to figure out which buffer to use. If the context gets preempted by another context which calls trace_printk() it will increment the counter and use the next buffer, and restore the counter when it is finished. The problem is that gcc may optimize the modification of the buffer nesting counter and it may not be incremented in memory before the buffer is used. If this happens, and the context gets interrupted by another context, it could pick the same buffer and corrupt the one that is being used. Compiler barriers need to be added after the nesting variable is incremented and before it is decremented to prevent usage of the context buffers by more than one context at the same time. Cc: Andy Lutomirski Fixes: e2ace00117 ("tracing: Choose static tp_printk buffer by explicit nesting count") Hat-tip-to: Peter Zijlstra Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7379f735a9f4..2508c319ae5f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2369,11 +2369,17 @@ static char *get_trace_buf(void) if (!buffer || buffer->nesting >= 4) return NULL; - return &buffer->buffer[buffer->nesting++][0]; + buffer->nesting++; + + /* Interrupts must see nesting incremented before we use the buffer */ + barrier(); + return &buffer->buffer[buffer->nesting][0]; } static void put_trace_buf(void) { + /* Don't let the decrement of nesting leak before this */ + barrier(); this_cpu_dec(trace_percpu_buffer->nesting); } From cf052336d0d3f360df30a0eedc5ec45c5b2b48d4 Mon Sep 17 00:00:00 2001 From: Baohong Liu Date: Tue, 5 Sep 2017 16:57:19 -0500 Subject: [PATCH 061/555] tracing: Apply trace_clock changes to instance max buffer commit 170b3b1050e28d1ba0700e262f0899ffa4fccc52 upstream. Currently trace_clock timestamps are applied to both regular and max buffers only for global trace. For instance trace, trace_clock timestamps are applied only to regular buffer. But, regular and max buffers can be swapped, for example, following a snapshot. So, for instance trace, bad timestamps can be seen following a snapshot. Let's apply trace_clock timestamps to instance max buffer as well. Link: http://lkml.kernel.org/r/ebdb168d0be042dcdf51f81e696b17fabe3609c1.1504642143.git.tom.zanussi@linux.intel.com Fixes: 277ba0446 ("tracing: Add interface to allow multiple trace buffers") Signed-off-by: Baohong Liu Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2508c319ae5f..f95bf81529f5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5664,7 +5664,7 @@ static int tracing_set_clock(struct trace_array *tr, const char *clockstr) tracing_reset_online_cpus(&tr->trace_buffer); #ifdef CONFIG_TRACER_MAX_TRACE - if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer) + if (tr->max_buffer.buffer) ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func); tracing_reset_online_cpus(&tr->max_buffer); #endif From 57e4f87ebe4682a1f5a78f0c2ffe49e874bd49df Mon Sep 17 00:00:00 2001 From: Jose Abreu Date: Fri, 1 Sep 2017 17:00:23 +0100 Subject: [PATCH 062/555] ARC: Re-enable MMU upon Machine Check exception commit 1ee55a8f7f6b7ca4c0c59e0b4b4e3584a085c2d3 upstream. I recently came upon a scenario where I would get a double fault machine check exception tiriggered by a kernel module. However the ensuing crash stacktrace (ksym lookup) was not working correctly. Turns out that machine check auto-disables MMU while modules are allocated in kernel vaddr spapce. This patch re-enables the MMU before start printing the stacktrace making stacktracing of modules work upon a fatal exception. Signed-off-by: Jose Abreu Reviewed-by: Alexey Brodkin Signed-off-by: Vineet Gupta [vgupta: moved code into low level handler to avoid in 2 places] Signed-off-by: Greg Kroah-Hartman --- arch/arc/kernel/entry.S | 6 ++++++ arch/arc/mm/tlb.c | 3 --- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/arc/kernel/entry.S b/arch/arc/kernel/entry.S index 1eea99beecc3..85d9ea4a0acc 100644 --- a/arch/arc/kernel/entry.S +++ b/arch/arc/kernel/entry.S @@ -92,6 +92,12 @@ ENTRY(EV_MachineCheck) lr r0, [efa] mov r1, sp + ; hardware auto-disables MMU, re-enable it to allow kernel vaddr + ; access for say stack unwinding of modules for crash dumps + lr r3, [ARC_REG_PID] + or r3, r3, MMU_ENABLE + sr r3, [ARC_REG_PID] + lsr r3, r2, 8 bmsk r3, r3, 7 brne r3, ECR_C_MCHK_DUP_TLB, 1f diff --git a/arch/arc/mm/tlb.c b/arch/arc/mm/tlb.c index bdb295e09160..a4dc881da277 100644 --- a/arch/arc/mm/tlb.c +++ b/arch/arc/mm/tlb.c @@ -896,9 +896,6 @@ void do_tlb_overlap_fault(unsigned long cause, unsigned long address, local_irq_save(flags); - /* re-enable the MMU */ - write_aux_reg(ARC_REG_PID, MMU_ENABLE | read_aux_reg(ARC_REG_PID)); - /* loop thru all sets of TLB */ for (set = 0; set < mmu->sets; set++) { From 998a9f51bc74933713b6308b676b941c7b1ce3aa Mon Sep 17 00:00:00 2001 From: Aleksandr Bezzubikov Date: Tue, 18 Jul 2017 17:12:25 +0300 Subject: [PATCH 063/555] PCI: shpchp: Enable bridge bus mastering if MSI is enabled commit 48b79a14505349a29b3e20f03619ada9b33c4b17 upstream. An SHPC may generate MSIs to notify software about slot or controller events (SHPC spec r1.0, sec 4.7). A PCI device can only generate an MSI if it has bus mastering enabled. Enable bus mastering if the bridge contains an SHPC that uses MSI for event notifications. Signed-off-by: Aleksandr Bezzubikov [bhelgaas: changelog] Signed-off-by: Bjorn Helgaas Reviewed-by: Marcel Apfelbaum Acked-by: Michael S. Tsirkin Signed-off-by: Greg Kroah-Hartman --- drivers/pci/hotplug/shpchp_hpc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/pci/hotplug/shpchp_hpc.c b/drivers/pci/hotplug/shpchp_hpc.c index de0ea474fb73..e5824c7b7b6b 100644 --- a/drivers/pci/hotplug/shpchp_hpc.c +++ b/drivers/pci/hotplug/shpchp_hpc.c @@ -1062,6 +1062,8 @@ int shpc_init(struct controller *ctrl, struct pci_dev *pdev) if (rc) { ctrl_info(ctrl, "Can't get msi for the hotplug controller\n"); ctrl_info(ctrl, "Use INTx for the hotplug controller\n"); + } else { + pci_set_master(pdev); } rc = request_irq(ctrl->pci_dev->irq, shpc_isr, IRQF_SHARED, From 2fd62929c88f4cd0e4a8d920b509504249ff67aa Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 1 Aug 2017 03:11:52 -0400 Subject: [PATCH 064/555] PCI: pciehp: Report power fault only once until we clear it commit 7612b3b28c0b900dcbcdf5e9b9747cc20a1e2455 upstream. When a power fault occurs, the power controller sets Power Fault Detected in the Slot Status register, and pciehp_isr() queues an INT_POWER_FAULT event to handle it. It also clears Power Fault Detected, but since nothing has yet changed to correct the power fault, the power controller will likely set it again immediately, which may cause an infinite loop when pcie_isr() rechecks Slot Status. Fix that by masking off Power Fault Detected from new events if the driver hasn't seen the power fault clear from the previous handling attempt. Fixes: fad214b0aa72 ("PCI: pciehp: Process all hotplug events before looking for new ones") Signed-off-by: Keith Busch [bhelgaas: changelog, pull test out and add comment] Signed-off-by: Bjorn Helgaas Cc: Mayurkumar Patel Signed-off-by: Greg Kroah-Hartman --- drivers/pci/hotplug/pciehp_hpc.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c index b57fc6d6e28a..d08dfc8b9ba9 100644 --- a/drivers/pci/hotplug/pciehp_hpc.c +++ b/drivers/pci/hotplug/pciehp_hpc.c @@ -586,6 +586,14 @@ static irqreturn_t pciehp_isr(int irq, void *dev_id) events = status & (PCI_EXP_SLTSTA_ABP | PCI_EXP_SLTSTA_PFD | PCI_EXP_SLTSTA_PDC | PCI_EXP_SLTSTA_CC | PCI_EXP_SLTSTA_DLLSC); + + /* + * If we've already reported a power fault, don't report it again + * until we've done something to handle it. + */ + if (ctrl->power_fault_detected) + events &= ~PCI_EXP_SLTSTA_PFD; + if (!events) return IRQ_NONE; From 4c7f54a0f9778abde87f0a96364c99bbf89e5786 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Thu, 6 Jul 2017 20:45:59 +0200 Subject: [PATCH 065/555] net/netfilter/nf_conntrack_core: Fix net_conntrack_lock() commit 3ef0c7a730de0bae03d86c19570af764fa3c4445 upstream. As we want to remove spin_unlock_wait() and replace it with explicit spin_lock()/spin_unlock() calls, we can use this to simplify the locking. In addition: - Reading nf_conntrack_locks_all needs ACQUIRE memory ordering. - The new code avoids the backwards loop. Only slightly tested, I did not manage to trigger calls to nf_conntrack_all_lock(). V2: With improved comments, to clearly show how the barriers pair. Fixes: b16c29191dc8 ("netfilter: nf_conntrack: use safer way to lock all buckets") Signed-off-by: Manfred Spraul Cc: Alan Stern Cc: Sasha Levin Cc: Pablo Neira Ayuso Cc: netfilter-devel@vger.kernel.org Signed-off-by: Paul E. McKenney Signed-off-by: Greg Kroah-Hartman --- net/netfilter/nf_conntrack_core.c | 52 +++++++++++++++++-------------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 6bd150882ba4..ed9ce7c63252 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -95,19 +95,26 @@ static struct conntrack_gc_work conntrack_gc_work; void nf_conntrack_lock(spinlock_t *lock) __acquires(lock) { + /* 1) Acquire the lock */ spin_lock(lock); - while (unlikely(nf_conntrack_locks_all)) { - spin_unlock(lock); - /* - * Order the 'nf_conntrack_locks_all' load vs. the - * spin_unlock_wait() loads below, to ensure - * that 'nf_conntrack_locks_all_lock' is indeed held: - */ - smp_rmb(); /* spin_lock(&nf_conntrack_locks_all_lock) */ - spin_unlock_wait(&nf_conntrack_locks_all_lock); - spin_lock(lock); - } + /* 2) read nf_conntrack_locks_all, with ACQUIRE semantics + * It pairs with the smp_store_release() in nf_conntrack_all_unlock() + */ + if (likely(smp_load_acquire(&nf_conntrack_locks_all) == false)) + return; + + /* fast path failed, unlock */ + spin_unlock(lock); + + /* Slow path 1) get global lock */ + spin_lock(&nf_conntrack_locks_all_lock); + + /* Slow path 2) get the lock we want */ + spin_lock(lock); + + /* Slow path 3) release the global lock */ + spin_unlock(&nf_conntrack_locks_all_lock); } EXPORT_SYMBOL_GPL(nf_conntrack_lock); @@ -148,28 +155,27 @@ static void nf_conntrack_all_lock(void) int i; spin_lock(&nf_conntrack_locks_all_lock); + nf_conntrack_locks_all = true; - /* - * Order the above store of 'nf_conntrack_locks_all' against - * the spin_unlock_wait() loads below, such that if - * nf_conntrack_lock() observes 'nf_conntrack_locks_all' - * we must observe nf_conntrack_locks[] held: - */ - smp_mb(); /* spin_lock(&nf_conntrack_locks_all_lock) */ - for (i = 0; i < CONNTRACK_LOCKS; i++) { - spin_unlock_wait(&nf_conntrack_locks[i]); + spin_lock(&nf_conntrack_locks[i]); + + /* This spin_unlock provides the "release" to ensure that + * nf_conntrack_locks_all==true is visible to everyone that + * acquired spin_lock(&nf_conntrack_locks[]). + */ + spin_unlock(&nf_conntrack_locks[i]); } } static void nf_conntrack_all_unlock(void) { - /* - * All prior stores must be complete before we clear + /* All prior stores must be complete before we clear * 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock() * might observe the false value but not the entire - * critical section: + * critical section. + * It pairs with the smp_load_acquire() in nf_conntrack_lock() */ smp_store_release(&nf_conntrack_locks_all, false); spin_unlock(&nf_conntrack_locks_all_lock); From 536ab630f4db25fdfe7c46c027686ecad12150e7 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Wed, 16 Aug 2017 14:10:01 +0200 Subject: [PATCH 066/555] s390/mm: fix local TLB flushing vs. detach of an mm address space commit b3e5dc45fd1ec2aa1de6b80008f9295eb17e0659 upstream. The local TLB flushing code keeps an additional mask in the mm.context, the cpu_attach_mask. At the time a global flush of an address space is done the cpu_attach_mask is copied to the mm_cpumask in order to avoid future global flushes in case the mm is used by a single CPU only after the flush. Trouble is that the reset of the mm_cpumask is racy against the detach of an mm address space by switch_mm. The current order is first the global TLB flush and then the copy of the cpu_attach_mask to the mm_cpumask. The order needs to be the other way around. Reviewed-by: Heiko Carstens Signed-off-by: Martin Schwidefsky Signed-off-by: Greg Kroah-Hartman --- arch/s390/include/asm/mmu_context.h | 4 ++-- arch/s390/include/asm/tlbflush.h | 26 +++++--------------------- 2 files changed, 7 insertions(+), 23 deletions(-) diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 515fea5a3fc4..dd359859a3e4 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -93,7 +93,6 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, if (prev == next) return; cpumask_set_cpu(cpu, &next->context.cpu_attach_mask); - cpumask_set_cpu(cpu, mm_cpumask(next)); /* Clear old ASCE by loading the kernel ASCE. */ __ctl_load(S390_lowcore.kernel_asce, 1, 1); __ctl_load(S390_lowcore.kernel_asce, 7, 7); @@ -111,7 +110,7 @@ static inline void finish_arch_post_lock_switch(void) preempt_disable(); while (atomic_read(&mm->context.flush_count)) cpu_relax(); - + cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm)); if (mm->context.flush_mm) __tlb_flush_mm(mm); preempt_enable(); @@ -126,6 +125,7 @@ static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next) { switch_mm(prev, next, current); + cpumask_set_cpu(smp_processor_id(), mm_cpumask(next)); set_user_asce(next); } diff --git a/arch/s390/include/asm/tlbflush.h b/arch/s390/include/asm/tlbflush.h index 39846100682a..043c2aab6622 100644 --- a/arch/s390/include/asm/tlbflush.h +++ b/arch/s390/include/asm/tlbflush.h @@ -43,23 +43,6 @@ static inline void __tlb_flush_global(void) * Flush TLB entries for a specific mm on all CPUs (in case gmap is used * this implicates multiple ASCEs!). */ -static inline void __tlb_flush_full(struct mm_struct *mm) -{ - preempt_disable(); - atomic_inc(&mm->context.flush_count); - if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) { - /* Local TLB flush */ - __tlb_flush_local(); - } else { - /* Global TLB flush */ - __tlb_flush_global(); - /* Reset TLB flush mask */ - cpumask_copy(mm_cpumask(mm), &mm->context.cpu_attach_mask); - } - atomic_dec(&mm->context.flush_count); - preempt_enable(); -} - static inline void __tlb_flush_mm(struct mm_struct *mm) { unsigned long gmap_asce; @@ -71,16 +54,18 @@ static inline void __tlb_flush_mm(struct mm_struct *mm) */ preempt_disable(); atomic_inc(&mm->context.flush_count); + /* Reset TLB flush mask */ + cpumask_copy(mm_cpumask(mm), &mm->context.cpu_attach_mask); + barrier(); gmap_asce = READ_ONCE(mm->context.gmap_asce); if (MACHINE_HAS_IDTE && gmap_asce != -1UL) { if (gmap_asce) __tlb_flush_idte(gmap_asce); __tlb_flush_idte(mm->context.asce); } else { - __tlb_flush_full(mm); + /* Global TLB flush */ + __tlb_flush_global(); } - /* Reset TLB flush mask */ - cpumask_copy(mm_cpumask(mm), &mm->context.cpu_attach_mask); atomic_dec(&mm->context.flush_count); preempt_enable(); } @@ -94,7 +79,6 @@ static inline void __tlb_flush_kernel(void) } #else #define __tlb_flush_global() __tlb_flush_local() -#define __tlb_flush_full(mm) __tlb_flush_local() /* * Flush TLB entries for a specific ASCE on all CPUs. From de4360dd3519629fc0aabe6e87843038b84d75e3 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Thu, 17 Aug 2017 08:15:16 +0200 Subject: [PATCH 067/555] s390/mm: fix race on mm->context.flush_mm commit 60f07c8ec5fae06c23e9fd7bab67dabce92b3414 upstream. The order in __tlb_flush_mm_lazy is to flush TLB first and then clear the mm->context.flush_mm bit. This can lead to missed flushes as the bit can be set anytime, the order needs to be the other way aronud. But this leads to a different race, __tlb_flush_mm_lazy may be called on two CPUs concurrently. If mm->context.flush_mm is cleared first then another CPU can bypass __tlb_flush_mm_lazy although the first CPU has not done the flush yet. In a virtualized environment the time until the flush is finally completed can be arbitrarily long. Add a spinlock to serialize __tlb_flush_mm_lazy and use the function in finish_arch_post_lock_switch as well. Reviewed-by: Heiko Carstens Signed-off-by: Martin Schwidefsky Signed-off-by: Greg Kroah-Hartman --- arch/s390/include/asm/mmu.h | 2 ++ arch/s390/include/asm/mmu_context.h | 4 ++-- arch/s390/include/asm/tlbflush.h | 4 +++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h index bea785d7f853..af85d6b12028 100644 --- a/arch/s390/include/asm/mmu.h +++ b/arch/s390/include/asm/mmu.h @@ -5,6 +5,7 @@ #include typedef struct { + spinlock_t lock; cpumask_t cpu_attach_mask; atomic_t flush_count; unsigned int flush_mm; @@ -25,6 +26,7 @@ typedef struct { } mm_context_t; #define INIT_MM_CONTEXT(name) \ + .context.lock = __SPIN_LOCK_UNLOCKED(name.context.lock), \ .context.pgtable_lock = \ __SPIN_LOCK_UNLOCKED(name.context.pgtable_lock), \ .context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \ diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index dd359859a3e4..f65a708ac395 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -15,6 +15,7 @@ static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { + spin_lock_init(&mm->context.lock); spin_lock_init(&mm->context.pgtable_lock); INIT_LIST_HEAD(&mm->context.pgtable_list); spin_lock_init(&mm->context.gmap_lock); @@ -111,8 +112,7 @@ static inline void finish_arch_post_lock_switch(void) while (atomic_read(&mm->context.flush_count)) cpu_relax(); cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm)); - if (mm->context.flush_mm) - __tlb_flush_mm(mm); + __tlb_flush_mm_lazy(mm); preempt_enable(); } set_fs(current->thread.mm_segment); diff --git a/arch/s390/include/asm/tlbflush.h b/arch/s390/include/asm/tlbflush.h index 043c2aab6622..eed927aeb08f 100644 --- a/arch/s390/include/asm/tlbflush.h +++ b/arch/s390/include/asm/tlbflush.h @@ -96,10 +96,12 @@ static inline void __tlb_flush_kernel(void) static inline void __tlb_flush_mm_lazy(struct mm_struct * mm) { + spin_lock(&mm->context.lock); if (mm->context.flush_mm) { - __tlb_flush_mm(mm); mm->context.flush_mm = 0; + __tlb_flush_mm(mm); } + spin_unlock(&mm->context.lock); } /* From 7717a7378c53a510872db75d65879dc8eaca24b5 Mon Sep 17 00:00:00 2001 From: Daniel Mentz Date: Wed, 2 Aug 2017 23:42:17 -0400 Subject: [PATCH 068/555] media: v4l2-compat-ioctl32: Fix timespec conversion commit 9c7ba1d7634cef490b85bc64c4091ff004821bfd upstream. Certain syscalls like recvmmsg support 64 bit timespec values for the X32 ABI. The helper function compat_put_timespec converts a timespec value to a 32 bit or 64 bit value depending on what ABI is used. The v4l2 compat layer, however, is not designed to support 64 bit timespec values and always uses 32 bit values. Hence, compat_put_timespec must not be used. Without this patch, user space will be provided with bad timestamp values from the VIDIOC_DQEVENT ioctl. Also, fields of the struct v4l2_event32 that come immediately after timestamp get overwritten, namely the field named id. Fixes: 81993e81a994 ("compat: Get rid of (get|put)_compat_time(val|spec)") Cc: H. Peter Anvin Cc: Laurent Pinchart Cc: Tiffany Lin Cc: Ricardo Ribalda Delgado Cc: Sakari Ailus Signed-off-by: Daniel Mentz Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/v4l2-core/v4l2-compat-ioctl32.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c index bacecbd68a6d..dc51dd86377d 100644 --- a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c +++ b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c @@ -773,7 +773,8 @@ static int put_v4l2_event32(struct v4l2_event *kp, struct v4l2_event32 __user *u copy_to_user(&up->u, &kp->u, sizeof(kp->u)) || put_user(kp->pending, &up->pending) || put_user(kp->sequence, &up->sequence) || - compat_put_timespec(&kp->timestamp, &up->timestamp) || + put_user(kp->timestamp.tv_sec, &up->timestamp.tv_sec) || + put_user(kp->timestamp.tv_nsec, &up->timestamp.tv_nsec) || put_user(kp->id, &up->id) || copy_to_user(up->reserved, kp->reserved, 8 * sizeof(__u32))) return -EFAULT; From 38993f320506d7ead26695218ba1481f250469d5 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 8 Aug 2017 08:56:21 -0400 Subject: [PATCH 069/555] media: uvcvideo: Prevent heap overflow when accessing mapped controls commit 7e09f7d5c790278ab98e5f2c22307ebe8ad6e8ba upstream. The size of uvc_control_mapping is user controlled leading to a potential heap overflow in the uvc driver. This adds a check to verify the user provided size fits within the bounds of the defined buffer size. Originally-from: Richard Simmons Signed-off-by: Guenter Roeck Reviewed-by: Laurent Pinchart Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/usb/uvc/uvc_ctrl.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/media/usb/uvc/uvc_ctrl.c b/drivers/media/usb/uvc/uvc_ctrl.c index c2ee6e39fd0c..20397aba6849 100644 --- a/drivers/media/usb/uvc/uvc_ctrl.c +++ b/drivers/media/usb/uvc/uvc_ctrl.c @@ -2002,6 +2002,13 @@ int uvc_ctrl_add_mapping(struct uvc_video_chain *chain, goto done; } + /* Validate the user-provided bit-size and offset */ + if (mapping->size > 32 || + mapping->offset + mapping->size > ctrl->info.size * 8) { + ret = -EINVAL; + goto done; + } + list_for_each_entry(map, &ctrl->info.mappings, list) { if (mapping->id == map->id) { uvc_trace(UVC_TRACE_CONTROL, "Can't add mapping '%s', " From f5c3fd83284fd8add9308281fc9cfc8afc5e8d97 Mon Sep 17 00:00:00 2001 From: Chanwoo Choi Date: Thu, 24 Aug 2017 10:42:48 +0900 Subject: [PATCH 070/555] PM / devfreq: Fix memory leak when fail to register device commit 9e14de1077e9c34f141cf98bdba60cdd5193d962 upstream. When the devfreq_add_device fails to register deivce, the memory leak of devfreq instance happen. So, this patch fix the memory leak issue. Before freeing the devfreq instance checks whether devfreq instance is NULL or not because the device_unregister() frees the devfreq instance when jumping to the 'err_init'. It is to prevent the duplicate the kfee(devfreq). Fixes: ac4b281176a5 ("PM / devfreq: fix duplicated kfree on devfreq pointer") Signed-off-by: Chanwoo Choi Signed-off-by: MyungJoo Ham Signed-off-by: Greg Kroah-Hartman --- drivers/devfreq/devfreq.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c index 7309c0824887..a2449d77af07 100644 --- a/drivers/devfreq/devfreq.c +++ b/drivers/devfreq/devfreq.c @@ -574,7 +574,7 @@ struct devfreq *devfreq_add_device(struct device *dev, err = device_register(&devfreq->dev); if (err) { mutex_unlock(&devfreq->lock); - goto err_out; + goto err_dev; } devfreq->trans_table = devm_kzalloc(&devfreq->dev, sizeof(unsigned int) * @@ -618,6 +618,9 @@ err_init: mutex_unlock(&devfreq_list_lock); device_unregister(&devfreq->dev); +err_dev: + if (devfreq) + kfree(devfreq); err_out: return ERR_PTR(err); } From 2a9b55742a9fe838ddc86f8b5c75a56867ea1912 Mon Sep 17 00:00:00 2001 From: Tang Junhui Date: Thu, 7 Sep 2017 01:28:53 +0800 Subject: [PATCH 071/555] bcache: initialize dirty stripes in flash_dev_run() commit 175206cf9ab63161dec74d9cd7f9992e062491f5 upstream. bcache uses a Proportion-Differentiation Controller algorithm to control writeback rate to cached devices. In the PD controller algorithm, dirty stripes of thin flash device should not be counted in, because flash only volumes never write back dirty data. Currently dirty stripe counter for thin flash device is not initialized when the thin flash device starts. Which means the following calculation in PD controller will reference an undefined dirty stripes number, and all cached devices attached to the same cache set where the thin flash device lies on may have an inaccurate writeback rate. This patch calles bch_sectors_dirty_init() in flash_dev_run(), to correctly initialize dirty stripe counter when the thin flash device starts to run. This patch also does following parameter data type change, -void bch_sectors_dirty_init(struct cached_dev *dc); +void bch_sectors_dirty_init(struct bcache_device *); to call this function conveniently in flash_dev_run(). (Commit log is composed by Coly Li) Signed-off-by: Tang Junhui Reviewed-by: Coly Li Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- drivers/md/bcache/super.c | 3 ++- drivers/md/bcache/writeback.c | 8 ++++---- drivers/md/bcache/writeback.h | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 66669c8f4161..e21df091eac0 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1025,7 +1025,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) } if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { - bch_sectors_dirty_init(dc); + bch_sectors_dirty_init(&dc->disk); atomic_set(&dc->has_dirty, 1); atomic_inc(&dc->count); bch_writeback_queue(dc); @@ -1229,6 +1229,7 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) goto err; bcache_device_attach(d, c, u - c->uuids); + bch_sectors_dirty_init(d); bch_flash_dev_request_init(d); add_disk(d->disk); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index e51644e503a5..04073a7cd3cf 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -482,17 +482,17 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, return MAP_CONTINUE; } -void bch_sectors_dirty_init(struct cached_dev *dc) +void bch_sectors_dirty_init(struct bcache_device *d) { struct sectors_dirty_init op; bch_btree_op_init(&op.op, -1); - op.inode = dc->disk.id; + op.inode = d->id; - bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0), + bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0), sectors_dirty_init_fn, 0); - dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk); + d->sectors_dirty_last = bcache_dev_sectors_dirty(d); } void bch_cached_dev_writeback_init(struct cached_dev *dc) diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 301eaf565167..e09425151932 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -85,7 +85,7 @@ static inline void bch_writeback_add(struct cached_dev *dc) void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int); -void bch_sectors_dirty_init(struct cached_dev *dc); +void bch_sectors_dirty_init(struct bcache_device *); void bch_cached_dev_writeback_init(struct cached_dev *); int bch_cached_dev_writeback_start(struct cached_dev *); From c234e0e77572d78cbac0de7acee2263343595c25 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 6 Sep 2017 14:25:51 +0800 Subject: [PATCH 072/555] bcache: Fix leak of bdev reference commit 4b758df21ee7081ab41448d21d60367efaa625b3 upstream. If blkdev_get_by_path() in register_bcache() fails, we try to lookup the block device using lookup_bdev() to detect which situation we are in to properly report error. However we never drop the reference returned to us from lookup_bdev(). Fix that. Signed-off-by: Jan Kara Acked-by: Coly Li Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- drivers/md/bcache/super.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index e21df091eac0..9997127170ea 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1968,6 +1968,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, else err = "device busy"; mutex_unlock(&bch_register_lock); + if (!IS_ERR(bdev)) + bdput(bdev); if (attr == &ksysfs_register_quiet) goto out; } From 8f51f38883dcff93f55e132d3d8b1c991e809474 Mon Sep 17 00:00:00 2001 From: Tang Junhui Date: Wed, 6 Sep 2017 14:25:53 +0800 Subject: [PATCH 073/555] bcache: do not subtract sectors_to_gc for bypassed IO commit 69daf03adef5f7bc13e0ac86b4b8007df1767aab upstream. Since bypassed IOs use no bucket, so do not subtract sectors_to_gc to trigger gc thread. Signed-off-by: tang.junhui Acked-by: Coly Li Reviewed-by: Eric Wheeler Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- drivers/md/bcache/request.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index a37c1776f2e3..e0f1c6d534fe 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -196,12 +196,12 @@ static void bch_data_insert_start(struct closure *cl) struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); struct bio *bio = op->bio, *n; - if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) - wake_up_gc(op->c); - if (op->bypass) return bch_data_invalidate(cl); + if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) + wake_up_gc(op->c); + /* * Journal writes are marked REQ_PREFLUSH; if the original write was a * flush, it'll wait on the journal write. From e40cb30162d7f9e9ea8aae9dd1b93e1ff30c1bcd Mon Sep 17 00:00:00 2001 From: Tang Junhui Date: Wed, 6 Sep 2017 14:25:56 +0800 Subject: [PATCH 074/555] bcache: correct cache_dirty_target in __update_writeback_rate() commit a8394090a9129b40f9d90dcb7f4a49d60c727ca6 upstream. __update_write_rate() uses a Proportion-Differentiation Controller algorithm to control writeback rate. A dirty target number is used in this PD controller to control writeback rate. A larger target number will make the writeback rate smaller, on the versus, a smaller target number will make the writeback rate larger. bcache uses the following steps to calculate the target number, 1) cache_sectors = all-buckets-of-cache-set * buckets-size 2) cache_dirty_target = cache_sectors * cached-device-writeback_percent 3) target = cache_dirty_target * (sectors-of-cached-device/sectors-of-all-cached-devices-of-this-cache-set) The calculation at step 1) for cache_sectors is incorrect, which does not consider dirty blocks occupied by flash only volume. A flash only volume can be took as a bcache device without cached device. All data sectors allocated for it are persistent on cache device and marked dirty, they are not touched by bcache writeback and garbage collection code. So data blocks of flash only volume should be ignore when calculating cache_sectors of cache set. Current code does not subtract dirty sectors of flash only volume, which results a larger target number from the above 3 steps. And in sequence the cache device's writeback rate is smaller then a correct value, writeback speed is slower on all cached devices. This patch fixes the incorrect slower writeback rate by subtracting dirty sectors of flash only volumes in __update_writeback_rate(). (Commit log composed by Coly Li to pass checkpatch.pl checking) Signed-off-by: Tang Junhui Reviewed-by: Coly Li Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- drivers/md/bcache/writeback.c | 3 ++- drivers/md/bcache/writeback.h | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 04073a7cd3cf..8f7bd4755a85 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -20,7 +20,8 @@ static void __update_writeback_rate(struct cached_dev *dc) { struct cache_set *c = dc->disk.c; - uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size; + uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size - + bcache_flash_devs_sectors_dirty(c); uint64_t cache_dirty_target = div_u64(cache_sectors * dc->writeback_percent, 100); diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index e09425151932..cdf8d253209e 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -14,6 +14,25 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) return ret; } +static inline uint64_t bcache_flash_devs_sectors_dirty(struct cache_set *c) +{ + uint64_t i, ret = 0; + + mutex_lock(&bch_register_lock); + + for (i = 0; i < c->nr_uuids; i++) { + struct bcache_device *d = c->devices[i]; + + if (!d || !UUID_FLASH_ONLY(&c->uuids[i])) + continue; + ret += bcache_dev_sectors_dirty(d); + } + + mutex_unlock(&bch_register_lock); + + return ret; +} + static inline unsigned offset_to_stripe(struct bcache_device *d, uint64_t offset) { From fa92ff6b77a1ac597c2c8bb973c8a9c0191c108a Mon Sep 17 00:00:00 2001 From: Tony Asleson Date: Wed, 6 Sep 2017 14:25:57 +0800 Subject: [PATCH 075/555] bcache: Correct return value for sysfs attach errors commit 77fa100f27475d08a569b9d51c17722130f089e7 upstream. If you encounter any errors in bch_cached_dev_attach it will return a negative error code. The variable 'v' which stores the result is unsigned, thus user space sees a very large value returned for bytes written which can cause incorrect user space behavior. Utilize 1 signed variable to use throughout the function to preserve error return capability. Signed-off-by: Tony Asleson Acked-by: Coly Li Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- drivers/md/bcache/sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index b3ff57d61dde..4fbb5532f24c 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -191,7 +191,7 @@ STORE(__cached_dev) { struct cached_dev *dc = container_of(kobj, struct cached_dev, disk.kobj); - unsigned v = size; + ssize_t v = size; struct cache_set *c; struct kobj_uevent_env *env; @@ -226,7 +226,7 @@ STORE(__cached_dev) bch_cached_dev_run(dc); if (attr == &sysfs_cache_mode) { - ssize_t v = bch_read_string_list(buf, bch_cache_modes + 1); + v = bch_read_string_list(buf, bch_cache_modes + 1); if (v < 0) return v; From 57aa1a6967b28431d62beec299125a969a469f3f Mon Sep 17 00:00:00 2001 From: Tang Junhui Date: Wed, 6 Sep 2017 14:25:59 +0800 Subject: [PATCH 076/555] bcache: fix for gc and write-back race commit 9baf30972b5568d8b5bc8b3c46a6ec5b58100463 upstream. gc and write-back get raced (see the email "bcache get stucked" I sended before): gc thread write-back thread | |bch_writeback_thread() |bch_gc_thread() | | |==>read_dirty() |==>bch_btree_gc() | |==>btree_root() //get btree root | | //node write locker | |==>bch_btree_gc_root() | | |==>read_dirty_submit() | |==>write_dirty() | |==>continue_at(cl, | | write_dirty_finish, | | system_wq); | |==>write_dirty_finish()//excute | | //in system_wq | |==>bch_btree_insert() | |==>bch_btree_map_leaf_nodes() | |==>__bch_btree_map_nodes() | |==>btree_root //try to get btree | | //root node read | | //lock | |-----stuck here |==>bch_btree_set_root() |==>bch_journal_meta() |==>bch_journal() |==>journal_try_write() |==>journal_write_unlocked() //journal_full(&c->journal) | //condition satisfied |==>continue_at(cl, journal_write, system_wq); //try to excute | //journal_write in system_wq | //but work queue is excuting | //write_dirty_finish() |==>closure_sync(); //wait journal_write execute | //over and wake up gc, |-------------stuck here |==>release root node write locker This patch alloc a separate work-queue for write-back thread to avoid such race. (Commit log re-organized by Coly Li to pass checkpatch.pl checking) Signed-off-by: Tang Junhui Acked-by: Coly Li Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- drivers/md/bcache/bcache.h | 1 + drivers/md/bcache/super.c | 2 ++ drivers/md/bcache/writeback.c | 9 +++++++-- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index c3ea03c9a1a8..02619cabda8b 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -333,6 +333,7 @@ struct cached_dev { /* Limit number of writeback bios in flight */ struct semaphore in_flight; struct task_struct *writeback_thread; + struct workqueue_struct *writeback_write_wq; struct keybuf writeback_keys; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 9997127170ea..f4557f558b24 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1058,6 +1058,8 @@ static void cached_dev_free(struct closure *cl) cancel_delayed_work_sync(&dc->writeback_rate_update); if (!IS_ERR_OR_NULL(dc->writeback_thread)) kthread_stop(dc->writeback_thread); + if (dc->writeback_write_wq) + destroy_workqueue(dc->writeback_write_wq); mutex_lock(&bch_register_lock); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 8f7bd4755a85..4ce2b19fe120 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -187,7 +187,7 @@ static void write_dirty(struct closure *cl) closure_bio_submit(&io->bio, cl); - continue_at(cl, write_dirty_finish, system_wq); + continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq); } static void read_dirty_endio(struct bio *bio) @@ -207,7 +207,7 @@ static void read_dirty_submit(struct closure *cl) closure_bio_submit(&io->bio, cl); - continue_at(cl, write_dirty, system_wq); + continue_at(cl, write_dirty, io->dc->writeback_write_wq); } static void read_dirty(struct cached_dev *dc) @@ -517,6 +517,11 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) int bch_cached_dev_writeback_start(struct cached_dev *dc) { + dc->writeback_write_wq = alloc_workqueue("bcache_writeback_wq", + WQ_MEM_RECLAIM, 0); + if (!dc->writeback_write_wq) + return -ENOMEM; + dc->writeback_thread = kthread_create(bch_writeback_thread, dc, "bcache_writeback"); if (IS_ERR(dc->writeback_thread)) From 08f75f2c525d786b348dca568761dfde8d6b0d5c Mon Sep 17 00:00:00 2001 From: Michael Lyle Date: Wed, 6 Sep 2017 14:26:02 +0800 Subject: [PATCH 077/555] bcache: fix bch_hprint crash and improve output commit 9276717b9e297a62d1151a43d1cd286213f68eb7 upstream. Most importantly, solve a crash where %llu was used to format signed numbers. This would cause a buffer overflow when reading sysfs writeback_rate_debug, as only 20 bytes were allocated for this and %llu writes 20 characters plus a null. Always use the units mechanism rather than having different output paths for simplicity. Also, correct problems with display output where 1.10 was a larger number than 1.09, by multiplying by 10 and then dividing by 1024 instead of dividing by 100. (Remainders of >= 1000 would print as .10). Minor changes: Always display the decimal point instead of trying to omit it based on number of digits shown. Decide what units to use based on 1000 as a threshold, not 1024 (in other words, always print at most 3 digits before the decimal point). Signed-off-by: Michael Lyle Reported-by: Dmitry Yu Okunev Acked-by: Kent Overstreet Reviewed-by: Coly Li Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- drivers/md/bcache/util.c | 42 +++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index dde6172f3f10..eb70f6894f05 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -73,24 +73,44 @@ STRTO_H(strtouint, unsigned int) STRTO_H(strtoll, long long) STRTO_H(strtoull, unsigned long long) +/** + * bch_hprint() - formats @v to human readable string for sysfs. + * + * @v - signed 64 bit integer + * @buf - the (at least 8 byte) buffer to format the result into. + * + * Returns the number of bytes used by format. + */ ssize_t bch_hprint(char *buf, int64_t v) { static const char units[] = "?kMGTPEZY"; - char dec[4] = ""; - int u, t = 0; + int u = 0, t; - for (u = 0; v >= 1024 || v <= -1024; u++) { - t = v & ~(~0 << 10); - v >>= 10; - } + uint64_t q; - if (!u) - return sprintf(buf, "%llu", v); + if (v < 0) + q = -v; + else + q = v; - if (v < 100 && v > -100) - snprintf(dec, sizeof(dec), ".%i", t / 100); + /* For as long as the number is more than 3 digits, but at least + * once, shift right / divide by 1024. Keep the remainder for + * a digit after the decimal point. + */ + do { + u++; - return sprintf(buf, "%lli%s%c", v, dec, units[u]); + t = q & ~(~0 << 10); + q >>= 10; + } while (q >= 1000); + + if (v < 0) + /* '-', up to 3 digits, '.', 1 digit, 1 character, null; + * yields 8 bytes. + */ + return sprintf(buf, "-%llu.%i%c", q, t * 10 / 1024, units[u]); + else + return sprintf(buf, "%llu.%i%c", q, t * 10 / 1024, units[u]); } ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[], From d59dabdc4cb380b79c965af28cd4ba001f04834b Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 27 Sep 2017 14:40:05 +0200 Subject: [PATCH 078/555] Linux 4.9.52 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b48aebbe187f..c53de1e38c6a 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 9 -SUBLEVEL = 51 +SUBLEVEL = 52 EXTRAVERSION = NAME = Roaring Lionus From 9b2b08179641b27615fb242bd41f8e0e537cb4f1 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 3 Aug 2017 13:28:22 +0200 Subject: [PATCH 079/555] UPSTREAM: ipv6: fib: Unlink replaced routes from their nodes When a route is deleted its node pointer is set to NULL to indicate it's no longer linked to its node. Do the same for routes that are replaced. This will later allow us to test if a route is still in the FIB by checking its node pointer instead of its reference count. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller Signed-off-by: Chenbo Feng Bug: 64978549 Cherry-pick from: 7483cea79957312e9f8e9cf760a1bc5d6c507113 Change-Id: Id92cba07ffa444522828b7f6760dc62c1899e7f2 --- net/ipv6/ip6_fib.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 5da864997495..7ba9a6ebf19d 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -914,6 +914,7 @@ add: fn->fn_flags |= RTN_RTINFO; } nsiblings = iter->rt6i_nsiblings; + iter->rt6i_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (fn->rr_ptr == iter) fn->rr_ptr = NULL; @@ -928,6 +929,7 @@ add: break; if (rt6_qualify_for_ecmp(iter)) { *ins = iter->dst.rt6_next; + iter->rt6i_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (fn->rr_ptr == iter) fn->rr_ptr = NULL; From 291d9682291372a0592893b0b4bf312b896eea7e Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Mon, 25 Sep 2017 08:55:09 -0700 Subject: [PATCH 080/555] FROMLIST: binder: fix use-after-free in binder_transaction() (from https://patchwork.kernel.org/patch/9978801/) User-space normally keeps the node alive when creating a transaction since it has a reference to the target. The local strong ref keeps it alive if the sending process dies before the target process processes the transaction. If the source process is malicious or has a reference counting bug, this can fail. In this case, when we attempt to decrement the node in the failure path, the node has already been freed. This is fixed by taking a tmpref on the node while constructing the transaction. To avoid re-acquiring the node lock and inner proc lock to increment the proc's tmpref, a helper is used that does the ref increments on both the node and proc. Bug: 66899329 Change-Id: Iad40e1e0bccee88234900494fb52a510a37fe8d7 Signed-off-by: Todd Kjos --- drivers/android/binder.c | 95 ++++++++++++++++++++++++++++------------ 1 file changed, 67 insertions(+), 28 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 1ac8008fb54d..fcf85be2674d 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2743,6 +2743,48 @@ static bool binder_proc_transaction(struct binder_transaction *t, return true; } +/** + * binder_get_node_refs_for_txn() - Get required refs on node for txn + * @node: struct binder_node for which to get refs + * @proc: returns @node->proc if valid + * @error: if no @proc then returns BR_DEAD_REPLY + * + * User-space normally keeps the node alive when creating a transaction + * since it has a reference to the target. The local strong ref keeps it + * alive if the sending process dies before the target process processes + * the transaction. If the source process is malicious or has a reference + * counting bug, relying on the local strong ref can fail. + * + * Since user-space can cause the local strong ref to go away, we also take + * a tmpref on the node to ensure it survives while we are constructing + * the transaction. We also need a tmpref on the proc while we are + * constructing the transaction, so we take that here as well. + * + * Return: The target_node with refs taken or NULL if no @node->proc is NULL. + * Also sets @proc if valid. If the @node->proc is NULL indicating that the + * target proc has died, @error is set to BR_DEAD_REPLY + */ +static struct binder_node *binder_get_node_refs_for_txn( + struct binder_node *node, + struct binder_proc **procp, + uint32_t *error) +{ + struct binder_node *target_node = NULL; + + binder_node_inner_lock(node); + if (node->proc) { + target_node = node; + binder_inc_node_nilocked(node, 1, 0, NULL); + binder_inc_node_tmpref_ilocked(node); + node->proc->tmp_ref++; + *procp = node->proc; + } else + *error = BR_DEAD_REPLY; + binder_node_inner_unlock(node); + + return target_node; +} + static void binder_transaction(struct binder_proc *proc, struct binder_thread *thread, struct binder_transaction_data *tr, int reply, @@ -2845,43 +2887,35 @@ static void binder_transaction(struct binder_proc *proc, ref = binder_get_ref_olocked(proc, tr->target.handle, true); if (ref) { - binder_inc_node(ref->node, 1, 0, NULL); - target_node = ref->node; + target_node = binder_get_node_refs_for_txn( + ref->node, &target_proc, + &return_error); + } else { + binder_user_error("%d:%d got transaction to invalid handle\n", + proc->pid, thread->pid); + return_error = BR_FAILED_REPLY; } binder_proc_unlock(proc); - if (target_node == NULL) { - binder_user_error("%d:%d got transaction to invalid handle\n", - proc->pid, thread->pid); - return_error = BR_FAILED_REPLY; - return_error_param = -EINVAL; - return_error_line = __LINE__; - goto err_invalid_target_handle; - } } else { mutex_lock(&context->context_mgr_node_lock); target_node = context->binder_context_mgr_node; - if (target_node == NULL) { + if (target_node) + target_node = binder_get_node_refs_for_txn( + target_node, &target_proc, + &return_error); + else return_error = BR_DEAD_REPLY; - mutex_unlock(&context->context_mgr_node_lock); - return_error_line = __LINE__; - goto err_no_context_mgr_node; - } - binder_inc_node(target_node, 1, 0, NULL); mutex_unlock(&context->context_mgr_node_lock); } - e->to_node = target_node->debug_id; - binder_node_lock(target_node); - target_proc = target_node->proc; - if (target_proc == NULL) { - binder_node_unlock(target_node); - return_error = BR_DEAD_REPLY; + if (!target_node) { + /* + * return_error is set above + */ + return_error_param = -EINVAL; return_error_line = __LINE__; goto err_dead_binder; } - binder_inner_proc_lock(target_proc); - target_proc->tmp_ref++; - binder_inner_proc_unlock(target_proc); - binder_node_unlock(target_node); + e->to_node = target_node->debug_id; if (security_binder_transaction(proc->tsk, target_proc->tsk) < 0) { return_error = BR_FAILED_REPLY; @@ -3240,6 +3274,8 @@ static void binder_transaction(struct binder_proc *proc, if (target_thread) binder_thread_dec_tmpref(target_thread); binder_proc_dec_tmpref(target_proc); + if (target_node) + binder_dec_node_tmpref(target_node); /* * write barrier to synchronize with initialization * of log entry @@ -3259,6 +3295,8 @@ err_bad_parent: err_copy_data_failed: trace_binder_transaction_failed_buffer_release(t->buffer); binder_transaction_buffer_release(target_proc, t->buffer, offp); + if (target_node) + binder_dec_node_tmpref(target_node); target_node = NULL; t->buffer->transaction = NULL; binder_alloc_free_buf(&target_proc->alloc, t->buffer); @@ -3273,13 +3311,14 @@ err_bad_call_stack: err_empty_call_stack: err_dead_binder: err_invalid_target_handle: -err_no_context_mgr_node: if (target_thread) binder_thread_dec_tmpref(target_thread); if (target_proc) binder_proc_dec_tmpref(target_proc); - if (target_node) + if (target_node) { binder_dec_node(target_node, 1, 0); + binder_dec_node_tmpref(target_node); + } binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, "%d:%d transaction failed %d/%d, size %lld-%lld line %d\n", From 96a28fcc7c92caac76a146f4327cbfffd068990b Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Wed, 27 Sep 2017 17:18:48 -0700 Subject: [PATCH 081/555] ANDROID: add script to fetch android kernel config fragments The Android kernel config fragments now live in a separate repository. To prevent others from having to search for this location, add a script to fetch and unpack the fragments. Update .gitignore to include these fragments. Change-Id: If2d4a59b86e4573b0a9b3190025dfe4191870b46 Signed-off-by: Steve Muckle --- .gitignore | 3 +++ kernel/configs/android-fetch-configs.sh | 4 ++++ 2 files changed, 7 insertions(+) create mode 100755 kernel/configs/android-fetch-configs.sh diff --git a/.gitignore b/.gitignore index c2ed4ecb0acd..d47ecbb26d72 100644 --- a/.gitignore +++ b/.gitignore @@ -114,3 +114,6 @@ all.config # Kdevelop4 *.kdev4 + +# fetched Android config fragments +kernel/configs/android-*.cfg diff --git a/kernel/configs/android-fetch-configs.sh b/kernel/configs/android-fetch-configs.sh new file mode 100755 index 000000000000..a5b56d4908dc --- /dev/null +++ b/kernel/configs/android-fetch-configs.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +curl https://android.googlesource.com/kernel/configs/+archive/master/android-4.9.tar.gz | tar xzv + From 9a7bc3f0c76a49c71f507418c172f2af733ddea1 Mon Sep 17 00:00:00 2001 From: Shu Wang Date: Thu, 7 Sep 2017 16:03:27 +0800 Subject: [PATCH 082/555] cifs: release cifs root_cred after exit_cifs commit 94183331e815617246b1baa97e0916f358c794bb upstream. memory leak was found by kmemleak. exit_cifs_spnego should be called before cifs module removed, or cifs root_cred will not be released. kmemleak report: unreferenced object 0xffff880070a3ce40 (size 192): backtrace: kmemleak_alloc+0x4a/0xa0 kmem_cache_alloc+0xc7/0x1d0 prepare_kernel_cred+0x20/0x120 init_cifs_spnego+0x2d/0x170 [cifs] 0xffffffffc07801f3 do_one_initcall+0x51/0x1b0 do_init_module+0x60/0x1fd load_module+0x161e/0x1b60 SYSC_finit_module+0xa9/0x100 SyS_finit_module+0xe/0x10 Signed-off-by: Shu Wang Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg Signed-off-by: Greg Kroah-Hartman --- fs/cifs/cifsfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index c0c253005b76..87658f63b374 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1360,7 +1360,7 @@ exit_cifs(void) exit_cifs_idmap(); #endif #ifdef CONFIG_CIFS_UPCALL - unregister_key_type(&cifs_spnego_key_type); + exit_cifs_spnego(); #endif cifs_destroy_request_bufs(); cifs_destroy_mids(); From b6a77c7ba6741240d7bbf5b520e6eb93d3a5b211 Mon Sep 17 00:00:00 2001 From: Shu Wang Date: Fri, 8 Sep 2017 18:48:33 +0800 Subject: [PATCH 083/555] cifs: release auth_key.response for reconnect. commit f5c4ba816315d3b813af16f5571f86c8d4e897bd upstream. There is a race that cause cifs reconnect in cifs_mount, - cifs_mount - cifs_get_tcp_session - [ start thread cifs_demultiplex_thread - cifs_read_from_socket: -ECONNABORTED - DELAY_WORK smb2_reconnect_server ] - cifs_setup_session - [ smb2_reconnect_server ] auth_key.response was allocated in cifs_setup_session, and will release when the session destoried. So when session re- connect, auth_key.response should be check and released. Tested with my system: CIFS VFS: Free previous auth_key.response = ffff8800320bbf80 A simple auth_key.response allocation call trace: - cifs_setup_session - SMB2_sess_setup - SMB2_sess_auth_rawntlmssp_authenticate - build_ntlmssp_auth_blob - setup_ntlmv2_rsp Signed-off-by: Shu Wang Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg Signed-off-by: Greg Kroah-Hartman --- fs/cifs/connect.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 1a545695f547..f6712b6128d8 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -4071,6 +4071,14 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, cifs_dbg(FYI, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d\n", server->sec_mode, server->capabilities, server->timeAdj); + if (ses->auth_key.response) { + cifs_dbg(VFS, "Free previous auth_key.response = %p\n", + ses->auth_key.response); + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; + ses->auth_key.len = 0; + } + if (server->ops->sess_setup) rc = server->ops->sess_setup(xid, ses, nls_info); From 9ad15a25669ec8c120e910cf18620af2455cb6df Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 14 Sep 2017 11:42:17 +0200 Subject: [PATCH 084/555] fs/proc: Report eip/esp in /prod/PID/stat for coredumping commit fd7d56270b526ca3ed0c224362e3c64a0f86687a upstream. Commit 0a1eb2d474ed ("fs/proc: Stop reporting eip and esp in /proc/PID/stat") stopped reporting eip/esp because it is racy and dangerous for executing tasks. The comment adds: As far as I know, there are no use programs that make any material use of these fields, so just get rid of them. However, existing userspace core-dump-handler applications (for example, minicoredumper) are using these fields since they provide an excellent cross-platform interface to these valuable pointers. So that commit introduced a user space visible regression. Partially revert the change and make the readout possible for tasks with the proper permissions and only if the target task has the PF_DUMPCORE flag set. Fixes: 0a1eb2d474ed ("fs/proc: Stop reporting eip and esp in> /proc/PID/stat") Reported-by: Marco Felsch Signed-off-by: John Ogness Reviewed-by: Andy Lutomirski Cc: Tycho Andersen Cc: Kees Cook Cc: Peter Zijlstra Cc: Brian Gerst Cc: Tetsuo Handa Cc: Borislav Petkov Cc: Al Viro Cc: Linux API Cc: Andrew Morton Cc: Linus Torvalds Link: http://lkml.kernel.org/r/87poatfwg6.fsf@linutronix.de Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- fs/proc/array.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/proc/array.c b/fs/proc/array.c index 81818adb8e9e..c932ec454625 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include #include @@ -416,7 +417,15 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, * esp and eip are intentionally zeroed out. There is no * non-racy way to read them without freezing the task. * Programs that need reliable values can use ptrace(2). + * + * The only exception is if the task is core dumping because + * a program is not able to use ptrace(2) in that case. It is + * safe because the task has stopped executing permanently. */ + if (permitted && (task->flags & PF_DUMPCORE)) { + eip = KSTK_EIP(task); + esp = KSTK_ESP(task); + } } get_task_comm(tcomm, task); From 59862b0429d98f958c44a9e51cb317773ee28ba9 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 22 Jun 2017 12:20:30 +0200 Subject: [PATCH 085/555] mac80211: fix VLAN handling with TXQs commit 53168215909281a09d3afc6fb51a9d4f81f74d39 upstream. With TXQs, the AP_VLAN interfaces are resolved to their owner AP interface when enqueuing the frame, which makes sense since the frame really goes out on that as far as the driver is concerned. However, this introduces a problem: frames to be encrypted with a VLAN-specific GTK will now be encrypted with the AP GTK, since the information about which virtual interface to use to select the key is taken from the TXQ. Fix this by preserving info->control.vif and using that in the dequeue function. This now requires doing the driver-mapping in the dequeue as well. Since there's no way to filter the frames that are sitting on a TXQ, drop all frames, which may affect other interfaces, when an AP_VLAN is removed. Signed-off-by: Johannes Berg Signed-off-by: Greg Kroah-Hartman --- include/net/mac80211.h | 15 ++------------- net/mac80211/iface.c | 17 +++++++++++++++-- net/mac80211/tx.c | 36 +++++++++++++++++++++++++++++------- 3 files changed, 46 insertions(+), 22 deletions(-) diff --git a/include/net/mac80211.h b/include/net/mac80211.h index e2dba93e374f..2c7d876e2a1a 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -902,21 +902,10 @@ struct ieee80211_tx_info { unsigned long jiffies; }; /* NB: vif can be NULL for injected frames */ - union { - /* NB: vif can be NULL for injected frames */ - struct ieee80211_vif *vif; - - /* When packets are enqueued on txq it's easy - * to re-construct the vif pointer. There's no - * more space in tx_info so it can be used to - * store the necessary enqueue time for packet - * sojourn time computation. - */ - codel_time_t enqueue_time; - }; + struct ieee80211_vif *vif; struct ieee80211_key_conf *hw_key; u32 flags; - /* 4 bytes free */ + codel_time_t enqueue_time; } control; struct { u64 cookie; diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 37bec0f864b7..a7aa54f45e19 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -791,6 +791,7 @@ static int ieee80211_open(struct net_device *dev) static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_down) { + struct ieee80211_sub_if_data *txq_sdata = sdata; struct ieee80211_local *local = sdata->local; struct fq *fq = &local->fq; unsigned long flags; @@ -931,6 +932,9 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, switch (sdata->vif.type) { case NL80211_IFTYPE_AP_VLAN: + txq_sdata = container_of(sdata->bss, + struct ieee80211_sub_if_data, u.ap); + mutex_lock(&local->mtx); list_del(&sdata->u.vlan.list); mutex_unlock(&local->mtx); @@ -1001,8 +1005,17 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, } spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); - if (sdata->vif.txq) { - struct txq_info *txqi = to_txq_info(sdata->vif.txq); + if (txq_sdata->vif.txq) { + struct txq_info *txqi = to_txq_info(txq_sdata->vif.txq); + + /* + * FIXME FIXME + * + * We really shouldn't purge the *entire* txqi since that + * contains frames for the other AP_VLANs (and possibly + * the AP itself) as well, but there's no API in FQ now + * to be able to filter. + */ spin_lock_bh(&fq->lock); ieee80211_txq_purge(local, txqi); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index dd190ff3daea..274c564bd9af 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1277,11 +1277,6 @@ static void ieee80211_set_skb_enqueue_time(struct sk_buff *skb) IEEE80211_SKB_CB(skb)->control.enqueue_time = codel_get_time(); } -static void ieee80211_set_skb_vif(struct sk_buff *skb, struct txq_info *txqi) -{ - IEEE80211_SKB_CB(skb)->control.vif = txqi->txq.vif; -} - static u32 codel_skb_len_func(const struct sk_buff *skb) { return skb->len; @@ -3388,6 +3383,7 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, struct ieee80211_tx_info *info; struct ieee80211_tx_data tx; ieee80211_tx_result r; + struct ieee80211_vif *vif; spin_lock_bh(&fq->lock); @@ -3404,8 +3400,6 @@ begin: if (!skb) goto out; - ieee80211_set_skb_vif(skb, txqi); - hdr = (struct ieee80211_hdr *)skb->data; info = IEEE80211_SKB_CB(skb); @@ -3462,6 +3456,34 @@ begin: } } + switch (tx.sdata->vif.type) { + case NL80211_IFTYPE_MONITOR: + if (tx.sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) { + vif = &tx.sdata->vif; + break; + } + tx.sdata = rcu_dereference(local->monitor_sdata); + if (tx.sdata) { + vif = &tx.sdata->vif; + info->hw_queue = + vif->hw_queue[skb_get_queue_mapping(skb)]; + } else if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) { + ieee80211_free_txskb(&local->hw, skb); + goto begin; + } else { + vif = NULL; + } + break; + case NL80211_IFTYPE_AP_VLAN: + tx.sdata = container_of(tx.sdata->bss, + struct ieee80211_sub_if_data, u.ap); + /* fall through */ + default: + vif = &tx.sdata->vif; + break; + } + + IEEE80211_SKB_CB(skb)->control.vif = vif; out: spin_unlock_bh(&fq->lock); From e7e0f0dda28b7b3b183e5c2cdb0db0f0230d285d Mon Sep 17 00:00:00 2001 From: Beni Lev Date: Tue, 25 Jul 2017 11:25:25 +0300 Subject: [PATCH 086/555] mac80211_hwsim: Use proper TX power commit 9de981f507474f326e42117858dc9a9321331ae5 upstream. In struct ieee80211_tx_info, control.vif pointer and rate_driver_data[0] falls on the same place, depending on the union usage. During the whole TX process, the union is referred to as a control struct, which holds the vif that is later used in the tx flow, especially in order to derive the used tx power. Referring direcly to rate_driver_data[0] and assigning a value to it, overwrites the vif pointer, hence making all later references irrelevant. Moreover, rate_driver_data[0] isn't used later in the flow in order to retrieve the channel that it is pointing to. Signed-off-by: Beni Lev Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/mac80211_hwsim.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 0fd7d7ed07ce..c06932c5ecdb 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -1357,8 +1357,6 @@ static void mac80211_hwsim_tx(struct ieee80211_hw *hw, txi->control.rates, ARRAY_SIZE(txi->control.rates)); - txi->rate_driver_data[0] = channel; - if (skb->len >= 24 + 8 && ieee80211_is_probe_resp(hdr->frame_control)) { /* fake header transmission time */ From e167b4ad529b4753fa225c28faf4163cce22806a Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Fri, 18 Aug 2017 15:33:57 +0300 Subject: [PATCH 087/555] mac80211: flush hw_roc_start work before cancelling the ROC commit 6e46d8ce894374fc135c96a8d1057c6af1fef237 upstream. When HW ROC is supported it is possible that after the HW notified that the ROC has started, the ROC was cancelled and another ROC was added while the hw_roc_start worker is waiting on the mutex (since cancelling the ROC and adding another one also holds the same mutex). As a result, the hw_roc_start worker will continue to run after the new ROC is added but before it is actually started by the HW. This may result in notifying userspace that the ROC has started before it actually does, or in case of management tx ROC, in an attempt to tx while not on the right channel. In addition, when the driver will notify mac80211 that the second ROC has started, mac80211 will warn that this ROC has already been notified. Fix this by flushing the hw_roc_start work before cancelling an ROC. Cc: stable@vger.kernel.org Signed-off-by: Avraham Stern Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg Signed-off-by: Greg Kroah-Hartman --- net/mac80211/offchannel.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index eede5c6db8d5..30bba53c2992 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -707,6 +707,8 @@ static int ieee80211_cancel_roc(struct ieee80211_local *local, if (!cookie) return -ENOENT; + flush_work(&local->hw_roc_start); + mutex_lock(&local->mtx); list_for_each_entry_safe(roc, tmp, &local->roc_list, list) { if (!mgmt_tx && roc->cookie != cookie) From 3d5960c8c657702bc722f0e801e24487f040980c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 Sep 2017 10:12:20 +0200 Subject: [PATCH 088/555] genirq: Make sparse_irq_lock protect what it should protect commit 12ac1d0f6c3e95732d144ffa65c8b20fbd9aa462 upstream. for_each_active_irq() iterates the sparse irq allocation bitmap. The caller must hold sparse_irq_lock. Several code pathes expect that an active bit in the sparse bitmap also has a valid interrupt descriptor. Unfortunately that's not true. The (de)allocation is a two step process, which holds the sparse_irq_lock only across the queue/remove from the radix tree and the set/clear in the allocation bitmap. If a iteration locks sparse_irq_lock between the two steps, then it might see an active bit but the corresponding irq descriptor is NULL. If that is dereferenced unconditionally, then the kernel oopses. Of course, all iterator sites could be audited and fixed, but.... There is no reason why the sparse_irq_lock needs to be dropped between the two steps, in fact the code becomes simpler when the mutex is held across both and the semantics become more straight forward, so future problems of missing NULL pointer checks in the iteration are avoided and all existing sites are fixed in one go. Expand the lock held sections so both operations are covered and the bitmap and the radixtree are in sync. Fixes: a05a900a51c7 ("genirq: Make sparse_lock a mutex") Reported-and-tested-by: Huang Ying Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- kernel/irq/irqdesc.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 00bb0aeea1d0..77977f55dff7 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -405,10 +405,8 @@ static void free_desc(unsigned int irq) * The sysfs entry must be serialized against a concurrent * irq_sysfs_init() as well. */ - mutex_lock(&sparse_irq_lock); kobject_del(&desc->kobj); delete_irq_desc(irq); - mutex_unlock(&sparse_irq_lock); /* * We free the descriptor, masks and stat fields via RCU. That @@ -446,20 +444,15 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node, desc = alloc_desc(start + i, node, flags, mask, owner); if (!desc) goto err; - mutex_lock(&sparse_irq_lock); irq_insert_desc(start + i, desc); irq_sysfs_add(start + i, desc); - mutex_unlock(&sparse_irq_lock); } + bitmap_set(allocated_irqs, start, cnt); return start; err: for (i--; i >= 0; i--) free_desc(start + i); - - mutex_lock(&sparse_irq_lock); - bitmap_clear(allocated_irqs, start, cnt); - mutex_unlock(&sparse_irq_lock); return -ENOMEM; } @@ -558,6 +551,7 @@ static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, desc->owner = owner; } + bitmap_set(allocated_irqs, start, cnt); return start; } @@ -653,10 +647,10 @@ void irq_free_descs(unsigned int from, unsigned int cnt) if (from >= nr_irqs || (from + cnt) > nr_irqs) return; + mutex_lock(&sparse_irq_lock); for (i = 0; i < cnt; i++) free_desc(from + i); - mutex_lock(&sparse_irq_lock); bitmap_clear(allocated_irqs, from, cnt); mutex_unlock(&sparse_irq_lock); } @@ -703,19 +697,15 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, from, cnt, 0); ret = -EEXIST; if (irq >=0 && start != irq) - goto err; + goto unlock; if (start + cnt > nr_irqs) { ret = irq_expand_nr_irqs(start + cnt); if (ret) - goto err; + goto unlock; } - - bitmap_set(allocated_irqs, start, cnt); - mutex_unlock(&sparse_irq_lock); - return alloc_descs(start, cnt, node, affinity, owner); - -err: + ret = alloc_descs(start, cnt, node, affinity, owner); +unlock: mutex_unlock(&sparse_irq_lock); return ret; } From 18b7919a9de8a64a118346552987a07a80b70679 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 12 Sep 2017 15:41:49 +1000 Subject: [PATCH 089/555] KVM: PPC: Book3S: Fix race and leak in kvm_vm_ioctl_create_spapr_tce() commit 47c5310a8dbe7c2cb9f0083daa43ceed76c257fa upstream, with part of commit edd03602d97236e8fea13cd76886c576186aa307 folded in. Nixiaoming pointed out that there is a memory leak in kvm_vm_ioctl_create_spapr_tce() if the call to anon_inode_getfd() fails; the memory allocated for the kvmppc_spapr_tce_table struct is not freed, and nor are the pages allocated for the iommu tables. In addition, we have already incremented the process's count of locked memory pages, and this doesn't get restored on error. David Hildenbrand pointed out that there is a race in that the function checks early on that there is not already an entry in the stt->iommu_tables list with the same LIOBN, but an entry with the same LIOBN could get added between then and when the new entry is added to the list. This fixes all three problems. To simplify things, we now call anon_inode_getfd() before placing the new entry in the list. The check for an existing entry is done while holding the kvm->lock mutex, immediately before adding the new entry to the list. Finally, on failure we now call kvmppc_account_memlimit to decrement the process's count of locked memory pages. [paulus@ozlabs.org - folded in that part of edd03602d972 ("KVM: PPC: Book3S HV: Protect updates to spapr_tce_tables list", 2017-08-28) which restructured the code that 47c5310a8dbe modified, to avoid a build failure caused by the absence of put_unused_fd().] Fixes: 54738c097163 ("KVM: PPC: Accelerate H_PUT_TCE by implementing it in real mode") Fixes: f8626985c7c2 ("KVM: PPC: Account TCE-containing pages in locked_vm") Reported-by: Nixiaoming Reported-by: David Hildenbrand Signed-off-by: Paul Mackerras Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kvm/book3s_64_vio.c | 54 ++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 23 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index c379ff5a4438..7c1cb9dcd9a0 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -150,6 +150,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce_64 *args) { struct kvmppc_spapr_tce_table *stt = NULL; + struct kvmppc_spapr_tce_table *siter; unsigned long npages, size; int ret = -ENOMEM; int i; @@ -157,24 +158,16 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, if (!args->size) return -EINVAL; - /* Check this LIOBN hasn't been previously allocated */ - list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { - if (stt->liobn == args->liobn) - return -EBUSY; - } - size = args->size; npages = kvmppc_tce_pages(size); ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true); - if (ret) { - stt = NULL; - goto fail; - } + if (ret) + return ret; stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *), GFP_KERNEL); if (!stt) - goto fail; + goto fail_acct; stt->liobn = args->liobn; stt->page_shift = args->page_shift; @@ -188,24 +181,39 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, goto fail; } - kvm_get_kvm(kvm); - mutex_lock(&kvm->lock); - list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables); + + /* Check this LIOBN hasn't been previously allocated */ + ret = 0; + list_for_each_entry(siter, &kvm->arch.spapr_tce_tables, list) { + if (siter->liobn == args->liobn) { + ret = -EBUSY; + break; + } + } + + if (!ret) + ret = anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops, + stt, O_RDWR | O_CLOEXEC); + + if (ret >= 0) { + list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables); + kvm_get_kvm(kvm); + } mutex_unlock(&kvm->lock); - return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops, - stt, O_RDWR | O_CLOEXEC); + if (ret >= 0) + return ret; -fail: - if (stt) { - for (i = 0; i < npages; i++) - if (stt->pages[i]) - __free_page(stt->pages[i]); + fail: + for (i = 0; i < npages; i++) + if (stt->pages[i]) + __free_page(stt->pages[i]); - kfree(stt); - } + kfree(stt); + fail_acct: + kvmppc_account_memlimit(kvmppc_stt_pages(npages), false); return ret; } From 8dcf70ab1830a98807089f592cc4bd89cda50083 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 12 Sep 2017 15:42:38 +1000 Subject: [PATCH 090/555] KVM: PPC: Book3S HV: Protect updates to spapr_tce_tables list commit edd03602d97236e8fea13cd76886c576186aa307 upstream. Al Viro pointed out that while one thread of a process is executing in kvm_vm_ioctl_create_spapr_tce(), another thread could guess the file descriptor returned by anon_inode_getfd() and close() it before the first thread has added it to the kvm->arch.spapr_tce_tables list. That highlights a more general problem: there is no mutual exclusion between writers to the spapr_tce_tables list, leading to the possibility of the list becoming corrupted, which could cause a host kernel crash. To fix the mutual exclusion problem, we add a mutex_lock/unlock pair around the list_del_rce in kvm_spapr_tce_release(). If another thread does guess the file descriptor returned by the anon_inode_getfd() call in kvm_vm_ioctl_create_spapr_tce() and closes it, its call to kvm_spapr_tce_release() will not do any harm because it will have to wait until the first thread has released kvm->lock. The other things that the second thread could do with the guessed file descriptor are to mmap it or to pass it as a parameter to a KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE ioctl on a KVM device fd. An mmap call won't cause any harm because kvm_spapr_tce_mmap() and kvm_spapr_tce_fault() don't access the spapr_tce_tables list or the kvmppc_spapr_tce_table.list field, and the fields that they do use have been properly initialized by the time of the anon_inode_getfd() call. The KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE ioctl calls kvm_spapr_tce_attach_iommu_group(), which scans the spapr_tce_tables list looking for the kvmppc_spapr_tce_table struct corresponding to the fd given as the parameter. Either it will find the new entry or it won't; if it doesn't, it just returns an error, and if it does, it will function normally. So, in each case there is no harmful effect. [paulus@ozlabs.org - moved parts of the upstream patch into the backport of 47c5310a8dbe, adjusted this commit message accordingly.] Fixes: 366baf28ee3f ("KVM: PPC: Use RCU for arch.spapr_tce_tables") Reviewed-by: David Gibson Signed-off-by: Paul Mackerras Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kvm/book3s_64_vio.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 7c1cb9dcd9a0..da2a7eccb10a 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -129,8 +129,11 @@ static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma) static int kvm_spapr_tce_release(struct inode *inode, struct file *filp) { struct kvmppc_spapr_tce_table *stt = filp->private_data; + struct kvm *kvm = stt->kvm; + mutex_lock(&kvm->lock); list_del_rcu(&stt->list); + mutex_unlock(&kvm->lock); kvm_put_kvm(stt->kvm); From 97d402e6eed2f7d867a9f57d9d35a968c1440fe8 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Sun, 17 Sep 2017 03:23:48 -0700 Subject: [PATCH 091/555] tracing: Fix trace_pipe behavior for instance traces commit 75df6e688ccd517e339a7c422ef7ad73045b18a2 upstream. When reading data from trace_pipe, tracing_wait_pipe() performs a check to see if tracing has been turned off after some data was read. Currently, this check always looks at global trace state, but it should be checking the trace instance where trace_pipe is located at. Because of this bug, cat instances/i1/trace_pipe in the following script will immediately exit instead of waiting for data: cd /sys/kernel/debug/tracing echo 0 > tracing_on mkdir -p instances/i1 echo 1 > instances/i1/tracing_on echo 1 > instances/i1/events/sched/sched_process_exec/enable cat instances/i1/trace_pipe Link: http://lkml.kernel.org/r/20170917102348.1615-1-tahsin@google.com Fixes: 10246fa35d4f ("tracing: give easy way to clear trace buffer") Signed-off-by: Tahsin Erdogan Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f95bf81529f5..3074fc5f713c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5128,7 +5128,7 @@ static int tracing_wait_pipe(struct file *filp) * * iter->pos will be 0 if we haven't read anything. */ - if (!tracing_is_on() && iter->pos) + if (!tracer_tracing_is_on(iter->tr) && iter->pos) break; mutex_unlock(&iter->mutex); From 5fb4be27dac5f0ad925604acf4b5984fe8271551 Mon Sep 17 00:00:00 2001 From: Bo Yan Date: Mon, 18 Sep 2017 10:03:35 -0700 Subject: [PATCH 092/555] tracing: Erase irqsoff trace with empty write commit 8dd33bcb7050dd6f8c1432732f930932c9d3a33e upstream. One convenient way to erase trace is "echo > trace". However, this is currently broken if the current tracer is irqsoff tracer. This is because irqsoff tracer use max_buffer as the default trace buffer. Set the max_buffer as the one to be cleared when it's the trace buffer currently in use. Link: http://lkml.kernel.org/r/1505754215-29411-1-git-send-email-byan@nvidia.com Cc: Fixes: 4acd4d00f ("tracing: give easy way to clear trace buffer") Signed-off-by: Bo Yan Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3074fc5f713c..c1e50cc0d7b0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3569,11 +3569,17 @@ static int tracing_open(struct inode *inode, struct file *file) /* If this file was open for write, then erase contents */ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { int cpu = tracing_get_cpu(inode); + struct trace_buffer *trace_buf = &tr->trace_buffer; + +#ifdef CONFIG_TRACER_MAX_TRACE + if (tr->current_trace->print_max) + trace_buf = &tr->max_buffer; +#endif if (cpu == RING_BUFFER_ALL_CPUS) - tracing_reset_online_cpus(&tr->trace_buffer); + tracing_reset_online_cpus(trace_buf); else - tracing_reset(&tr->trace_buffer, cpu); + tracing_reset(trace_buf, cpu); } if (file->f_mode & FMODE_READ) { From 648798cc2fd7d748573ba760a66cfa7b561abe77 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 25 Aug 2017 10:40:02 -0700 Subject: [PATCH 093/555] md/raid5: fix a race condition in stripe batch commit 3664847d95e60a9a943858b7800f8484669740fc upstream. We have a race condition in below scenario, say have 3 continuous stripes, sh1, sh2 and sh3, sh1 is the stripe_head of sh2 and sh3: CPU1 CPU2 CPU3 handle_stripe(sh3) stripe_add_to_batch_list(sh3) -> lock(sh2, sh3) -> lock batch_lock(sh1) -> add sh3 to batch_list of sh1 -> unlock batch_lock(sh1) clear_batch_ready(sh1) -> lock(sh1) and batch_lock(sh1) -> clear STRIPE_BATCH_READY for all stripes in batch_list -> unlock(sh1) and batch_lock(sh1) ->clear_batch_ready(sh3) -->test_and_clear_bit(STRIPE_BATCH_READY, sh3) --->return 0 as sh->batch == NULL -> sh3->batch_head = sh1 -> unlock (sh2, sh3) In CPU1, handle_stripe will continue handle sh3 even it's in batch stripe list of sh1. By moving sh3->batch_head assignment in to batch_lock, we make it impossible to clear STRIPE_BATCH_READY before batch_head is set. Thanks Stephane for helping debug this tricky issue. Reported-and-tested-by: Stephane Thiell Signed-off-by: Shaohua Li Signed-off-by: Greg Kroah-Hartman --- drivers/md/raid5.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 549b4afd12e1..349e1bd3db69 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -829,6 +829,14 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh spin_unlock(&head->batch_head->batch_lock); goto unlock_out; } + /* + * We must assign batch_head of this stripe within the + * batch_lock, otherwise clear_batch_ready of batch head + * stripe could clear BATCH_READY bit of this stripe and + * this stripe->batch_head doesn't get assigned, which + * could confuse clear_batch_ready for this stripe + */ + sh->batch_head = head->batch_head; /* * at this point, head's BATCH_READY could be cleared, but we @@ -836,8 +844,6 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh */ list_add(&sh->batch_list, &head->batch_list); spin_unlock(&head->batch_head->batch_lock); - - sh->batch_head = head->batch_head; } else { head->batch_head = head; sh->batch_head = head->batch_head; From 49c2b839b743dfd8e3b6332494eba00ef47389a3 Mon Sep 17 00:00:00 2001 From: Dennis Yang Date: Wed, 6 Sep 2017 11:02:35 +0800 Subject: [PATCH 094/555] md/raid5: preserve STRIPE_ON_UNPLUG_LIST in break_stripe_batch_list commit 184a09eb9a2fe425e49c9538f1604b05ed33cfef upstream. In release_stripe_plug(), if a stripe_head has its STRIPE_ON_UNPLUG_LIST set, it indicates that this stripe_head is already in the raid5_plug_cb list and release_stripe() would be called instead to drop a reference count. Otherwise, the STRIPE_ON_UNPLUG_LIST bit would be set for this stripe_head and it will get queued into the raid5_plug_cb list. Since break_stripe_batch_list() did not preserve STRIPE_ON_UNPLUG_LIST, A stripe could be re-added to plug list while it is still on that list in the following situation. If stripe_head A is added to another stripe_head B's batch list, in this case A will have its batch_head != NULL and be added into the plug list. After that, stripe_head B gets handled and called break_stripe_batch_list() to reset all the batched stripe_head(including A which is still on the plug list)'s state and reset their batch_head to NULL. Before the plug list gets processed, if there is another write request comes in and get stripe_head A, A will have its batch_head == NULL (cleared by calling break_stripe_batch_list() on B) and be added to plug list once again. Signed-off-by: Dennis Yang Signed-off-by: Shaohua Li Signed-off-by: Greg Kroah-Hartman --- drivers/md/raid5.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 349e1bd3db69..7aea0221530c 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4283,7 +4283,8 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS | (1 << STRIPE_PREREAD_ACTIVE) | - (1 << STRIPE_DEGRADED)), + (1 << STRIPE_DEGRADED) | + (1 << STRIPE_ON_UNPLUG_LIST)), head_sh->state & (1 << STRIPE_INSYNC)); sh->check_state = head_sh->check_state; From b42bf0f15cf70926f3a460e7517703fda6191ba7 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 27 Aug 2017 20:25:26 +0800 Subject: [PATCH 095/555] scsi: scsi_transport_iscsi: fix the issue that iscsi_if_rx doesn't parse nlmsg properly commit c88f0e6b06f4092995688211a631bb436125d77b upstream. ChunYu found a kernel crash by syzkaller: [ 651.617875] kasan: CONFIG_KASAN_INLINE enabled [ 651.618217] kasan: GPF could be caused by NULL-ptr deref or user memory access [ 651.618731] general protection fault: 0000 [#1] SMP KASAN [ 651.621543] CPU: 1 PID: 9539 Comm: scsi Not tainted 4.11.0.cov #32 [ 651.621938] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 [ 651.622309] task: ffff880117780000 task.stack: ffff8800a3188000 [ 651.622762] RIP: 0010:skb_release_data+0x26c/0x590 [...] [ 651.627260] Call Trace: [ 651.629156] skb_release_all+0x4f/0x60 [ 651.629450] consume_skb+0x1a5/0x600 [ 651.630705] netlink_unicast+0x505/0x720 [ 651.632345] netlink_sendmsg+0xab2/0xe70 [ 651.633704] sock_sendmsg+0xcf/0x110 [ 651.633942] ___sys_sendmsg+0x833/0x980 [ 651.637117] __sys_sendmsg+0xf3/0x240 [ 651.638820] SyS_sendmsg+0x32/0x50 [ 651.639048] entry_SYSCALL_64_fastpath+0x1f/0xc2 It's caused by skb_shared_info at the end of sk_buff was overwritten by ISCSI_KEVENT_IF_ERROR when parsing nlmsg info from skb in iscsi_if_rx. During the loop if skb->len == nlh->nlmsg_len and both are sizeof(*nlh), ev = nlmsg_data(nlh) will acutally get skb_shinfo(SKB) instead and set a new value to skb_shinfo(SKB)->nr_frags by ev->type. This patch is to fix it by checking nlh->nlmsg_len properly there to avoid over accessing sk_buff. Reported-by: ChunYu Wang Signed-off-by: Xin Long Acked-by: Chris Leech Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/scsi_transport_iscsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 42bca619f854..c39551b32e94 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -3696,7 +3696,7 @@ iscsi_if_rx(struct sk_buff *skb) uint32_t group; nlh = nlmsg_hdr(skb); - if (nlh->nlmsg_len < sizeof(*nlh) || + if (nlh->nlmsg_len < sizeof(*nlh) + sizeof(*ev) || skb->len < nlh->nlmsg_len) { break; } From 29825768590ef7f180783fd7b808e69eeb687b83 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 15 Sep 2017 11:55:27 -0400 Subject: [PATCH 096/555] drm/radeon: disable hard reset in hibernate for APUs commit 820608548737e315c6f93e3099b4e65bde062334 upstream. Fixes a hibernation regression on APUs. Bug: https://bugzilla.kernel.org/show_bug.cgi?id=191571 Fixes: 274ad65c9d02bdc (drm/radeon: hard reset r600 and newer GPU when hibernating.) Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/radeon/radeon_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c index 3b21ca5a6c81..82b01123c386 100644 --- a/drivers/gpu/drm/radeon/radeon_device.c +++ b/drivers/gpu/drm/radeon/radeon_device.c @@ -1674,7 +1674,7 @@ int radeon_suspend_kms(struct drm_device *dev, bool suspend, radeon_agp_suspend(rdev); pci_save_state(dev->pdev); - if (freeze && rdev->family >= CHIP_CEDAR) { + if (freeze && rdev->family >= CHIP_CEDAR && !(rdev->flags & RADEON_IS_IGP)) { rdev->asic->asic_reset(rdev, true); pci_restore_state(dev->pdev); } else if (suspend) { From 7e1b2b2db3d744f85d0c9d3ae85cbe1a6082721b Mon Sep 17 00:00:00 2001 From: Stephan Mueller Date: Thu, 14 Sep 2017 17:10:28 +0200 Subject: [PATCH 097/555] crypto: drbg - fix freeing of resources commit bd6227a150fdb56e7bb734976ef6e53a2c1cb334 upstream. During the change to use aligned buffers, the deallocation code path was not updated correctly. The current code tries to free the aligned buffer pointer and not the original buffer pointer as it is supposed to. Thus, the code is updated to free the original buffer pointer and set the aligned buffer pointer that is used throughout the code to NULL. Fixes: 3cfc3b9721123 ("crypto: drbg - use aligned buffers") CC: Herbert Xu Signed-off-by: Stephan Mueller Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- crypto/drbg.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/crypto/drbg.c b/crypto/drbg.c index 8cac3d31a5f8..942ddff68408 100644 --- a/crypto/drbg.c +++ b/crypto/drbg.c @@ -1133,10 +1133,10 @@ static inline void drbg_dealloc_state(struct drbg_state *drbg) { if (!drbg) return; - kzfree(drbg->V); - drbg->Vbuf = NULL; - kzfree(drbg->C); - drbg->Cbuf = NULL; + kzfree(drbg->Vbuf); + drbg->V = NULL; + kzfree(drbg->Cbuf); + drbg->C = NULL; kzfree(drbg->scratchpadbuf); drbg->scratchpadbuf = NULL; drbg->reseed_ctr = 0; From 70117b7735983978b695cf203da7386a26a7e0bb Mon Sep 17 00:00:00 2001 From: LEROY Christophe Date: Tue, 12 Sep 2017 11:03:39 +0200 Subject: [PATCH 098/555] crypto: talitos - Don't provide setkey for non hmac hashing algs. commit 56136631573baa537a15e0012055ffe8cfec1a33 upstream. Today, md5sum fails with error -ENOKEY because a setkey function is set for non hmac hashing algs, see strace output below: mmap(NULL, 378880, PROT_READ, MAP_SHARED, 6, 0) = 0x77f50000 accept(3, 0, NULL) = 7 vmsplice(5, [{"bin/\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 378880}], 1, SPLICE_F_MORE|SPLICE_F_GIFT) = 262144 splice(4, NULL, 7, NULL, 262144, SPLICE_F_MORE) = -1 ENOKEY (Required key not available) write(2, "Generation of hash for file kcap"..., 50) = 50 munmap(0x77f50000, 378880) = 0 This patch ensures that setkey() function is set only for hmac hashing. Signed-off-by: Christophe Leroy Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- drivers/crypto/talitos.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c index 571de2f284cf..f6f811a1bbe1 100644 --- a/drivers/crypto/talitos.c +++ b/drivers/crypto/talitos.c @@ -3057,7 +3057,8 @@ static struct talitos_crypto_alg *talitos_alg_alloc(struct device *dev, t_alg->algt.alg.hash.final = ahash_final; t_alg->algt.alg.hash.finup = ahash_finup; t_alg->algt.alg.hash.digest = ahash_digest; - t_alg->algt.alg.hash.setkey = ahash_setkey; + if (!strncmp(alg->cra_name, "hmac", 4)) + t_alg->algt.alg.hash.setkey = ahash_setkey; t_alg->algt.alg.hash.import = ahash_import; t_alg->algt.alg.hash.export = ahash_export; From 1492259fc324d29184801b6244900abba394301e Mon Sep 17 00:00:00 2001 From: LEROY Christophe Date: Wed, 13 Sep 2017 12:44:51 +0200 Subject: [PATCH 099/555] crypto: talitos - fix sha224 commit afd62fa26343be6445479e75de9f07092a061459 upstream. Kernel crypto tests report the following error at startup [ 2.752626] alg: hash: Test 4 failed for sha224-talitos [ 2.757907] 00000000: 30 e2 86 e2 e7 8a dd 0d d7 eb 9f d5 83 fe f1 b0 00000010: 2d 5a 6c a5 f9 55 ea fd 0e 72 05 22 This patch fixes it Signed-off-by: Christophe Leroy Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- drivers/crypto/talitos.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c index f6f811a1bbe1..d5e73003a0cc 100644 --- a/drivers/crypto/talitos.c +++ b/drivers/crypto/talitos.c @@ -1756,9 +1756,9 @@ static int common_nonsnoop_hash(struct talitos_edesc *edesc, req_ctx->swinit = 0; } else { desc->ptr[1] = zero_entry; - /* Indicate next op is not the first. */ - req_ctx->first = 0; } + /* Indicate next op is not the first. */ + req_ctx->first = 0; /* HMAC key */ if (ctx->keylen) From b60f791ef32db376d55f047748615baee9dc63ef Mon Sep 17 00:00:00 2001 From: LEROY Christophe Date: Wed, 13 Sep 2017 12:44:57 +0200 Subject: [PATCH 100/555] crypto: talitos - fix hashing commit 886a27c0fc8a34633aadb0986dba11d8c150ae2e upstream. md5sum on some files gives wrong result Exemple: With the md5sum from libkcapi: c15115c05bad51113f81bdaee735dd09 test With the original md5sum: bbdf41d80ba7e8b2b7be3a0772be76cb test This patch fixes this issue Signed-off-by: Christophe Leroy Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- drivers/crypto/talitos.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c index d5e73003a0cc..e2d323fa2437 100644 --- a/drivers/crypto/talitos.c +++ b/drivers/crypto/talitos.c @@ -1769,7 +1769,7 @@ static int common_nonsnoop_hash(struct talitos_edesc *edesc, sg_count = edesc->src_nents ?: 1; if (is_sec1 && sg_count > 1) - sg_copy_to_buffer(areq->src, sg_count, edesc->buf, length); + sg_copy_to_buffer(req_ctx->psrc, sg_count, edesc->buf, length); else sg_count = dma_map_sg(dev, req_ctx->psrc, sg_count, DMA_TO_DEVICE); From 2f9be92dfffec82fbaebc9ff32c749b5764fea19 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 20 Sep 2017 16:58:38 +0200 Subject: [PATCH 101/555] security/keys: properly zero out sensitive key material in big_key commit 910801809b2e40a4baedd080ef5d80b4a180e70e upstream. Error paths forgot to zero out sensitive material, so this patch changes some kfrees into a kzfrees. Signed-off-by: Jason A. Donenfeld Signed-off-by: David Howells Reviewed-by: Eric Biggers Cc: Herbert Xu Cc: Kirill Marinushkin Cc: security@kernel.org Signed-off-by: Greg Kroah-Hartman --- security/keys/big_key.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/security/keys/big_key.c b/security/keys/big_key.c index 835c1ab30d01..1c93c077d119 100644 --- a/security/keys/big_key.c +++ b/security/keys/big_key.c @@ -194,7 +194,7 @@ int big_key_preparse(struct key_preparsed_payload *prep) *path = file->f_path; path_get(path); fput(file); - kfree(data); + kzfree(data); } else { /* Just store the data in a buffer */ void *data = kmalloc(datalen, GFP_KERNEL); @@ -210,9 +210,9 @@ int big_key_preparse(struct key_preparsed_payload *prep) err_fput: fput(file); err_enckey: - kfree(enckey); + kzfree(enckey); error: - kfree(data); + kzfree(data); return ret; } @@ -226,7 +226,7 @@ void big_key_free_preparse(struct key_preparsed_payload *prep) path_put(path); } - kfree(prep->payload.data[big_key_data]); + kzfree(prep->payload.data[big_key_data]); } /* @@ -258,7 +258,7 @@ void big_key_destroy(struct key *key) path->mnt = NULL; path->dentry = NULL; } - kfree(key->payload.data[big_key_data]); + kzfree(key->payload.data[big_key_data]); key->payload.data[big_key_data] = NULL; } @@ -326,7 +326,7 @@ long big_key_read(const struct key *key, char __user *buffer, size_t buflen) err_fput: fput(file); error: - kfree(data); + kzfree(data); } else { ret = datalen; if (copy_to_user(buffer, key->payload.data[big_key_data], From 0c70fb88c7510784b12567e9ca5b848dd2b20395 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 2 Oct 2017 12:52:56 +0200 Subject: [PATCH 102/555] security/keys: rewrite all of big_key crypto commit 428490e38b2e352812e0b765d8bceafab0ec441d upstream. This started out as just replacing the use of crypto/rng with get_random_bytes_wait, so that we wouldn't use bad randomness at boot time. But, upon looking further, it appears that there were even deeper underlying cryptographic problems, and that this seems to have been committed with very little crypto review. So, I rewrote the whole thing, trying to keep to the conventions introduced by the previous author, to fix these cryptographic flaws. It makes no sense to seed crypto/rng at boot time and then keep using it like this, when in fact there's already get_random_bytes_wait, which can ensure there's enough entropy and be a much more standard way of generating keys. Since this sensitive material is being stored untrusted, using ECB and no authentication is simply not okay at all. I find it surprising and a bit horrifying that this code even made it past basic crypto review, which perhaps points to some larger issues. This patch moves from using AES-ECB to using AES-GCM. Since keys are uniquely generated each time, we can set the nonce to zero. There was also a race condition in which the same key would be reused at the same time in different threads. A mutex fixes this issue now. So, to summarize, this commit fixes the following vulnerabilities: * Low entropy key generation, allowing an attacker to potentially guess or predict keys. * Unauthenticated encryption, allowing an attacker to modify the cipher text in particular ways in order to manipulate the plaintext, which is is even more frightening considering the next point. * Use of ECB mode, allowing an attacker to trivially swap blocks or compare identical plaintext blocks. * Key re-use. * Faulty memory zeroing. [Note that in backporting this commit to 4.9, get_random_bytes_wait was replaced with get_random_bytes, since 4.9 does not have the former function. This might result in slightly worse entropy in key generation, but common use cases of big_keys makes that likely not a huge deal. And, this is the best we can do with this old kernel. Alas.] Signed-off-by: Jason A. Donenfeld Reviewed-by: Eric Biggers Signed-off-by: David Howells Cc: Herbert Xu Cc: Kirill Marinushkin Cc: security@kernel.org Signed-off-by: Greg Kroah-Hartman --- security/keys/Kconfig | 4 +- security/keys/big_key.c | 124 ++++++++++++++++++---------------------- 2 files changed, 58 insertions(+), 70 deletions(-) diff --git a/security/keys/Kconfig b/security/keys/Kconfig index d942c7c2bc0a..e0a39781b10f 100644 --- a/security/keys/Kconfig +++ b/security/keys/Kconfig @@ -41,10 +41,8 @@ config BIG_KEYS bool "Large payload keys" depends on KEYS depends on TMPFS - depends on (CRYPTO_ANSI_CPRNG = y || CRYPTO_DRBG = y) select CRYPTO_AES - select CRYPTO_ECB - select CRYPTO_RNG + select CRYPTO_GCM help This option provides support for holding large keys within the kernel (for example Kerberos ticket caches). The data may be stored out to diff --git a/security/keys/big_key.c b/security/keys/big_key.c index 1c93c077d119..47c6dcab1a8e 100644 --- a/security/keys/big_key.c +++ b/security/keys/big_key.c @@ -1,5 +1,6 @@ /* Large capacity key type * + * Copyright (C) 2017 Jason A. Donenfeld . All Rights Reserved. * Copyright (C) 2013 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * @@ -16,10 +17,10 @@ #include #include #include +#include #include #include -#include -#include +#include /* * Layout of key payload words. @@ -49,7 +50,12 @@ enum big_key_op { /* * Key size for big_key data encryption */ -#define ENC_KEY_SIZE 16 +#define ENC_KEY_SIZE 32 + +/* + * Authentication tag length + */ +#define ENC_AUTHTAG_SIZE 16 /* * big_key defined keys take an arbitrary string as the description and an @@ -64,57 +70,62 @@ struct key_type key_type_big_key = { .destroy = big_key_destroy, .describe = big_key_describe, .read = big_key_read, + /* no ->update(); don't add it without changing big_key_crypt() nonce */ }; /* - * Crypto names for big_key data encryption + * Crypto names for big_key data authenticated encryption */ -static const char big_key_rng_name[] = "stdrng"; -static const char big_key_alg_name[] = "ecb(aes)"; +static const char big_key_alg_name[] = "gcm(aes)"; /* - * Crypto algorithms for big_key data encryption + * Crypto algorithms for big_key data authenticated encryption */ -static struct crypto_rng *big_key_rng; -static struct crypto_skcipher *big_key_skcipher; +static struct crypto_aead *big_key_aead; /* - * Generate random key to encrypt big_key data + * Since changing the key affects the entire object, we need a mutex. */ -static inline int big_key_gen_enckey(u8 *key) -{ - return crypto_rng_get_bytes(big_key_rng, key, ENC_KEY_SIZE); -} +static DEFINE_MUTEX(big_key_aead_lock); /* * Encrypt/decrypt big_key data */ static int big_key_crypt(enum big_key_op op, u8 *data, size_t datalen, u8 *key) { - int ret = -EINVAL; + int ret; struct scatterlist sgio; - SKCIPHER_REQUEST_ON_STACK(req, big_key_skcipher); + struct aead_request *aead_req; + /* We always use a zero nonce. The reason we can get away with this is + * because we're using a different randomly generated key for every + * different encryption. Notably, too, key_type_big_key doesn't define + * an .update function, so there's no chance we'll wind up reusing the + * key to encrypt updated data. Simply put: one key, one encryption. + */ + u8 zero_nonce[crypto_aead_ivsize(big_key_aead)]; - if (crypto_skcipher_setkey(big_key_skcipher, key, ENC_KEY_SIZE)) { + aead_req = aead_request_alloc(big_key_aead, GFP_KERNEL); + if (!aead_req) + return -ENOMEM; + + memset(zero_nonce, 0, sizeof(zero_nonce)); + sg_init_one(&sgio, data, datalen + (op == BIG_KEY_ENC ? ENC_AUTHTAG_SIZE : 0)); + aead_request_set_crypt(aead_req, &sgio, &sgio, datalen, zero_nonce); + aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); + aead_request_set_ad(aead_req, 0); + + mutex_lock(&big_key_aead_lock); + if (crypto_aead_setkey(big_key_aead, key, ENC_KEY_SIZE)) { ret = -EAGAIN; goto error; } - - skcipher_request_set_tfm(req, big_key_skcipher); - skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, - NULL, NULL); - - sg_init_one(&sgio, data, datalen); - skcipher_request_set_crypt(req, &sgio, &sgio, datalen, NULL); - if (op == BIG_KEY_ENC) - ret = crypto_skcipher_encrypt(req); + ret = crypto_aead_encrypt(aead_req); else - ret = crypto_skcipher_decrypt(req); - - skcipher_request_zero(req); - + ret = crypto_aead_decrypt(aead_req); error: + mutex_unlock(&big_key_aead_lock); + aead_request_free(aead_req); return ret; } @@ -146,15 +157,13 @@ int big_key_preparse(struct key_preparsed_payload *prep) * * File content is stored encrypted with randomly generated key. */ - size_t enclen = ALIGN(datalen, crypto_skcipher_blocksize(big_key_skcipher)); + size_t enclen = datalen + ENC_AUTHTAG_SIZE; - /* prepare aligned data to encrypt */ data = kmalloc(enclen, GFP_KERNEL); if (!data) return -ENOMEM; memcpy(data, prep->data, datalen); - memset(data + datalen, 0x00, enclen - datalen); /* generate random key */ enckey = kmalloc(ENC_KEY_SIZE, GFP_KERNEL); @@ -162,13 +171,10 @@ int big_key_preparse(struct key_preparsed_payload *prep) ret = -ENOMEM; goto error; } - - ret = big_key_gen_enckey(enckey); - if (ret) - goto err_enckey; + get_random_bytes(enckey, ENC_KEY_SIZE); /* encrypt aligned data */ - ret = big_key_crypt(BIG_KEY_ENC, data, enclen, enckey); + ret = big_key_crypt(BIG_KEY_ENC, data, datalen, enckey); if (ret) goto err_enckey; @@ -294,7 +300,7 @@ long big_key_read(const struct key *key, char __user *buffer, size_t buflen) struct file *file; u8 *data; u8 *enckey = (u8 *)key->payload.data[big_key_data]; - size_t enclen = ALIGN(datalen, crypto_skcipher_blocksize(big_key_skcipher)); + size_t enclen = datalen + ENC_AUTHTAG_SIZE; data = kmalloc(enclen, GFP_KERNEL); if (!data) @@ -342,47 +348,31 @@ error: */ static int __init big_key_init(void) { - struct crypto_skcipher *cipher; - struct crypto_rng *rng; int ret; - rng = crypto_alloc_rng(big_key_rng_name, 0, 0); - if (IS_ERR(rng)) { - pr_err("Can't alloc rng: %ld\n", PTR_ERR(rng)); - return PTR_ERR(rng); - } - - big_key_rng = rng; - - /* seed RNG */ - ret = crypto_rng_reset(rng, NULL, crypto_rng_seedsize(rng)); - if (ret) { - pr_err("Can't reset rng: %d\n", ret); - goto error_rng; - } - /* init block cipher */ - cipher = crypto_alloc_skcipher(big_key_alg_name, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(cipher)) { - ret = PTR_ERR(cipher); + big_key_aead = crypto_alloc_aead(big_key_alg_name, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(big_key_aead)) { + ret = PTR_ERR(big_key_aead); pr_err("Can't alloc crypto: %d\n", ret); - goto error_rng; + return ret; + } + ret = crypto_aead_setauthsize(big_key_aead, ENC_AUTHTAG_SIZE); + if (ret < 0) { + pr_err("Can't set crypto auth tag len: %d\n", ret); + goto free_aead; } - - big_key_skcipher = cipher; ret = register_key_type(&key_type_big_key); if (ret < 0) { pr_err("Can't register type: %d\n", ret); - goto error_cipher; + goto free_aead; } return 0; -error_cipher: - crypto_free_skcipher(big_key_skcipher); -error_rng: - crypto_free_rng(big_key_rng); +free_aead: + crypto_free_aead(big_key_aead); return ret; } From 47e8bd1965fc2bcee69a62a5cc2d5336b2e79835 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Sep 2017 11:36:45 -0700 Subject: [PATCH 103/555] KEYS: fix writing past end of user-supplied buffer in keyring_read() commit e645016abc803dafc75e4b8f6e4118f088900ffb upstream. Userspace can call keyctl_read() on a keyring to get the list of IDs of keys in the keyring. But if the user-supplied buffer is too small, the kernel would write the full list anyway --- which will corrupt whatever userspace memory happened to be past the end of the buffer. Fix it by only filling the space that is available. Fixes: b2a4df200d57 ("KEYS: Expand the capacity of a keyring") Signed-off-by: Eric Biggers Signed-off-by: David Howells Signed-off-by: Greg Kroah-Hartman --- security/keys/keyring.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/security/keys/keyring.c b/security/keys/keyring.c index c91e4e0cea08..73bf35e9e378 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -416,7 +416,7 @@ static void keyring_describe(const struct key *keyring, struct seq_file *m) } struct keyring_read_iterator_context { - size_t qty; + size_t buflen; size_t count; key_serial_t __user *buffer; }; @@ -428,9 +428,9 @@ static int keyring_read_iterator(const void *object, void *data) int ret; kenter("{%s,%d},,{%zu/%zu}", - key->type->name, key->serial, ctx->count, ctx->qty); + key->type->name, key->serial, ctx->count, ctx->buflen); - if (ctx->count >= ctx->qty) + if (ctx->count >= ctx->buflen) return 1; ret = put_user(key->serial, ctx->buffer); @@ -465,16 +465,12 @@ static long keyring_read(const struct key *keyring, return 0; /* Calculate how much data we could return */ - ctx.qty = nr_keys * sizeof(key_serial_t); - if (!buffer || !buflen) - return ctx.qty; - - if (buflen > ctx.qty) - ctx.qty = buflen; + return nr_keys * sizeof(key_serial_t); /* Copy the IDs of the subscribed keys into the buffer */ ctx.buffer = (key_serial_t __user *)buffer; + ctx.buflen = buflen; ctx.count = 0; ret = assoc_array_iterate(&keyring->keys, keyring_read_iterator, &ctx); if (ret < 0) { From bfe9d7b8e0f2d4a4bc8298e25597983ac662dac0 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Sep 2017 11:37:03 -0700 Subject: [PATCH 104/555] KEYS: prevent creating a different user's keyrings commit 237bbd29f7a049d310d907f4b2716a7feef9abf3 upstream. It was possible for an unprivileged user to create the user and user session keyrings for another user. For example: sudo -u '#3000' sh -c 'keyctl add keyring _uid.4000 "" @u keyctl add keyring _uid_ses.4000 "" @u sleep 15' & sleep 1 sudo -u '#4000' keyctl describe @u sudo -u '#4000' keyctl describe @us This is problematic because these "fake" keyrings won't have the right permissions. In particular, the user who created them first will own them and will have full access to them via the possessor permissions, which can be used to compromise the security of a user's keys: -4: alswrv-----v------------ 3000 0 keyring: _uid.4000 -5: alswrv-----v------------ 3000 0 keyring: _uid_ses.4000 Fix it by marking user and user session keyrings with a flag KEY_FLAG_UID_KEYRING. Then, when searching for a user or user session keyring by name, skip all keyrings that don't have the flag set. Fixes: 69664cf16af4 ("keys: don't generate user and user session keyrings unless they're accessed") Signed-off-by: Eric Biggers Signed-off-by: David Howells Signed-off-by: Greg Kroah-Hartman --- include/linux/key.h | 2 ++ security/keys/internal.h | 2 +- security/keys/key.c | 2 ++ security/keys/keyring.c | 23 ++++++++++++++--------- security/keys/process_keys.c | 6 ++++-- 5 files changed, 23 insertions(+), 12 deletions(-) diff --git a/include/linux/key.h b/include/linux/key.h index 722914798f37..6a544726903e 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -176,6 +176,7 @@ struct key { #define KEY_FLAG_BUILTIN 8 /* set if key is built in to the kernel */ #define KEY_FLAG_ROOT_CAN_INVAL 9 /* set if key can be invalidated by root without permission */ #define KEY_FLAG_KEEP 10 /* set if key should not be removed */ +#define KEY_FLAG_UID_KEYRING 11 /* set if key is a user or user session keyring */ /* the key type and key description string * - the desc is used to match a key against search criteria @@ -235,6 +236,7 @@ extern struct key *key_alloc(struct key_type *type, #define KEY_ALLOC_NOT_IN_QUOTA 0x0002 /* not in quota */ #define KEY_ALLOC_BUILT_IN 0x0004 /* Key is built into kernel */ #define KEY_ALLOC_BYPASS_RESTRICTION 0x0008 /* Override the check on restricted keyrings */ +#define KEY_ALLOC_UID_KEYRING 0x0010 /* allocating a user or user session keyring */ extern void key_revoke(struct key *key); extern void key_invalidate(struct key *key); diff --git a/security/keys/internal.h b/security/keys/internal.h index a705a7d92ad7..fb0c65049c19 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -137,7 +137,7 @@ extern key_ref_t keyring_search_aux(key_ref_t keyring_ref, extern key_ref_t search_my_process_keyrings(struct keyring_search_context *ctx); extern key_ref_t search_process_keyrings(struct keyring_search_context *ctx); -extern struct key *find_keyring_by_name(const char *name, bool skip_perm_check); +extern struct key *find_keyring_by_name(const char *name, bool uid_keyring); extern int install_user_keyrings(void); extern int install_thread_keyring_to_cred(struct cred *); diff --git a/security/keys/key.c b/security/keys/key.c index 2f4ce35ae2aa..135e1eb7e468 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -301,6 +301,8 @@ struct key *key_alloc(struct key_type *type, const char *desc, key->flags |= 1 << KEY_FLAG_IN_QUOTA; if (flags & KEY_ALLOC_BUILT_IN) key->flags |= 1 << KEY_FLAG_BUILTIN; + if (flags & KEY_ALLOC_UID_KEYRING) + key->flags |= 1 << KEY_FLAG_UID_KEYRING; #ifdef KEY_DEBUGGING key->magic = KEY_DEBUG_MAGIC; diff --git a/security/keys/keyring.c b/security/keys/keyring.c index 73bf35e9e378..a86d0ae1773c 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -985,15 +985,15 @@ found: /* * Find a keyring with the specified name. * - * All named keyrings in the current user namespace are searched, provided they - * grant Search permission directly to the caller (unless this check is - * skipped). Keyrings whose usage points have reached zero or who have been - * revoked are skipped. + * Only keyrings that have nonzero refcount, are not revoked, and are owned by a + * user in the current user namespace are considered. If @uid_keyring is %true, + * the keyring additionally must have been allocated as a user or user session + * keyring; otherwise, it must grant Search permission directly to the caller. * * Returns a pointer to the keyring with the keyring's refcount having being * incremented on success. -ENOKEY is returned if a key could not be found. */ -struct key *find_keyring_by_name(const char *name, bool skip_perm_check) +struct key *find_keyring_by_name(const char *name, bool uid_keyring) { struct key *keyring; int bucket; @@ -1021,10 +1021,15 @@ struct key *find_keyring_by_name(const char *name, bool skip_perm_check) if (strcmp(keyring->description, name) != 0) continue; - if (!skip_perm_check && - key_permission(make_key_ref(keyring, 0), - KEY_NEED_SEARCH) < 0) - continue; + if (uid_keyring) { + if (!test_bit(KEY_FLAG_UID_KEYRING, + &keyring->flags)) + continue; + } else { + if (key_permission(make_key_ref(keyring, 0), + KEY_NEED_SEARCH) < 0) + continue; + } /* we've got a match but we might end up racing with * key_cleanup() if the keyring is currently 'dead' diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index 45536c677b05..ce45c78cf0a2 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -76,7 +76,8 @@ int install_user_keyrings(void) if (IS_ERR(uid_keyring)) { uid_keyring = keyring_alloc(buf, user->uid, INVALID_GID, cred, user_keyring_perm, - KEY_ALLOC_IN_QUOTA, + KEY_ALLOC_UID_KEYRING | + KEY_ALLOC_IN_QUOTA, NULL, NULL); if (IS_ERR(uid_keyring)) { ret = PTR_ERR(uid_keyring); @@ -93,7 +94,8 @@ int install_user_keyrings(void) session_keyring = keyring_alloc(buf, user->uid, INVALID_GID, cred, user_keyring_perm, - KEY_ALLOC_IN_QUOTA, + KEY_ALLOC_UID_KEYRING | + KEY_ALLOC_IN_QUOTA, NULL, NULL); if (IS_ERR(session_keyring)) { ret = PTR_ERR(session_keyring); From dda70d28c0ac191f128bfd3acfd800667ed86bdf Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Sep 2017 11:37:23 -0700 Subject: [PATCH 105/555] KEYS: prevent KEYCTL_READ on negative key commit 37863c43b2c6464f252862bf2e9768264e961678 upstream. Because keyctl_read_key() looks up the key with no permissions requested, it may find a negatively instantiated key. If the key is also possessed, we went ahead and called ->read() on the key. But the key payload will actually contain the ->reject_error rather than the normal payload. Thus, the kernel oopses trying to read the user_key_payload from memory address (int)-ENOKEY = 0x00000000ffffff82. Fortunately the payload data is stored inline, so it shouldn't be possible to abuse this as an arbitrary memory read primitive... Reproducer: keyctl new_session keyctl request2 user desc '' @s keyctl read $(keyctl show | awk '/user: desc/ {print $1}') It causes a crash like the following: BUG: unable to handle kernel paging request at 00000000ffffff92 IP: user_read+0x33/0xa0 PGD 36a54067 P4D 36a54067 PUD 0 Oops: 0000 [#1] SMP CPU: 0 PID: 211 Comm: keyctl Not tainted 4.14.0-rc1 #337 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-20170228_101828-anatol 04/01/2014 task: ffff90aa3b74c3c0 task.stack: ffff9878c0478000 RIP: 0010:user_read+0x33/0xa0 RSP: 0018:ffff9878c047bee8 EFLAGS: 00010246 RAX: 0000000000000001 RBX: ffff90aa3d7da340 RCX: 0000000000000017 RDX: 0000000000000000 RSI: 00000000ffffff82 RDI: ffff90aa3d7da340 RBP: ffff9878c047bf00 R08: 00000024f95da94f R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 FS: 00007f58ece69740(0000) GS:ffff90aa3e200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000ffffff92 CR3: 0000000036adc001 CR4: 00000000003606f0 Call Trace: keyctl_read_key+0xac/0xe0 SyS_keyctl+0x99/0x120 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x7f58ec787bb9 RSP: 002b:00007ffc8d401678 EFLAGS: 00000206 ORIG_RAX: 00000000000000fa RAX: ffffffffffffffda RBX: 00007ffc8d402800 RCX: 00007f58ec787bb9 RDX: 0000000000000000 RSI: 00000000174a63ac RDI: 000000000000000b RBP: 0000000000000004 R08: 00007ffc8d402809 R09: 0000000000000020 R10: 0000000000000000 R11: 0000000000000206 R12: 00007ffc8d402800 R13: 00007ffc8d4016e0 R14: 0000000000000000 R15: 0000000000000000 Code: e5 41 55 49 89 f5 41 54 49 89 d4 53 48 89 fb e8 a4 b4 ad ff 85 c0 74 09 80 3d b9 4c 96 00 00 74 43 48 8b b3 20 01 00 00 4d 85 ed <0f> b7 5e 10 74 29 4d 85 e4 74 24 4c 39 e3 4c 89 e2 4c 89 ef 48 RIP: user_read+0x33/0xa0 RSP: ffff9878c047bee8 CR2: 00000000ffffff92 Fixes: 61ea0c0ba904 ("KEYS: Skip key state checks when checking for possession") Signed-off-by: Eric Biggers Signed-off-by: David Howells Signed-off-by: Greg Kroah-Hartman --- security/keys/keyctl.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index ada12c3e3ac4..1302cb398346 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -766,6 +766,11 @@ long keyctl_read_key(key_serial_t keyid, char __user *buffer, size_t buflen) key = key_ref_to_ptr(key_ref); + if (test_bit(KEY_FLAG_NEGATIVE, &key->flags)) { + ret = -ENOKEY; + goto error2; + } + /* see if we can read it directly */ ret = key_permission(key_ref, KEY_NEED_READ); if (ret == 0) From 5c23dcf86e2d66f40d39e1cefb9f8c2eabf83543 Mon Sep 17 00:00:00 2001 From: Tyrel Datwyler Date: Wed, 20 Sep 2017 17:02:52 -0400 Subject: [PATCH 106/555] powerpc/pseries: Fix parent_dn reference leak in add_dt_node() commit b537ca6fede69a281dc524983e5e633d79a10a08 upstream. A reference to the parent device node is held by add_dt_node() for the node to be added. If the call to dlpar_configure_connector() fails add_dt_node() returns ENOENT and that reference is not freed. Add a call to of_node_put(parent_dn) prior to bailing out after a failed dlpar_configure_connector() call. Fixes: 8d5ff320766f ("powerpc/pseries: Make dlpar_configure_connector parent node aware") Signed-off-by: Tyrel Datwyler Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/platforms/pseries/mobility.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index a560a98bcf3b..6a5e7467445c 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -225,8 +225,10 @@ static int add_dt_node(__be32 parent_phandle, __be32 drc_index) return -ENOENT; dn = dlpar_configure_connector(drc_index, parent_dn); - if (!dn) + if (!dn) { + of_node_put(parent_dn); return -ENOENT; + } rc = dlpar_attach_node(dn); if (rc) From f89f25b531471a6ba43f0b5658f9359fcf33a285 Mon Sep 17 00:00:00 2001 From: Gustavo Romero Date: Wed, 13 Sep 2017 22:13:48 -0400 Subject: [PATCH 107/555] powerpc/tm: Flush TM only if CPU has TM feature commit c1fa0768a8713b135848f78fd43ffc208d8ded70 upstream. Commit cd63f3c ("powerpc/tm: Fix saving of TM SPRs in core dump") added code to access TM SPRs in flush_tmregs_to_thread(). However flush_tmregs_to_thread() does not check if TM feature is available on CPU before trying to access TM SPRs in order to copy live state to thread structures. flush_tmregs_to_thread() is indeed guarded by CONFIG_PPC_TRANSACTIONAL_MEM but it might be the case that kernel was compiled with CONFIG_PPC_TRANSACTIONAL_MEM enabled and ran on a CPU without TM feature available, thus rendering the execution of TM instructions that are treated by the CPU as illegal instructions. The fix is just to add proper checking in flush_tmregs_to_thread() if CPU has the TM feature before accessing any TM-specific resource, returning immediately if TM is no available on the CPU. Adding that checking in flush_tmregs_to_thread() instead of in places where it is called, like in vsr_get() and vsr_set(), is better because avoids the same problem cropping up elsewhere. Fixes: cd63f3c ("powerpc/tm: Fix saving of TM SPRs in core dump") Signed-off-by: Gustavo Romero Reviewed-by: Cyril Bur Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kernel/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index dcbb9144c16d..d97370866a5f 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -131,7 +131,7 @@ static void flush_tmregs_to_thread(struct task_struct *tsk) * in the appropriate thread structures from live. */ - if (tsk != current) + if ((!cpu_has_feature(CPU_FTR_TM)) || (tsk != current)) return; if (MSR_TM_SUSPENDED(mfmsr())) { From c76655fb0f448de4cfadc83d3266c05a0a3c5dc0 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 1 Jun 2017 16:18:16 +0530 Subject: [PATCH 108/555] powerpc/ftrace: Pass the correct stack pointer for DYNAMIC_FTRACE_WITH_REGS commit a4979a7e71eb8da976cbe4a0a1fa50636e76b04f upstream. For DYNAMIC_FTRACE_WITH_REGS, we should be passing-in the original set of registers in pt_regs, to capture the state _before_ ftrace_caller. However, we are instead passing the stack pointer *after* allocating a stack frame in ftrace_caller. Fix this by saving the proper value of r1 in pt_regs. Also, use SAVE_10GPRS() to simplify the code. Fixes: 153086644fd1 ("powerpc/ftrace: Add support for -mprofile-kernel ftrace ABI") Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kernel/entry_64.S | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 767ef6d68c9e..caa659671599 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -1235,10 +1235,14 @@ _GLOBAL(ftrace_caller) stdu r1,-SWITCH_FRAME_SIZE(r1) /* Save all gprs to pt_regs */ - SAVE_8GPRS(0,r1) - SAVE_8GPRS(8,r1) - SAVE_8GPRS(16,r1) - SAVE_8GPRS(24,r1) + SAVE_GPR(0, r1) + SAVE_10GPRS(2, r1) + SAVE_10GPRS(12, r1) + SAVE_10GPRS(22, r1) + + /* Save previous stack pointer (r1) */ + addi r8, r1, SWITCH_FRAME_SIZE + std r8, GPR1(r1) /* Load special regs for save below */ mfmsr r8 @@ -1292,10 +1296,10 @@ ftrace_call: #endif /* Restore gprs */ - REST_8GPRS(0,r1) - REST_8GPRS(8,r1) - REST_8GPRS(16,r1) - REST_8GPRS(24,r1) + REST_GPR(0,r1) + REST_10GPRS(2,r1) + REST_10GPRS(12,r1) + REST_10GPRS(22,r1) /* Restore callee's TOC */ ld r2, 24(r1) From 22338c55658d888326bd1998f8d4328c76809053 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Mon, 18 Sep 2017 16:51:51 +0200 Subject: [PATCH 109/555] s390/mm: fix write access check in gup_huge_pmd() commit ba385c0594e723d41790ecfb12c610e6f90c7785 upstream. The check for the _SEGMENT_ENTRY_PROTECT bit in gup_huge_pmd() is the wrong way around. It must not be set for write==1, and not be checked for write==0. Fix this similar to how it was fixed for ptes long time ago in commit 25591b070336 ("[S390] fix get_user_pages_fast"). One impact of this bug would be unnecessarily using the gup slow path for write==0 on r/w mappings. A potentially more severe impact would be that gup_huge_pmd() will succeed for write==1 on r/o mappings. Signed-off-by: Gerald Schaefer Signed-off-by: Martin Schwidefsky Signed-off-by: Greg Kroah-Hartman --- arch/s390/mm/gup.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c index 18d4107e10ee..97fc449a7470 100644 --- a/arch/s390/mm/gup.c +++ b/arch/s390/mm/gup.c @@ -56,13 +56,12 @@ static inline int gup_pte_range(pmd_t *pmdp, pmd_t pmd, unsigned long addr, static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { - unsigned long mask, result; struct page *head, *page; + unsigned long mask; int refs; - result = write ? 0 : _SEGMENT_ENTRY_PROTECT; - mask = result | _SEGMENT_ENTRY_INVALID; - if ((pmd_val(pmd) & mask) != result) + mask = (write ? _SEGMENT_ENTRY_PROTECT : 0) | _SEGMENT_ENTRY_INVALID; + if ((pmd_val(pmd) & mask) != 0) return 0; VM_BUG_ON(!pfn_valid(pmd_val(pmd) >> PAGE_SHIFT)); From 3a02f8cb556402a81028469be9cf17bb3a0542cf Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 19 Sep 2017 02:22:39 +0200 Subject: [PATCH 110/555] PM: core: Fix device_pm_check_callbacks() commit 157c460e10cb6eca29ccbd0f023db159d0c55ec7 upstream. The device_pm_check_callbacks() function doesn't check legacy ->suspend and ->resume callback pointers under the device's bus type, class and driver, so in some cases it may set the no_pm_callbacks flag for the device incorrectly and then the callbacks may be skipped during system suspend/resume, which shouldn't happen. Fixes: aa8e54b55947 (PM / sleep: Go direct_complete if driver has no callbacks) Signed-off-by: Rafael J. Wysocki Signed-off-by: Greg Kroah-Hartman --- drivers/base/power/main.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 2932a5bd892f..dfffba39f723 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1757,10 +1757,13 @@ void device_pm_check_callbacks(struct device *dev) { spin_lock_irq(&dev->power.lock); dev->power.no_pm_callbacks = - (!dev->bus || pm_ops_is_empty(dev->bus->pm)) && - (!dev->class || pm_ops_is_empty(dev->class->pm)) && + (!dev->bus || (pm_ops_is_empty(dev->bus->pm) && + !dev->bus->suspend && !dev->bus->resume)) && + (!dev->class || (pm_ops_is_empty(dev->class->pm) && + !dev->class->suspend && !dev->class->resume)) && (!dev->type || pm_ops_is_empty(dev->type->pm)) && (!dev->pm_domain || pm_ops_is_empty(&dev->pm_domain->ops)) && - (!dev->driver || pm_ops_is_empty(dev->driver->pm)); + (!dev->driver || (pm_ops_is_empty(dev->driver->pm) && + !dev->driver->suspend && !dev->driver->resume)); spin_unlock_irq(&dev->power.lock); } From f2d395b7bde53926bbca40e6e091e7fe4a644b4b Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 18 Sep 2017 18:18:45 -0500 Subject: [PATCH 111/555] Fix SMB3.1.1 guest authentication to Samba commit 23586b66d84ba3184b8820277f3fc42761640f87 upstream. Samba rejects SMB3.1.1 dialect (vers=3.1.1) negotiate requests from the kernel client due to the two byte pad at the end of the negotiate contexts. Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg Signed-off-by: Greg Kroah-Hartman --- fs/cifs/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 0437e5fdba56..9d8ad4b285fa 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -366,7 +366,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req) build_encrypt_ctxt((struct smb2_encryption_neg_context *)pneg_ctxt); req->NegotiateContextOffset = cpu_to_le32(OFFSET_OF_NEG_CONTEXT); req->NegotiateContextCount = cpu_to_le16(2); - inc_rfc1001_len(req, 4 + sizeof(struct smb2_preauth_neg_context) + 2 + inc_rfc1001_len(req, 4 + sizeof(struct smb2_preauth_neg_context) + sizeof(struct smb2_encryption_neg_context)); /* calculate hash */ } #else From df1be2066433d6381efe5b092e7f662a4f1dada2 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 19 Sep 2017 18:40:03 -0500 Subject: [PATCH 112/555] SMB3: Warn user if trying to sign connection that authenticated as guest commit c721c38957fb19982416f6be71aae7b30630d83b upstream. It can be confusing if user ends up authenticated as guest but they requested signing (server will return error validating signed packets) so add log message for this. Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg Signed-off-by: Greg Kroah-Hartman --- fs/cifs/smb2pdu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 9d8ad4b285fa..b98d96a41518 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1010,6 +1010,8 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, while (sess_data->func) sess_data->func(sess_data); + if ((ses->session_flags & SMB2_SESSION_FLAG_IS_GUEST) && (ses->sign)) + cifs_dbg(VFS, "signing requested but authenticated as guest\n"); rc = sess_data->result; out: kfree(sess_data); From 0e1b85a41a25ac888fb64a60ad2949dbc2ab61ed Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 20 Sep 2017 19:57:18 -0500 Subject: [PATCH 113/555] SMB: Validate negotiate (to protect against downgrade) even if signing off commit 0603c96f3af50e2f9299fa410c224ab1d465e0f9 upstream. As long as signing is supported (ie not a guest user connection) and connection is SMB3 or SMB3.02, then validate negotiate (protect against man in the middle downgrade attacks). We had been doing this only when signing was required, not when signing was just enabled, but this more closely matches recommended SMB3 behavior and is better security. Suggested by Metze. Signed-off-by: Steve French Reviewed-by: Jeremy Allison Acked-by: Stefan Metzmacher Reviewed-by: Ronnie Sahlberg Signed-off-by: Greg Kroah-Hartman --- fs/cifs/smb2pdu.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index b98d96a41518..69b610ad3fdc 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -531,15 +531,22 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) /* * validation ioctl must be signed, so no point sending this if we - * can not sign it. We could eventually change this to selectively + * can not sign it (ie are not known user). Even if signing is not + * required (enabled but not negotiated), in those cases we selectively * sign just this, the first and only signed request on a connection. - * This is good enough for now since a user who wants better security - * would also enable signing on the mount. Having validation of - * negotiate info for signed connections helps reduce attack vectors + * Having validation of negotiate info helps reduce attack vectors. */ - if (tcon->ses->server->sign == false) + if (tcon->ses->session_flags & SMB2_SESSION_FLAG_IS_GUEST) return 0; /* validation requires signing */ + if (tcon->ses->user_name == NULL) { + cifs_dbg(FYI, "Can't validate negotiate: null user mount\n"); + return 0; /* validation requires signing */ + } + + if (tcon->ses->session_flags & SMB2_SESSION_FLAG_IS_NULL) + cifs_dbg(VFS, "Unexpected null user (anonymous) auth flag sent by server\n"); + vneg_inbuf.Capabilities = cpu_to_le32(tcon->ses->server->vals->req_capabilities); memcpy(vneg_inbuf.Guid, tcon->ses->server->client_guid, From 18a89a10b26b325da5eb03cbd275f835a4a704f5 Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 22 Sep 2017 01:40:27 -0500 Subject: [PATCH 114/555] SMB3: Don't ignore O_SYNC/O_DSYNC and O_DIRECT flags commit 1013e760d10e614dc10b5624ce9fc41563ba2e65 upstream. Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg Reviewed-by: Pavel Shilovsky Signed-off-by: Greg Kroah-Hartman --- fs/cifs/file.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 3925758f6dde..cf192f9ce254 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -224,6 +224,13 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, if (backup_cred(cifs_sb)) create_options |= CREATE_OPEN_BACKUP_INTENT; + /* O_SYNC also has bit for O_DSYNC so following check picks up either */ + if (f_flags & O_SYNC) + create_options |= CREATE_WRITE_THROUGH; + + if (f_flags & O_DIRECT) + create_options |= CREATE_NO_BUFFER; + oparms.tcon = tcon; oparms.cifs_sb = cifs_sb; oparms.desired_access = desired_access; From f3e2e7f0b4d77d8b26a2bd5525c35d9a16952517 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 25 Sep 2017 12:23:03 +0200 Subject: [PATCH 115/555] vfs: Return -ENXIO for negative SEEK_HOLE / SEEK_DATA offsets commit fc46820b27a2d9a46f7e90c9ceb4a64a1bc5fab8 upstream. In generic_file_llseek_size, return -ENXIO for negative offsets as well as offsets beyond EOF. This affects filesystems which don't implement SEEK_HOLE / SEEK_DATA internally, possibly because they don't support holes. Fixes xfstest generic/448. Signed-off-by: Andreas Gruenbacher Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/read_write.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/read_write.c b/fs/read_write.c index e479e24dcd4c..09a8757efd34 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -114,7 +114,7 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence, * In the generic case the entire file is data, so as long as * offset isn't at the end of the file then the offset is data. */ - if (offset >= eof) + if ((unsigned long long)offset >= eof) return -ENXIO; break; case SEEK_HOLE: @@ -122,7 +122,7 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence, * There is a virtual hole at the end of the file, so as long as * offset isn't i_size or larger, return i_size. */ - if (offset >= eof) + if ((unsigned long long)offset >= eof) return -ENXIO; offset = eof; break; From c820441a7a52e3626aede8df94069a50a9e4efdb Mon Sep 17 00:00:00 2001 From: Vladis Dronov Date: Wed, 13 Sep 2017 00:21:21 +0200 Subject: [PATCH 116/555] nl80211: check for the required netlink attributes presence commit e785fa0a164aa11001cba931367c7f94ffaff888 upstream. nl80211_set_rekey_data() does not check if the required attributes NL80211_REKEY_DATA_{REPLAY_CTR,KEK,KCK} are present when processing NL80211_CMD_SET_REKEY_OFFLOAD request. This request can be issued by users with CAP_NET_ADMIN privilege and may result in NULL dereference and a system crash. Add a check for the required attributes presence. This patch is based on the patch by bo Zhang. This fixes CVE-2017-12153. References: https://bugzilla.redhat.com/show_bug.cgi?id=1491046 Fixes: e5497d766ad ("cfg80211/nl80211: support GTK rekey offload") Reported-by: bo Zhang Signed-off-by: Vladis Dronov Signed-off-by: Johannes Berg Signed-off-by: Greg Kroah-Hartman --- net/wireless/nl80211.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index e9e9bc5c8773..ece0fbc08607 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -10385,6 +10385,9 @@ static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info) if (err) return err; + if (!tb[NL80211_REKEY_DATA_REPLAY_CTR] || !tb[NL80211_REKEY_DATA_KEK] || + !tb[NL80211_REKEY_DATA_KCK]) + return -EINVAL; if (nla_len(tb[NL80211_REKEY_DATA_REPLAY_CTR]) != NL80211_REPLAY_CTR_LEN) return -ERANGE; if (nla_len(tb[NL80211_REKEY_DATA_KEK]) != NL80211_KEK_LEN) From eb4375e1969c48d454998b2a284c2e6a5dc9eb68 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Sep 2017 13:54:35 +0200 Subject: [PATCH 117/555] bsg-lib: don't free job in bsg_prepare_job commit f507b54dccfd8000c517d740bc45f20c74532d18 upstream. The job structure is allocated as part of the request, so we should not free it in the error path of bsg_prepare_job. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- block/bsg-lib.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 650f427d915b..341b8d858e67 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -147,7 +147,6 @@ static int bsg_create_job(struct device *dev, struct request *req) failjob_rls_rqst_payload: kfree(job->request_payload.sg_list); failjob_rls_job: - kfree(job); return -ENOMEM; } From f184cf5256b704c41acb873575cd2a554bfe4265 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Tue, 5 Sep 2017 11:52:34 -0700 Subject: [PATCH 118/555] iw_cxgb4: remove the stid on listen create failure commit 8b1bbf36b7452c4acb20e91948eaa5e225ea6978 upstream. If a listen create fails, then the server tid (stid) is incorrectly left in the stid idr table, which can cause a touch-after-free if the stid is looked up and the already freed endpoint is touched. So make sure and remove it in the error path. Signed-off-by: Steve Wise Signed-off-by: Doug Ledford Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/hw/cxgb4/cm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 9398143d7c5e..e422a5bee855 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -3441,7 +3441,7 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog) cm_id->provider_data = ep; goto out; } - + remove_handle(ep->com.dev, &ep->com.dev->stid_idr, ep->stid); cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid, ep->com.local_addr.ss_family); fail2: From 831cca587e7b4bf03996c72a3ebf2d21146c0b44 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Wed, 13 Sep 2017 09:52:32 -0700 Subject: [PATCH 119/555] iw_cxgb4: put ep reference in pass_accept_req() commit 3d318605f5e32ff44fb290d9b67573b34213c4c8 upstream. The listening endpoint should always be dereferenced at the end of pass_accept_req(). Fixes: f86fac79afec ("RDMA/iw_cxgb4: atomic find and reference for listening endpoints") Signed-off-by: Steve Wise Signed-off-by: Doug Ledford Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/hw/cxgb4/cm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index e422a5bee855..6512a555f7f8 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -2577,9 +2577,9 @@ fail: c4iw_put_ep(&child_ep->com); reject: reject_cr(dev, hwtid, skb); +out: if (parent_ep) c4iw_put_ep(&parent_ep->com); -out: return 0; } From 58052a74d9b0a8e2aaf6e258c94a32f7c2d3aae6 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 7 Sep 2017 16:32:46 -0700 Subject: [PATCH 120/555] selftests/seccomp: Support glibc 2.26 siginfo_t.h commit 10859f3855db4c6f10dc7974ff4b3a292f3de8e0 upstream. The 2.26 release of glibc changed how siginfo_t is defined, and the earlier work-around to using the kernel definition are no longer needed. The old way needs to stay around for a while, though. Reported-by: Seth Forshee Cc: Andy Lutomirski Cc: Will Drewry Cc: Shuah Khan Cc: linux-kselftest@vger.kernel.org Signed-off-by: Kees Cook Tested-by: Seth Forshee Signed-off-by: Shuah Khan Signed-off-by: Greg Kroah-Hartman --- tools/testing/selftests/seccomp/seccomp_bpf.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 03f1fa495d74..cbb0564c0ec4 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -6,10 +6,18 @@ */ #include -#include -#define __have_siginfo_t 1 -#define __have_sigval_t 1 -#define __have_sigevent_t 1 + +/* + * glibc 2.26 and later have SIGSYS in siginfo_t. Before that, + * we need to use the kernel's siginfo.h file and trick glibc + * into accepting it. + */ +#if !__GLIBC_PREREQ(2, 26) +# include +# define __have_siginfo_t 1 +# define __have_sigval_t 1 +# define __have_sigevent_t 1 +#endif #include #include @@ -676,7 +684,7 @@ TEST_F_SIGNAL(TRAP, ign, SIGSYS) syscall(__NR_getpid); } -static struct siginfo TRAP_info; +static siginfo_t TRAP_info; static volatile int TRAP_nr; static void TRAP_action(int nr, siginfo_t *info, void *void_context) { From be69c4c00a68210e6ca5eb669b6e8d7e1ac00cb8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 27 Sep 2017 09:25:30 -0600 Subject: [PATCH 121/555] seccomp: fix the usage of get/put_seccomp_filter() in seccomp_get_filter() commit 66a733ea6b611aecf0119514d2dddab5f9d6c01e upstream. As Chris explains, get_seccomp_filter() and put_seccomp_filter() can end up using different filters. Once we drop ->siglock it is possible for task->seccomp.filter to have been replaced by SECCOMP_FILTER_FLAG_TSYNC. Fixes: f8e529ed941b ("seccomp, ptrace: add support for dumping seccomp filters") Reported-by: Chris Salls Signed-off-by: Oleg Nesterov [tycho: add __get_seccomp_filter vs. open coding refcount_inc()] Signed-off-by: Tycho Andersen [kees: tweak commit log] Signed-off-by: Kees Cook Signed-off-by: Greg Kroah-Hartman --- kernel/seccomp.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 0db7c8a2afe2..af182a6df25b 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -457,14 +457,19 @@ static long seccomp_attach_filter(unsigned int flags, return 0; } +void __get_seccomp_filter(struct seccomp_filter *filter) +{ + /* Reference count is bounded by the number of total processes. */ + atomic_inc(&filter->usage); +} + /* get_seccomp_filter - increments the reference count of the filter on @tsk */ void get_seccomp_filter(struct task_struct *tsk) { struct seccomp_filter *orig = tsk->seccomp.filter; if (!orig) return; - /* Reference count is bounded by the number of total processes. */ - atomic_inc(&orig->usage); + __get_seccomp_filter(orig); } static inline void seccomp_filter_free(struct seccomp_filter *filter) @@ -475,10 +480,8 @@ static inline void seccomp_filter_free(struct seccomp_filter *filter) } } -/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ -void put_seccomp_filter(struct task_struct *tsk) +static void __put_seccomp_filter(struct seccomp_filter *orig) { - struct seccomp_filter *orig = tsk->seccomp.filter; /* Clean up single-reference branches iteratively. */ while (orig && atomic_dec_and_test(&orig->usage)) { struct seccomp_filter *freeme = orig; @@ -487,6 +490,12 @@ void put_seccomp_filter(struct task_struct *tsk) } } +/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ +void put_seccomp_filter(struct task_struct *tsk) +{ + __put_seccomp_filter(tsk->seccomp.filter); +} + /** * seccomp_send_sigsys - signals the task to allow in-process syscall emulation * @syscall: syscall number to send to userland @@ -892,13 +901,13 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off, if (!data) goto out; - get_seccomp_filter(task); + __get_seccomp_filter(filter); spin_unlock_irq(&task->sighand->siglock); if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog))) ret = -EFAULT; - put_seccomp_filter(task); + __put_seccomp_filter(filter); return ret; out: From 7dbd64284b18423c7c81ceb6911de472d1bbadcf Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 26 Sep 2017 15:57:16 +0100 Subject: [PATCH 122/555] arm64: Make sure SPsel is always set commit 5371513fb338fb9989c569dc071326d369d6ade8 upstream. When the kernel is entered at EL2 on an ARMv8.0 system, we construct the EL1 pstate and make sure this uses the the EL1 stack pointer (we perform an exception return to EL1h). But if the kernel is either entered at EL1 or stays at EL2 (because we're on a VHE-capable system), we fail to set SPsel, and use whatever stack selection the higher exception level has choosen for us. Let's not take any chance, and make sure that SPsel is set to one before we decide the mode we're going to run in. Acked-by: Mark Rutland Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas Signed-off-by: Greg Kroah-Hartman --- arch/arm64/kernel/head.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 332e33193ccf..539bebc1222f 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -486,6 +486,7 @@ ENTRY(kimage_vaddr) * booted in EL1 or EL2 respectively. */ ENTRY(el2_setup) + msr SPsel, #1 // We want to use SP_EL{1,2} mrs x0, CurrentEL cmp x0, #CurrentEL_EL2 b.ne 1f From d49527ed4888dbfbeb3b74a41343a24e81e5517b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 29 Sep 2017 12:27:41 +0100 Subject: [PATCH 123/555] arm64: fault: Route pte translation faults via do_translation_fault commit 760bfb47c36a07741a089bf6a28e854ffbee7dc9 upstream. We currently route pte translation faults via do_page_fault, which elides the address check against TASK_SIZE before invoking the mm fault handling code. However, this can cause issues with the path walking code in conjunction with our word-at-a-time implementation because load_unaligned_zeropad can end up faulting in kernel space if it reads across a page boundary and runs into a page fault (e.g. by attempting to read from a guard region). In the case of such a fault, load_unaligned_zeropad has registered a fixup to shift the valid data and pad with zeroes, however the abort is reported as a level 3 translation fault and we dispatch it straight to do_page_fault, despite it being a kernel address. This results in calling a sleeping function from atomic context: BUG: sleeping function called from invalid context at arch/arm64/mm/fault.c:313 in_atomic(): 0, irqs_disabled(): 0, pid: 10290 Internal error: Oops - BUG: 0 [#1] PREEMPT SMP [...] [] ___might_sleep+0x134/0x144 [] __might_sleep+0x7c/0x8c [] do_page_fault+0x140/0x330 [] do_mem_abort+0x54/0xb0 Exception stack(0xfffffffb20247a70 to 0xfffffffb20247ba0) [...] [] el1_da+0x18/0x78 [] path_parentat+0x44/0x88 [] filename_parentat+0x5c/0xd8 [] filename_create+0x4c/0x128 [] SyS_mkdirat+0x50/0xc8 [] el0_svc_naked+0x24/0x28 Code: 36380080 d5384100 f9400800 9402566d (d4210000) ---[ end trace 2d01889f2bca9b9f ]--- Fix this by dispatching all translation faults to do_translation_faults, which avoids invoking the page fault logic for faults on kernel addresses. Reported-by: Ankit Jain Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas Signed-off-by: Greg Kroah-Hartman --- arch/arm64/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index fec5b1ce97f8..403fe9e57135 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -509,7 +509,7 @@ static const struct fault_info fault_info[] = { { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 0 translation fault" }, { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 1 translation fault" }, { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 2 translation fault" }, - { do_page_fault, SIGSEGV, SEGV_MAPERR, "level 3 translation fault" }, + { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 3 translation fault" }, { do_bad, SIGBUS, 0, "unknown 8" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 access flag fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 access flag fault" }, From 01c58b0edeb1d7cab6976462aef585e928924ab9 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 6 Jun 2017 12:57:04 +0200 Subject: [PATCH 124/555] KVM: VMX: extract __pi_post_block MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit cd39e1176d320157831ce030b4c869bd2d5eb142 upstream. Simple code movement patch, preparing for the next one. Cc: Huangweidong Cc: Gonglei Cc: wangxin Cc: Radim Krčmář Tested-by: Longpeng (Mike) Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx.c | 71 +++++++++++++++++++++++++--------------------- 1 file changed, 38 insertions(+), 33 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3dc6d8017ce9..98bacab13ea1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11000,6 +11000,43 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); } +static void __pi_post_block(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct pi_desc old, new; + unsigned int dest; + unsigned long flags; + + do { + old.control = new.control = pi_desc->control; + + dest = cpu_physical_id(vcpu->cpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + + /* Allow posting non-urgent interrupts */ + new.sn = 0; + + /* set 'NV' to 'notification vector' */ + new.nv = POSTED_INTR_VECTOR; + } while (cmpxchg(&pi_desc->control, old.control, + new.control) != old.control); + + if(vcpu->pre_pcpu != -1) { + spin_lock_irqsave( + &per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + list_del(&vcpu->blocked_vcpu_list); + spin_unlock_irqrestore( + &per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + vcpu->pre_pcpu = -1; + } +} + /* * This routine does the following things for vCPU which is going * to be blocked if VT-d PI is enabled. @@ -11093,44 +11130,12 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu) static void pi_post_block(struct kvm_vcpu *vcpu) { - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - struct pi_desc old, new; - unsigned int dest; - unsigned long flags; - if (!kvm_arch_has_assigned_device(vcpu->kvm) || !irq_remapping_cap(IRQ_POSTING_CAP) || !kvm_vcpu_apicv_active(vcpu)) return; - do { - old.control = new.control = pi_desc->control; - - dest = cpu_physical_id(vcpu->cpu); - - if (x2apic_enabled()) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - - /* Allow posting non-urgent interrupts */ - new.sn = 0; - - /* set 'NV' to 'notification vector' */ - new.nv = POSTED_INTR_VECTOR; - } while (cmpxchg(&pi_desc->control, old.control, - new.control) != old.control); - - if(vcpu->pre_pcpu != -1) { - spin_lock_irqsave( - &per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - list_del(&vcpu->blocked_vcpu_list); - spin_unlock_irqrestore( - &per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - vcpu->pre_pcpu = -1; - } + __pi_post_block(vcpu); } static void vmx_post_block(struct kvm_vcpu *vcpu) From ff5eb8f28ff260873909fbd259cf892594621fc4 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 6 Jun 2017 12:57:05 +0200 Subject: [PATCH 125/555] KVM: VMX: avoid double list add with VT-d posted interrupts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 8b306e2f3c41939ea528e6174c88cfbfff893ce1 upstream. In some cases, for example involving hot-unplug of assigned devices, pi_post_block can forget to remove the vCPU from the blocked_vcpu_list. When this happens, the next call to pi_pre_block corrupts the list. Fix this in two ways. First, check vcpu->pre_pcpu in pi_pre_block and WARN instead of adding the element twice in the list. Second, always do the list removal in pi_post_block if vcpu->pre_pcpu is set (not -1). The new code keeps interrupts disabled for the whole duration of pi_pre_block/pi_post_block. This is not strictly necessary, but easier to follow. For the same reason, PI.ON is checked only after the cmpxchg, and to handle it we just call the post-block code. This removes duplication of the list removal code. Cc: Huangweidong Cc: Gonglei Cc: wangxin Cc: Radim Krčmář Tested-by: Longpeng (Mike) Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx.c | 62 +++++++++++++++++++--------------------------- 1 file changed, 25 insertions(+), 37 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 98bacab13ea1..400ea919332d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11005,10 +11005,11 @@ static void __pi_post_block(struct kvm_vcpu *vcpu) struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); struct pi_desc old, new; unsigned int dest; - unsigned long flags; do { old.control = new.control = pi_desc->control; + WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR, + "Wakeup handler not enabled while the VCPU is blocked\n"); dest = cpu_physical_id(vcpu->cpu); @@ -11025,14 +11026,10 @@ static void __pi_post_block(struct kvm_vcpu *vcpu) } while (cmpxchg(&pi_desc->control, old.control, new.control) != old.control); - if(vcpu->pre_pcpu != -1) { - spin_lock_irqsave( - &per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); + if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) { + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); list_del(&vcpu->blocked_vcpu_list); - spin_unlock_irqrestore( - &per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); vcpu->pre_pcpu = -1; } } @@ -11052,7 +11049,6 @@ static void __pi_post_block(struct kvm_vcpu *vcpu) */ static int pi_pre_block(struct kvm_vcpu *vcpu) { - unsigned long flags; unsigned int dest; struct pi_desc old, new; struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); @@ -11062,34 +11058,20 @@ static int pi_pre_block(struct kvm_vcpu *vcpu) !kvm_vcpu_apicv_active(vcpu)) return 0; - vcpu->pre_pcpu = vcpu->cpu; - spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - list_add_tail(&vcpu->blocked_vcpu_list, - &per_cpu(blocked_vcpu_on_cpu, - vcpu->pre_pcpu)); - spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); + WARN_ON(irqs_disabled()); + local_irq_disable(); + if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) { + vcpu->pre_pcpu = vcpu->cpu; + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + list_add_tail(&vcpu->blocked_vcpu_list, + &per_cpu(blocked_vcpu_on_cpu, + vcpu->pre_pcpu)); + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + } do { old.control = new.control = pi_desc->control; - /* - * We should not block the vCPU if - * an interrupt is posted for it. - */ - if (pi_test_on(pi_desc) == 1) { - spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - list_del(&vcpu->blocked_vcpu_list); - spin_unlock_irqrestore( - &per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - vcpu->pre_pcpu = -1; - - return 1; - } - WARN((pi_desc->sn == 1), "Warning: SN field of posted-interrupts " "is set before blocking\n"); @@ -11114,7 +11096,12 @@ static int pi_pre_block(struct kvm_vcpu *vcpu) } while (cmpxchg(&pi_desc->control, old.control, new.control) != old.control); - return 0; + /* We should not block the vCPU if an interrupt is posted for it. */ + if (pi_test_on(pi_desc) == 1) + __pi_post_block(vcpu); + + local_irq_enable(); + return (vcpu->pre_pcpu == -1); } static int vmx_pre_block(struct kvm_vcpu *vcpu) @@ -11130,12 +11117,13 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu) static void pi_post_block(struct kvm_vcpu *vcpu) { - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) + if (vcpu->pre_pcpu == -1) return; + WARN_ON(irqs_disabled()); + local_irq_disable(); __pi_post_block(vcpu); + local_irq_enable(); } static void vmx_post_block(struct kvm_vcpu *vcpu) From 58d2fb119ae61fe5ae27586c98ce3664127c6177 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 6 Jun 2017 12:57:06 +0200 Subject: [PATCH 126/555] KVM: VMX: simplify and fix vmx_vcpu_pi_load MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 31afb2ea2b10a7d17ce3db4cdb0a12b63b2fe08a upstream. The simplify part: do not touch pi_desc.nv, we can set it when the VCPU is first created. Likewise, pi_desc.sn is only handled by vmx_vcpu_pi_load, do not touch it in __pi_post_block. The fix part: do not check kvm_arch_has_assigned_device, instead check the SN bit to figure out whether vmx_vcpu_pi_put ran before. This matches what the previous patch did in pi_post_block. Cc: Huangweidong Cc: Gonglei Cc: wangxin Cc: Radim Krčmář Tested-by: Longpeng (Mike) Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx.c | 68 ++++++++++++++++++++++++---------------------- 1 file changed, 35 insertions(+), 33 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 400ea919332d..489e4a0c8b3f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2167,43 +2167,41 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) struct pi_desc old, new; unsigned int dest; - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) + /* + * In case of hot-plug or hot-unplug, we may have to undo + * vmx_vcpu_pi_put even if there is no assigned device. And we + * always keep PI.NDST up to date for simplicity: it makes the + * code easier, and CPU migration is not a fast path. + */ + if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) return; + /* + * First handle the simple case where no cmpxchg is necessary; just + * allow posting non-urgent interrupts. + * + * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change + * PI.NDST: pi_post_block will do it for us and the wakeup_handler + * expects the VCPU to be on the blocked_vcpu_list that matches + * PI.NDST. + */ + if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || + vcpu->cpu == cpu) { + pi_clear_sn(pi_desc); + return; + } + + /* The full case. */ do { old.control = new.control = pi_desc->control; - /* - * If 'nv' field is POSTED_INTR_WAKEUP_VECTOR, there - * are two possible cases: - * 1. After running 'pre_block', context switch - * happened. For this case, 'sn' was set in - * vmx_vcpu_put(), so we need to clear it here. - * 2. After running 'pre_block', we were blocked, - * and woken up by some other guy. For this case, - * we don't need to do anything, 'pi_post_block' - * will do everything for us. However, we cannot - * check whether it is case #1 or case #2 here - * (maybe, not needed), so we also clear sn here, - * I think it is not a big deal. - */ - if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR) { - if (vcpu->cpu != cpu) { - dest = cpu_physical_id(cpu); + dest = cpu_physical_id(cpu); - if (x2apic_enabled()) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - } + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; - /* set 'NV' to 'notification vector' */ - new.nv = POSTED_INTR_VECTOR; - } - - /* Allow posting non-urgent interrupts */ new.sn = 0; } while (cmpxchg(&pi_desc->control, old.control, new.control) != old.control); @@ -9187,6 +9185,13 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED; + /* + * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR + * or POSTED_INTR_WAKEUP_VECTOR. + */ + vmx->pi_desc.nv = POSTED_INTR_VECTOR; + vmx->pi_desc.sn = 1; + return &vmx->vcpu; free_vmcs: @@ -11018,9 +11023,6 @@ static void __pi_post_block(struct kvm_vcpu *vcpu) else new.ndst = (dest << 8) & 0xFF00; - /* Allow posting non-urgent interrupts */ - new.sn = 0; - /* set 'NV' to 'notification vector' */ new.nv = POSTED_INTR_VECTOR; } while (cmpxchg(&pi_desc->control, old.control, From e3a643b3288af043f253cfd7e5bf5a4889964d7c Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 29 Sep 2017 19:01:45 +0800 Subject: [PATCH 127/555] kvm/x86: Handle async PF in RCU read-side critical sections commit b862789aa5186d5ea3a024b7cfe0f80c3a38b980 upstream. Sasha Levin reported a WARNING: | WARNING: CPU: 0 PID: 6974 at kernel/rcu/tree_plugin.h:329 | rcu_preempt_note_context_switch kernel/rcu/tree_plugin.h:329 [inline] | WARNING: CPU: 0 PID: 6974 at kernel/rcu/tree_plugin.h:329 | rcu_note_context_switch+0x16c/0x2210 kernel/rcu/tree.c:458 ... | CPU: 0 PID: 6974 Comm: syz-fuzzer Not tainted 4.13.0-next-20170908+ #246 | Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS | 1.10.1-1ubuntu1 04/01/2014 | Call Trace: ... | RIP: 0010:rcu_preempt_note_context_switch kernel/rcu/tree_plugin.h:329 [inline] | RIP: 0010:rcu_note_context_switch+0x16c/0x2210 kernel/rcu/tree.c:458 | RSP: 0018:ffff88003b2debc8 EFLAGS: 00010002 | RAX: 0000000000000001 RBX: 1ffff1000765bd85 RCX: 0000000000000000 | RDX: 1ffff100075d7882 RSI: ffffffffb5c7da20 RDI: ffff88003aebc410 | RBP: ffff88003b2def30 R08: dffffc0000000000 R09: 0000000000000001 | R10: 0000000000000000 R11: 0000000000000000 R12: ffff88003b2def08 | R13: 0000000000000000 R14: ffff88003aebc040 R15: ffff88003aebc040 | __schedule+0x201/0x2240 kernel/sched/core.c:3292 | schedule+0x113/0x460 kernel/sched/core.c:3421 | kvm_async_pf_task_wait+0x43f/0x940 arch/x86/kernel/kvm.c:158 | do_async_page_fault+0x72/0x90 arch/x86/kernel/kvm.c:271 | async_page_fault+0x22/0x30 arch/x86/entry/entry_64.S:1069 | RIP: 0010:format_decode+0x240/0x830 lib/vsprintf.c:1996 | RSP: 0018:ffff88003b2df520 EFLAGS: 00010283 | RAX: 000000000000003f RBX: ffffffffb5d1e141 RCX: ffff88003b2df670 | RDX: 0000000000000001 RSI: dffffc0000000000 RDI: ffffffffb5d1e140 | RBP: ffff88003b2df560 R08: dffffc0000000000 R09: 0000000000000000 | R10: ffff88003b2df718 R11: 0000000000000000 R12: ffff88003b2df5d8 | R13: 0000000000000064 R14: ffffffffb5d1e140 R15: 0000000000000000 | vsnprintf+0x173/0x1700 lib/vsprintf.c:2136 | sprintf+0xbe/0xf0 lib/vsprintf.c:2386 | proc_self_get_link+0xfb/0x1c0 fs/proc/self.c:23 | get_link fs/namei.c:1047 [inline] | link_path_walk+0x1041/0x1490 fs/namei.c:2127 ... This happened when the host hit a page fault, and delivered it as in an async page fault, while the guest was in an RCU read-side critical section. The guest then tries to reschedule in kvm_async_pf_task_wait(), but rcu_preempt_note_context_switch() would treat the reschedule as a sleep in RCU read-side critical section, which is not allowed (even in preemptible RCU). Thus the WARN. To cure this, make kvm_async_pf_task_wait() go to the halt path if the PF happens in a RCU read-side critical section. Reported-by: Sasha Levin Cc: "Paul E. McKenney" Cc: Peter Zijlstra Signed-off-by: Boqun Feng Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/kvm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 55ffd9dc2258..77f17cbfe271 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -141,7 +141,8 @@ void kvm_async_pf_task_wait(u32 token) n.token = token; n.cpu = smp_processor_id(); - n.halted = is_idle_task(current) || preempt_count() > 1; + n.halted = is_idle_task(current) || preempt_count() > 1 || + rcu_preempt_depth(); init_swait_queue_head(&n.wq); hlist_add_head(&n.link, &b->list); raw_spin_unlock(&b->lock); From 3d4213fac7d10e72859112c9100d8015ce442a3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=2E=20Sch=C3=B6nherr?= Date: Thu, 7 Sep 2017 19:02:30 +0100 Subject: [PATCH 128/555] KVM: VMX: Do not BUG() on out-of-bounds guest IRQ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 3a8b0677fc6180a467e26cc32ce6b0c09a32f9bb upstream. The value of the guest_irq argument to vmx_update_pi_irte() is ultimately coming from a KVM_IRQFD API call. Do not BUG() in vmx_update_pi_irte() if the value is out-of bounds. (Especially, since KVM as a whole seems to hang after that.) Instead, print a message only once if we find that we don't have a route for a certain IRQ (which can be out-of-bounds or within the array). This fixes CVE-2017-1000252. Fixes: efc644048ecde54 ("KVM: x86: Update IRTE for posted-interrupts") Signed-off-by: Jan H. Schönherr Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 489e4a0c8b3f..53862744c0db 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11153,7 +11153,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, struct kvm_lapic_irq irq; struct kvm_vcpu *vcpu; struct vcpu_data vcpu_info; - int idx, ret = -EINVAL; + int idx, ret = 0; if (!kvm_arch_has_assigned_device(kvm) || !irq_remapping_cap(IRQ_POSTING_CAP) || @@ -11162,7 +11162,12 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, idx = srcu_read_lock(&kvm->irq_srcu); irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); - BUG_ON(guest_irq >= irq_rt->nr_rt_entries); + if (guest_irq >= irq_rt->nr_rt_entries || + hlist_empty(&irq_rt->map[guest_irq])) { + pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", + guest_irq, irq_rt->nr_rt_entries); + goto out; + } hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { if (e->type != KVM_IRQ_ROUTING_MSI) From 86ef97b2dfd504fbc65f6b244a422db0c1b15797 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Tue, 12 Sep 2017 13:02:54 -0700 Subject: [PATCH 129/555] kvm: nVMX: Don't allow L2 to access the hardware CR8 commit 51aa68e7d57e3217192d88ce90fd5b8ef29ec94f upstream. If L1 does not specify the "use TPR shadow" VM-execution control in vmcs12, then L0 must specify the "CR8-load exiting" and "CR8-store exiting" VM-execution controls in vmcs02. Failure to do so will give the L2 VM unrestricted read/write access to the hardware CR8. This fixes CVE-2017-12154. Signed-off-by: Jim Mattson Reviewed-by: David Hildenbrand Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 53862744c0db..a29f5452f83d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10001,6 +10001,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, page_to_phys(vmx->nested.virtual_apic_page)); vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); + } else { +#ifdef CONFIG_X86_64 + exec_control |= CPU_BASED_CR8_LOAD_EXITING | + CPU_BASED_CR8_STORE_EXITING; +#endif } if (cpu_has_vmx_msr_bitmap() && From 02c7d98bec6cf34ee7079fa503faffc82e17f575 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Mon, 18 Sep 2017 14:46:03 -0700 Subject: [PATCH 130/555] xfs: validate bdev support for DAX inode flag commit 6851a3db7e224bbb85e23b3c64a506c9e0904382 upstream. Currently only the blocksize is checked, but we should really be calling bdev_dax_supported() which also tests to make sure we can get a struct dax_device and that the dax_direct_access() path is working. This is the same check that we do for the "-o dax" mount option in xfs_fs_fill_super(). This does not fix the race issues that caused the XFS DAX inode option to be disabled, so that option will still be disabled. If/when we re-enable it, though, I think we will want this issue to have been fixed. I also do think that we want to fix this in stable kernels. Signed-off-by: Ross Zwisler Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_ioctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index bce2e260f55e..6c95812120eb 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1085,6 +1085,7 @@ xfs_ioctl_setattr_dax_invalidate( int *join_flags) { struct inode *inode = VFS_I(ip); + struct super_block *sb = inode->i_sb; int error; *join_flags = 0; @@ -1097,7 +1098,7 @@ xfs_ioctl_setattr_dax_invalidate( if (fa->fsx_xflags & FS_XFLAG_DAX) { if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) return -EINVAL; - if (ip->i_mount->m_sb.sb_blocksize != PAGE_SIZE) + if (bdev_dax_supported(sb, sb->s_blocksize) < 0) return -EINVAL; } From 46f062e05920a4d40dbe32aa4bc622df33db6fb2 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Mon, 11 Sep 2017 15:29:31 +0200 Subject: [PATCH 131/555] etnaviv: fix gem object list corruption commit 518417525f3652c12fb5fad6da4ade66c0072fa3 upstream. All manipulations of the gem_object list need to be protected by the list mutex, as GEM objects can be created and freed in parallel. This fixes a kernel memory corruption. Signed-off-by: Lucas Stach Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/etnaviv/etnaviv_gem.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c index 0370b842d9cc..82dd57d4843c 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c @@ -549,12 +549,15 @@ static const struct etnaviv_gem_ops etnaviv_gem_shmem_ops = { void etnaviv_gem_free_object(struct drm_gem_object *obj) { struct etnaviv_gem_object *etnaviv_obj = to_etnaviv_bo(obj); + struct etnaviv_drm_private *priv = obj->dev->dev_private; struct etnaviv_vram_mapping *mapping, *tmp; /* object should not be active */ WARN_ON(is_active(etnaviv_obj)); + mutex_lock(&priv->gem_lock); list_del(&etnaviv_obj->gem_node); + mutex_unlock(&priv->gem_lock); list_for_each_entry_safe(mapping, tmp, &etnaviv_obj->vram_list, obj_node) { From bb1e06d281a82c75487fb7ddf25e540b82db4306 Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Mon, 11 Sep 2017 09:45:40 +0200 Subject: [PATCH 132/555] PCI: Fix race condition with driver_override commit 9561475db680f7144d2223a409dd3d7e322aca03 upstream. The driver_override implementation is susceptible to a race condition when different threads are reading vs. storing a different driver override. Add locking to avoid the race condition. This is in close analogy to commit 6265539776a0 ("driver core: platform: fix race condition with driver_override") from Adrian Salido. Fixes: 782a985d7af2 ("PCI: Introduce new device binding path using pci_dev.driver_override") Signed-off-by: Nicolai Stange Signed-off-by: Bjorn Helgaas Signed-off-by: Greg Kroah-Hartman --- drivers/pci/pci-sysfs.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 1b0786555394..f9f4d1c18eb2 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -527,7 +527,7 @@ static ssize_t driver_override_store(struct device *dev, const char *buf, size_t count) { struct pci_dev *pdev = to_pci_dev(dev); - char *driver_override, *old = pdev->driver_override, *cp; + char *driver_override, *old, *cp; /* We need to keep extra room for a newline */ if (count >= (PAGE_SIZE - 1)) @@ -541,12 +541,15 @@ static ssize_t driver_override_store(struct device *dev, if (cp) *cp = '\0'; + device_lock(dev); + old = pdev->driver_override; if (strlen(driver_override)) { pdev->driver_override = driver_override; } else { kfree(driver_override); pdev->driver_override = NULL; } + device_unlock(dev); kfree(old); @@ -557,8 +560,12 @@ static ssize_t driver_override_show(struct device *dev, struct device_attribute *attr, char *buf) { struct pci_dev *pdev = to_pci_dev(dev); + ssize_t len; - return snprintf(buf, PAGE_SIZE, "%s\n", pdev->driver_override); + device_lock(dev); + len = snprintf(buf, PAGE_SIZE, "%s\n", pdev->driver_override); + device_unlock(dev); + return len; } static DEVICE_ATTR_RW(driver_override); From b86b6c226beafc28d5935ebb99590348cb48b633 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Fri, 25 Aug 2017 14:15:14 +0900 Subject: [PATCH 133/555] btrfs: fix NULL pointer dereference from free_reloc_roots() commit bb166d7207432d3c7d10c45dc052f12ba3a2121d upstream. __del_reloc_root should be called before freeing up reloc_root->node. If not, calling __del_reloc_root() dereference reloc_root->node, causing the system BUG. Fixes: 6bdf131fac23 ("Btrfs: don't leak reloc root nodes on error") Signed-off-by: Naohiro Aota Reviewed-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/relocation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 2cf5e142675e..04c61bcf62e5 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2367,11 +2367,11 @@ void free_reloc_roots(struct list_head *list) while (!list_empty(list)) { reloc_root = list_entry(list->next, struct btrfs_root, root_list); + __del_reloc_root(reloc_root); free_extent_buffer(reloc_root->node); free_extent_buffer(reloc_root->commit_root); reloc_root->node = NULL; reloc_root->commit_root = NULL; - __del_reloc_root(reloc_root); } } From ba44bc49bae6e9e25630388fefbbaa6c6bdd0a11 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Fri, 8 Sep 2017 17:48:55 +0900 Subject: [PATCH 134/555] btrfs: propagate error to btrfs_cmp_data_prepare caller commit 78ad4ce014d025f41b8dde3a81876832ead643cf upstream. btrfs_cmp_data_prepare() (almost) always returns 0 i.e. ignoring errors from gather_extent_pages(). While the pages are freed by btrfs_cmp_data_free(), cmp->num_pages still has > 0. Then, btrfs_extent_same() try to access the already freed pages causing faults (or violates PageLocked assertion). This patch just return the error as is so that the caller stop the process. Signed-off-by: Naohiro Aota Fixes: f441460202cb ("btrfs: fix deadlock with extent-same and readpage") Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 1782804f6c26..90185e94d79d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3052,7 +3052,7 @@ static int btrfs_cmp_data_prepare(struct inode *src, u64 loff, out: if (ret) btrfs_cmp_data_free(cmp); - return 0; + return ret; } static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst, From f11525d7ff5d784270a66c8b888705dd9b96620b Mon Sep 17 00:00:00 2001 From: satoru takeuchi Date: Tue, 12 Sep 2017 22:42:52 +0900 Subject: [PATCH 135/555] btrfs: prevent to set invalid default subvolid commit 6d6d282932d1a609e60dc4467677e0e863682f57 upstream. `btrfs sub set-default` succeeds to set an ID which isn't corresponding to any fs/file tree. If such the bad ID is set to a filesystem, we can't mount this filesystem without specifying `subvol` or `subvolid` mount options. Fixes: 6ef5ed0d386b ("Btrfs: add ioctl and incompat flag to set the default mount subvol") Signed-off-by: Satoru Takeuchi Reviewed-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/ioctl.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 90185e94d79d..0fe346c4bd28 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4082,6 +4082,10 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) ret = PTR_ERR(new_root); goto out; } + if (!is_fstree(new_root->objectid)) { + ret = -ENOENT; + goto out; + } path = btrfs_alloc_path(); if (!path) { From 54af98f86b925573bfaf5254bab50e4a01df1526 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Mon, 4 Sep 2017 10:32:15 +0200 Subject: [PATCH 136/555] x86/mm: Fix fault error path using unsafe vma pointer commit a3c4fb7c9c2ebfd50b8c60f6c069932bb319bc37 upstream. commit 7b2d0dbac489 ("x86/mm/pkeys: Pass VMA down in to fault signal generation code") passes down a vma pointer to the error path, but that is done once the mmap_sem is released when calling mm_fault_error() from __do_page_fault(). This is dangerous as the vma structure is no more safe to be used once the mmap_sem has been released. As only the protection key value is required in the error processing, we could just pass down this value. Fix it by passing a pointer to a protection key value down to the fault signal generation code. The use of a pointer allows to keep the check generating a warning message in fill_sig_info_pkey() when the vma was not known. If the pointer is valid, the protection value can be accessed by deferencing the pointer. [ tglx: Made *pkey u32 as that's the type which is passed in siginfo ] Fixes: 7b2d0dbac489 ("x86/mm/pkeys: Pass VMA down in to fault signal generation code") Signed-off-by: Laurent Dufour Signed-off-by: Thomas Gleixner Cc: linux-mm@kvack.org Cc: Dave Hansen Link: http://lkml.kernel.org/r/1504513935-12742-1-git-send-email-ldufour@linux.vnet.ibm.com Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/fault.c | 47 +++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 9f72ca3b2669..1dd796025472 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -191,8 +191,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really * faulted on a pte with its pkey=4. */ -static void fill_sig_info_pkey(int si_code, siginfo_t *info, - struct vm_area_struct *vma) +static void fill_sig_info_pkey(int si_code, siginfo_t *info, u32 *pkey) { /* This is effectively an #ifdef */ if (!boot_cpu_has(X86_FEATURE_OSPKE)) @@ -208,7 +207,7 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info, * valid VMA, so we should never reach this without a * valid VMA. */ - if (!vma) { + if (!pkey) { WARN_ONCE(1, "PKU fault with no VMA passed in"); info->si_pkey = 0; return; @@ -218,13 +217,12 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info, * absolutely guranteed to be 100% accurate because of * the race explained above. */ - info->si_pkey = vma_pkey(vma); + info->si_pkey = *pkey; } static void force_sig_info_fault(int si_signo, int si_code, unsigned long address, - struct task_struct *tsk, struct vm_area_struct *vma, - int fault) + struct task_struct *tsk, u32 *pkey, int fault) { unsigned lsb = 0; siginfo_t info; @@ -239,7 +237,7 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address, lsb = PAGE_SHIFT; info.si_addr_lsb = lsb; - fill_sig_info_pkey(si_code, &info, vma); + fill_sig_info_pkey(si_code, &info, pkey); force_sig_info(si_signo, &info, tsk); } @@ -718,8 +716,6 @@ no_context(struct pt_regs *regs, unsigned long error_code, struct task_struct *tsk = current; unsigned long flags; int sig; - /* No context means no VMA to pass down */ - struct vm_area_struct *vma = NULL; /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs, X86_TRAP_PF)) { @@ -744,7 +740,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, /* XXX: hwpoison faults will set the wrong code. */ force_sig_info_fault(signal, si_code, address, - tsk, vma, 0); + tsk, NULL, 0); } /* @@ -853,8 +849,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, static void __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, - unsigned long address, struct vm_area_struct *vma, - int si_code) + unsigned long address, u32 *pkey, int si_code) { struct task_struct *tsk = current; @@ -902,7 +897,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_PF; - force_sig_info_fault(SIGSEGV, si_code, address, tsk, vma, 0); + force_sig_info_fault(SIGSEGV, si_code, address, tsk, pkey, 0); return; } @@ -915,9 +910,9 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, static noinline void bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, - unsigned long address, struct vm_area_struct *vma) + unsigned long address, u32 *pkey) { - __bad_area_nosemaphore(regs, error_code, address, vma, SEGV_MAPERR); + __bad_area_nosemaphore(regs, error_code, address, pkey, SEGV_MAPERR); } static void @@ -925,6 +920,10 @@ __bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address, struct vm_area_struct *vma, int si_code) { struct mm_struct *mm = current->mm; + u32 pkey; + + if (vma) + pkey = vma_pkey(vma); /* * Something tried to access memory that isn't in our memory map.. @@ -932,7 +931,8 @@ __bad_area(struct pt_regs *regs, unsigned long error_code, */ up_read(&mm->mmap_sem); - __bad_area_nosemaphore(regs, error_code, address, vma, si_code); + __bad_area_nosemaphore(regs, error_code, address, + (vma) ? &pkey : NULL, si_code); } static noinline void @@ -975,7 +975,7 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code, static void do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, - struct vm_area_struct *vma, unsigned int fault) + u32 *pkey, unsigned int fault) { struct task_struct *tsk = current; int code = BUS_ADRERR; @@ -1002,13 +1002,12 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, code = BUS_MCEERR_AR; } #endif - force_sig_info_fault(SIGBUS, code, address, tsk, vma, fault); + force_sig_info_fault(SIGBUS, code, address, tsk, pkey, fault); } static noinline void mm_fault_error(struct pt_regs *regs, unsigned long error_code, - unsigned long address, struct vm_area_struct *vma, - unsigned int fault) + unsigned long address, u32 *pkey, unsigned int fault) { if (fatal_signal_pending(current) && !(error_code & PF_USER)) { no_context(regs, error_code, address, 0, 0); @@ -1032,9 +1031,9 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, } else { if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| VM_FAULT_HWPOISON_LARGE)) - do_sigbus(regs, error_code, address, vma, fault); + do_sigbus(regs, error_code, address, pkey, fault); else if (fault & VM_FAULT_SIGSEGV) - bad_area_nosemaphore(regs, error_code, address, vma); + bad_area_nosemaphore(regs, error_code, address, pkey); else BUG(); } @@ -1220,6 +1219,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, struct mm_struct *mm; int fault, major = 0; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + u32 pkey; tsk = current; mm = tsk->mm; @@ -1420,9 +1420,10 @@ good_area: return; } + pkey = vma_pkey(vma); up_read(&mm->mmap_sem); if (unlikely(fault & VM_FAULT_ERROR)) { - mm_fault_error(regs, error_code, address, vma, fault); + mm_fault_error(regs, error_code, address, &pkey, fault); return; } From 5e9b07f30d21295b83f2024ffb5a349d3af6f749 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 2 Oct 2017 11:01:40 -0700 Subject: [PATCH 137/555] x86/fpu: Don't let userspace set bogus xcomp_bv commit 814fb7bb7db5433757d76f4c4502c96fc53b0b5e upstream. On x86, userspace can use the ptrace() or rt_sigreturn() system calls to set a task's extended state (xstate) or "FPU" registers. ptrace() can set them for another task using the PTRACE_SETREGSET request with NT_X86_XSTATE, while rt_sigreturn() can set them for the current task. In either case, registers can be set to any value, but the kernel assumes that the XSAVE area itself remains valid in the sense that the CPU can restore it. However, in the case where the kernel is using the uncompacted xstate format (which it does whenever the XSAVES instruction is unavailable), it was possible for userspace to set the xcomp_bv field in the xstate_header to an arbitrary value. However, all bits in that field are reserved in the uncompacted case, so when switching to a task with nonzero xcomp_bv, the XRSTOR instruction failed with a #GP fault. This caused the WARN_ON_FPU(err) in copy_kernel_to_xregs() to be hit. In addition, since the error is otherwise ignored, the FPU registers from the task previously executing on the CPU were leaked. Fix the bug by checking that the user-supplied value of xcomp_bv is 0 in the uncompacted case, and returning an error otherwise. The reason for validating xcomp_bv rather than simply overwriting it with 0 is that we want userspace to see an error if it (incorrectly) provides an XSAVE area in compacted format rather than in uncompacted format. Note that as before, in case of error we clear the task's FPU state. This is perhaps non-ideal, especially for PTRACE_SETREGSET; it might be better to return an error before changing anything. But it seems the "clear on error" behavior is fine for now, and it's a little tricky to do otherwise because it would mean we couldn't simply copy the full userspace state into kernel memory in one __copy_from_user(). This bug was found by syzkaller, which hit the above-mentioned WARN_ON_FPU(): WARNING: CPU: 1 PID: 0 at ./arch/x86/include/asm/fpu/internal.h:373 __switch_to+0x5b5/0x5d0 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.13.0 #453 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 task: ffff9ba2bc8e42c0 task.stack: ffffa78cc036c000 RIP: 0010:__switch_to+0x5b5/0x5d0 RSP: 0000:ffffa78cc08bbb88 EFLAGS: 00010082 RAX: 00000000fffffffe RBX: ffff9ba2b8bf2180 RCX: 00000000c0000100 RDX: 00000000ffffffff RSI: 000000005cb10700 RDI: ffff9ba2b8bf36c0 RBP: ffffa78cc08bbbd0 R08: 00000000929fdf46 R09: 0000000000000001 R10: 0000000000000000 R11: 0000000000000000 R12: ffff9ba2bc8e42c0 R13: 0000000000000000 R14: ffff9ba2b8bf3680 R15: ffff9ba2bf5d7b40 FS: 00007f7e5cb10700(0000) GS:ffff9ba2bf400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000004005cc CR3: 0000000079fd5000 CR4: 00000000001406e0 Call Trace: Code: 84 00 00 00 00 00 e9 11 fd ff ff 0f ff 66 0f 1f 84 00 00 00 00 00 e9 e7 fa ff ff 0f ff 66 0f 1f 84 00 00 00 00 00 e9 c2 fa ff ff <0f> ff 66 0f 1f 84 00 00 00 00 00 e9 d4 fc ff ff 66 66 2e 0f 1f Here is a C reproducer. The expected behavior is that the program spin forever with no output. However, on a buggy kernel running on a processor with the "xsave" feature but without the "xsaves" feature (e.g. Sandy Bridge through Broadwell for Intel), within a second or two the program reports that the xmm registers were corrupted, i.e. were not restored correctly. With CONFIG_X86_DEBUG_FPU=y it also hits the above kernel warning. #define _GNU_SOURCE #include #include #include #include #include #include #include #include int main(void) { int pid = fork(); uint64_t xstate[512]; struct iovec iov = { .iov_base = xstate, .iov_len = sizeof(xstate) }; if (pid == 0) { bool tracee = true; for (int i = 0; i < sysconf(_SC_NPROCESSORS_ONLN) && tracee; i++) tracee = (fork() != 0); uint32_t xmm0[4] = { [0 ... 3] = tracee ? 0x00000000 : 0xDEADBEEF }; asm volatile(" movdqu %0, %%xmm0\n" " mov %0, %%rbx\n" "1: movdqu %%xmm0, %0\n" " mov %0, %%rax\n" " cmp %%rax, %%rbx\n" " je 1b\n" : "+m" (xmm0) : : "rax", "rbx", "xmm0"); printf("BUG: xmm registers corrupted! tracee=%d, xmm0=%08X%08X%08X%08X\n", tracee, xmm0[0], xmm0[1], xmm0[2], xmm0[3]); } else { usleep(100000); ptrace(PTRACE_ATTACH, pid, 0, 0); wait(NULL); ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov); xstate[65] = -1; ptrace(PTRACE_SETREGSET, pid, NT_X86_XSTATE, &iov); ptrace(PTRACE_CONT, pid, 0, 0); wait(NULL); } return 1; } Note: the program only tests for the bug using the ptrace() system call. The bug can also be reproduced using the rt_sigreturn() system call, but only when called from a 32-bit program, since for 64-bit programs the kernel restores the FPU state from the signal frame by doing XRSTOR directly from userspace memory (with proper error checking). Reported-by: Dmitry Vyukov Signed-off-by: Eric Biggers Reviewed-by: Kees Cook Reviewed-by: Rik van Riel Acked-by: Dave Hansen Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Eric Biggers Cc: Fenghua Yu Cc: Kevin Hao Cc: Linus Torvalds Cc: Michael Halcrow Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yu-cheng Yu Cc: kernel-hardening@lists.openwall.com Fixes: 0b29643a5843 ("x86/xsaves: Change compacted format xsave area header") Link: http://lkml.kernel.org/r/20170922174156.16780-2-ebiggers3@gmail.com Link: http://lkml.kernel.org/r/20170923130016.21448-25-mingo@kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/fpu/regset.c | 9 +++++++-- arch/x86/kernel/fpu/signal.c | 4 ++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index c114b132d121..7052d9a65fe9 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -130,11 +130,16 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, fpu__activate_fpstate_write(fpu); - if (boot_cpu_has(X86_FEATURE_XSAVES)) + if (boot_cpu_has(X86_FEATURE_XSAVES)) { ret = copyin_to_xsaves(kbuf, ubuf, xsave); - else + } else { ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); + /* xcomp_bv must be 0 when using uncompacted format */ + if (!ret && xsave->header.xcomp_bv) + ret = -EINVAL; + } + /* * In case of failure, mark all states as init: */ diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index a184c210efba..3ec0d2d64601 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -329,6 +329,10 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) } else { err = __copy_from_user(&fpu->state.xsave, buf_fx, state_size); + + /* xcomp_bv must be 0 when using uncompacted format */ + if (!err && state_size > offsetof(struct xregs_state, header) && fpu->state.xsave.header.xcomp_bv) + err = -EINVAL; } if (err || __copy_from_user(&env, buf, sizeof(env))) { From e2f803481a84804811656a658c32176b7eec36e8 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 19 Sep 2017 07:15:35 -0500 Subject: [PATCH 138/555] gfs2: Fix debugfs glocks dump commit 10201655b085df8e000822e496e5d4016a167a36 upstream. The switch to rhashtables (commit 88ffbf3e03) broke the debugfs glock dump (/sys/kernel/debug/gfs2//glocks) for dumps bigger than a single buffer: the right function for restarting an rhashtable iteration from the beginning of the hash table is rhashtable_walk_enter; rhashtable_walk_stop + rhashtable_walk_start will just resume from the current position. The upstream commit doesn't directly apply to 4.9.y because 4.9.y doesn't have the following mainline commits: 92ecd73a887c4a2b94daf5fc35179d75d1c4ef95 gfs2: Deduplicate gfs2_{glocks,glstats}_open cc37a62785a584f4875788689f3fd1fa6e4eb291 gfs2: Replace rhashtable_walk_init with rhashtable_walk_enter Signed-off-by: Andreas Gruenbacher Signed-off-by: Bob Peterson Signed-off-by: Greg Kroah-Hartman --- fs/gfs2/glock.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 7bff6f46f5da..f7cae1629c6c 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1836,13 +1836,9 @@ static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) { struct gfs2_glock_iter *gi = seq->private; loff_t n = *pos; - int ret; - if (gi->last_pos <= *pos) - n = (*pos - gi->last_pos); - - ret = rhashtable_walk_start(&gi->hti); - if (ret) + rhashtable_walk_enter(&gl_hash_table, &gi->hti); + if (rhashtable_walk_start(&gi->hti) != 0) return NULL; do { @@ -1850,6 +1846,7 @@ static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) } while (gi->gl && n--); gi->last_pos = *pos; + return gi->gl; } @@ -1861,6 +1858,7 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr, (*pos)++; gi->last_pos = *pos; gfs2_glock_iter_next(gi); + return gi->gl; } @@ -1870,6 +1868,7 @@ static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) gi->gl = NULL; rhashtable_walk_stop(&gi->hti); + rhashtable_walk_exit(&gi->hti); } static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr) @@ -1932,12 +1931,10 @@ static int gfs2_glocks_open(struct inode *inode, struct file *file) struct gfs2_glock_iter *gi = seq->private; gi->sdp = inode->i_private; - gi->last_pos = 0; seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN); if (seq->buf) seq->size = GFS2_SEQ_GOODSIZE; gi->gl = NULL; - ret = rhashtable_walk_init(&gl_hash_table, &gi->hti, GFP_KERNEL); } return ret; } @@ -1948,7 +1945,6 @@ static int gfs2_glocks_release(struct inode *inode, struct file *file) struct gfs2_glock_iter *gi = seq->private; gi->gl = NULL; - rhashtable_walk_exit(&gi->hti); return seq_release_private(inode, file); } @@ -1960,12 +1956,10 @@ static int gfs2_glstats_open(struct inode *inode, struct file *file) struct seq_file *seq = file->private_data; struct gfs2_glock_iter *gi = seq->private; gi->sdp = inode->i_private; - gi->last_pos = 0; seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN); if (seq->buf) seq->size = GFS2_SEQ_GOODSIZE; gi->gl = NULL; - ret = rhashtable_walk_init(&gl_hash_table, &gi->hti, GFP_KERNEL); } return ret; } From 4c00015385faccd992e98dfedfeaa07ac56d7194 Mon Sep 17 00:00:00 2001 From: Myungho Jung Date: Wed, 19 Apr 2017 15:24:50 -0700 Subject: [PATCH 139/555] timer/sysclt: Restrict timer migration sysctl values to 0 and 1 commit b94bf594cf8ed67cdd0439e70fa939783471597a upstream. timer_migration sysctl acts as a boolean switch, so the allowed values should be restricted to 0 and 1. Add the necessary extra fields to the sysctl table entry to enforce that. [ tglx: Rewrote changelog ] Signed-off-by: Myungho Jung Link: http://lkml.kernel.org/r/1492640690-3550-1-git-send-email-mhjungk@gmail.com Signed-off-by: Thomas Gleixner Cc: Kazuhiro Hayashi Signed-off-by: Greg Kroah-Hartman --- kernel/sysctl.c | 2 ++ kernel/time/timer.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 265e0d0216e3..24d603d29512 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1189,6 +1189,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = timer_migration_handler, + .extra1 = &zero, + .extra2 = &one, }, #endif #ifdef CONFIG_BPF_SYSCALL diff --git a/kernel/time/timer.c b/kernel/time/timer.c index df445cde8a1e..7d670362891a 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -240,7 +240,7 @@ int timer_migration_handler(struct ctl_table *table, int write, int ret; mutex_lock(&mutex); - ret = proc_dointvec(table, write, buffer, lenp, ppos); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) timers_update_migration(false); mutex_unlock(&mutex); From 0c4e39ca67008b983d71fe23ee04c6a33ce4b5f4 Mon Sep 17 00:00:00 2001 From: Haozhong Zhang Date: Mon, 18 Sep 2017 09:56:49 +0800 Subject: [PATCH 140/555] KVM: VMX: do not change SN bit in vmx_update_pi_irte() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit dc91f2eb1a4021eb6705c15e474942f84ab9b211 upstream. In kvm_vcpu_trigger_posted_interrupt() and pi_pre_block(), KVM assumes that PI notification events should not be suppressed when the target vCPU is not blocked. vmx_update_pi_irte() sets the SN field before changing an interrupt from posting to remapping, but it does not check the vCPU mode. Therefore, the change of SN field may break above the assumption. Besides, I don't see reasons to suppress notification events here, so remove the changes of SN field to avoid race condition. Signed-off-by: Haozhong Zhang Reported-by: "Ramamurthy, Venkatesh" Reported-by: Dan Williams Reviewed-by: Paolo Bonzini Fixes: 28b835d60fcc ("KVM: Update Posted-Interrupts Descriptor when vCPU is preempted") Signed-off-by: Radim Krčmář Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a29f5452f83d..81a3ab2afe6e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11215,12 +11215,8 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, if (set) ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); - else { - /* suppress notification event before unposting */ - pi_set_sn(vcpu_to_pi_desc(vcpu)); + else ret = irq_set_vcpu_affinity(host_irq, NULL); - pi_clear_sn(vcpu_to_pi_desc(vcpu)); - } if (ret < 0) { printk(KERN_INFO "%s: failed to update PI IRTE\n", From 3ffbe626a254b9af6d98881bc2cbb4f22771567e Mon Sep 17 00:00:00 2001 From: Haozhong Zhang Date: Mon, 18 Sep 2017 09:56:50 +0800 Subject: [PATCH 141/555] KVM: VMX: remove WARN_ON_ONCE in kvm_vcpu_trigger_posted_interrupt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 5753743fa5108b8f98bd61e40dc63f641b26c768 upstream. WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc)) in kvm_vcpu_trigger_posted_interrupt() intends to detect the violation of invariant that VT-d PI notification event is not suppressed when vcpu is in the guest mode. Because the two checks for the target vcpu mode and the target suppress field cannot be performed atomically, the target vcpu mode may change in between. If that does happen, WARN_ON_ONCE() here may raise false alarms. As the previous patch fixed the real invariant breaker, remove this WARN_ON_ONCE() to avoid false alarms, and document the allowed cases instead. Signed-off-by: Haozhong Zhang Reported-by: "Ramamurthy, Venkatesh" Reported-by: Dan Williams Reviewed-by: Paolo Bonzini Fixes: 28b835d60fcc ("KVM: Update Posted-Interrupts Descriptor when vCPU is preempted") Signed-off-by: Radim Krčmář Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx.c | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 81a3ab2afe6e..b6ee73eda289 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4759,21 +4759,30 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) { #ifdef CONFIG_SMP if (vcpu->mode == IN_GUEST_MODE) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - /* - * Currently, we don't support urgent interrupt, - * all interrupts are recognized as non-urgent - * interrupt, so we cannot post interrupts when - * 'SN' is set. + * The vector of interrupt to be delivered to vcpu had + * been set in PIR before this function. * - * If the vcpu is in guest mode, it means it is - * running instead of being scheduled out and - * waiting in the run queue, and that's the only - * case when 'SN' is set currently, warning if - * 'SN' is set. + * Following cases will be reached in this block, and + * we always send a notification event in all cases as + * explained below. + * + * Case 1: vcpu keeps in non-root mode. Sending a + * notification event posts the interrupt to vcpu. + * + * Case 2: vcpu exits to root mode and is still + * runnable. PIR will be synced to vIRR before the + * next vcpu entry. Sending a notification event in + * this case has no effect, as vcpu is not in root + * mode. + * + * Case 3: vcpu exits to root mode and is blocked. + * vcpu_block() has already synced PIR to vIRR and + * never blocks vcpu if vIRR is not cleared. Therefore, + * a blocked vcpu here does not wait for any requested + * interrupts in PIR, and sending a notification event + * which has no effect is safe here. */ - WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc)); apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), POSTED_INTR_VECTOR); From cb2da657d3a9218baf522514061c28f5b1fa900e Mon Sep 17 00:00:00 2001 From: Frederic Barrat Date: Wed, 30 Aug 2017 12:15:49 +0200 Subject: [PATCH 142/555] cxl: Fix driver use count commit 197267d0356004a31c4d6b6336598f5dff3301e1 upstream. cxl keeps a driver use count, which is used with the hash memory model on p8 to know when to upgrade local TLBIs to global and to trigger callbacks to manage the MMU for PSL8. If a process opens a context and closes without attaching or fails the attachment, the driver use count is never decremented. As a consequence, TLB invalidations remain global, even if there are no active cxl contexts. We should increment the driver use count when the process is attaching to the cxl adapter, and not on open. It's not needed before the adapter starts using the context and the use count is decremented on the detach path, so it makes more sense. It affects only the user api. The kernel api is already doing The Right Thing. Signed-off-by: Frederic Barrat Cc: stable@vger.kernel.org # v4.2+ Fixes: 7bb5d91a4dda ("cxl: Rework context lifetimes") Acked-by: Andrew Donnellan Signed-off-by: Michael Ellerman [ajd: backport to stable v4.9 tree] Signed-off-by: Andrew Donnellan Signed-off-by: Greg Kroah-Hartman --- drivers/misc/cxl/api.c | 4 ++++ drivers/misc/cxl/file.c | 8 +++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/misc/cxl/api.c b/drivers/misc/cxl/api.c index 2e5233b60971..ae856161faa9 100644 --- a/drivers/misc/cxl/api.c +++ b/drivers/misc/cxl/api.c @@ -244,6 +244,10 @@ int cxl_start_context(struct cxl_context *ctx, u64 wed, ctx->real_mode = false; } + /* + * Increment driver use count. Enables global TLBIs for hash + * and callbacks to handle the segment table + */ cxl_ctx_get(); if ((rc = cxl_ops->attach_process(ctx, kernel, wed, 0))) { diff --git a/drivers/misc/cxl/file.c b/drivers/misc/cxl/file.c index afa211397048..d3e009438991 100644 --- a/drivers/misc/cxl/file.c +++ b/drivers/misc/cxl/file.c @@ -91,7 +91,6 @@ static int __afu_open(struct inode *inode, struct file *file, bool master) pr_devel("afu_open pe: %i\n", ctx->pe); file->private_data = ctx; - cxl_ctx_get(); /* indicate success */ rc = 0; @@ -213,6 +212,12 @@ static long afu_ioctl_start_work(struct cxl_context *ctx, ctx->glpid = get_task_pid(current->group_leader, PIDTYPE_PID); + /* + * Increment driver use count. Enables global TLBIs for hash + * and callbacks to handle the segment table + */ + cxl_ctx_get(); + trace_cxl_attach(ctx, work.work_element_descriptor, work.num_interrupts, amr); if ((rc = cxl_ops->attach_process(ctx, false, work.work_element_descriptor, @@ -222,6 +227,7 @@ static long afu_ioctl_start_work(struct cxl_context *ctx, put_pid(ctx->glpid); put_pid(ctx->pid); ctx->glpid = ctx->pid = NULL; + cxl_ctx_put(); goto out; } From ea37f61f5de045ce72529ea95c7aa38c8187993c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 28 Sep 2017 17:58:41 +0200 Subject: [PATCH 143/555] KVM: VMX: use cmpxchg64 commit c0a1666bcb2a33e84187a15eabdcd54056be9a97 upstream. This fixes a compilation failure on 32-bit systems. Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b6ee73eda289..fb49212d25df 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2203,8 +2203,8 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) new.ndst = (dest << 8) & 0xFF00; new.sn = 0; - } while (cmpxchg(&pi_desc->control, old.control, - new.control) != old.control); + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); } static void decache_tsc_multiplier(struct vcpu_vmx *vmx) @@ -11039,8 +11039,8 @@ static void __pi_post_block(struct kvm_vcpu *vcpu) /* set 'NV' to 'notification vector' */ new.nv = POSTED_INTR_VECTOR; - } while (cmpxchg(&pi_desc->control, old.control, - new.control) != old.control); + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) { spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); @@ -11109,8 +11109,8 @@ static int pi_pre_block(struct kvm_vcpu *vcpu) /* set 'NV' to 'wakeup vector' */ new.nv = POSTED_INTR_WAKEUP_VECTOR; - } while (cmpxchg(&pi_desc->control, old.control, - new.control) != old.control); + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); /* We should not block the vCPU if an interrupt is posted for it. */ if (pi_test_on(pi_desc) == 1) From 64afde6f956dfcb719e329a9d2098b53e68d2755 Mon Sep 17 00:00:00 2001 From: Vladis Dronov Date: Mon, 4 Sep 2017 16:00:50 +0200 Subject: [PATCH 144/555] video: fbdev: aty: do not leak uninitialized padding in clk to userspace commit 8e75f7a7a00461ef6d91797a60b606367f6e344d upstream. 'clk' is copied to a userland with padding byte(s) after 'vclk_post_div' field unitialized, leaking data from the stack. Fix this ensuring all of 'clk' is initialized to zero. References: https://github.com/torvalds/linux/pull/441 Reported-by: sohu0106 Signed-off-by: Vladis Dronov Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Greg Kroah-Hartman --- drivers/video/fbdev/aty/atyfb_base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/fbdev/aty/atyfb_base.c b/drivers/video/fbdev/aty/atyfb_base.c index 11026e726b68..81367cf0af77 100644 --- a/drivers/video/fbdev/aty/atyfb_base.c +++ b/drivers/video/fbdev/aty/atyfb_base.c @@ -1861,7 +1861,7 @@ static int atyfb_ioctl(struct fb_info *info, u_int cmd, u_long arg) #if defined(DEBUG) && defined(CONFIG_FB_ATY_CT) case ATYIO_CLKR: if (M64_HAS(INTEGRATED)) { - struct atyclk clk; + struct atyclk clk = { 0 }; union aty_pll *pll = &par->pll; u32 dsp_config = pll->ct.dsp_config; u32 dsp_on_off = pll->ct.dsp_on_off; From df13283e4b8920ecd00d09eea0f041dc8f1df598 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Tue, 7 Feb 2017 19:58:02 +0200 Subject: [PATCH 145/555] swiotlb-xen: implement xen_swiotlb_dma_mmap callback commit 7e91c7df29b5e196de3dc6f086c8937973bd0b88 upstream. This function creates userspace mapping for the DMA-coherent memory. Signed-off-by: Stefano Stabellini Signed-off-by: Oleksandr Dmytryshyn Signed-off-by: Andrii Anisov Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Greg Kroah-Hartman --- arch/arm/xen/mm.c | 1 + drivers/xen/swiotlb-xen.c | 19 +++++++++++++++++++ include/xen/swiotlb-xen.h | 5 +++++ 3 files changed, 25 insertions(+) diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c index d062f08f5020..4b24964a520a 100644 --- a/arch/arm/xen/mm.c +++ b/arch/arm/xen/mm.c @@ -199,6 +199,7 @@ static struct dma_map_ops xen_swiotlb_dma_ops = { .unmap_page = xen_swiotlb_unmap_page, .dma_supported = xen_swiotlb_dma_supported, .set_dma_mask = xen_swiotlb_set_dma_mask, + .mmap = xen_swiotlb_dma_mmap, }; int __init xen_mm_init(void) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 679f79f68182..b68ced5a6331 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -680,3 +680,22 @@ xen_swiotlb_set_dma_mask(struct device *dev, u64 dma_mask) return 0; } EXPORT_SYMBOL_GPL(xen_swiotlb_set_dma_mask); + +/* + * Create userspace mapping for the DMA-coherent memory. + * This function should be called with the pages from the current domain only, + * passing pages mapped from other domains would lead to memory corruption. + */ +int +xen_swiotlb_dma_mmap(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs) +{ +#if defined(CONFIG_ARM) || defined(CONFIG_ARM64) + if (__generic_dma_ops(dev)->mmap) + return __generic_dma_ops(dev)->mmap(dev, vma, cpu_addr, + dma_addr, size, attrs); +#endif + return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size); +} +EXPORT_SYMBOL_GPL(xen_swiotlb_dma_mmap); diff --git a/include/xen/swiotlb-xen.h b/include/xen/swiotlb-xen.h index 7c35e279d1e3..683057f79dca 100644 --- a/include/xen/swiotlb-xen.h +++ b/include/xen/swiotlb-xen.h @@ -58,4 +58,9 @@ xen_swiotlb_dma_supported(struct device *hwdev, u64 mask); extern int xen_swiotlb_set_dma_mask(struct device *dev, u64 dma_mask); + +extern int +xen_swiotlb_dma_mmap(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs); #endif /* __LINUX_SWIOTLB_XEN_H */ From 1852eae92c460813692808234da35d142a405ab7 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 5 Oct 2017 09:44:17 +0200 Subject: [PATCH 146/555] Linux 4.9.53 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index c53de1e38c6a..98e3be659b21 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 9 -SUBLEVEL = 52 +SUBLEVEL = 53 EXTRAVERSION = NAME = Roaring Lionus From 81402eadec8c11e29a7f16b54f8f5e578d350f75 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Mon, 8 May 2017 09:33:22 -0700 Subject: [PATCH 147/555] ANDROID: binder: Add tracing for binder priority inheritance. Bug: 34461621 Change-Id: I5ebb1c0c49fd42a89ee250a1d70221f767c82c7c Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 4 ++++ drivers/android/binder_trace.h | 24 ++++++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index fcf85be2674d..947d671dc257 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -1153,6 +1153,10 @@ static void binder_do_set_priority(struct task_struct *task, task->pid, desired.prio, to_kernel_prio(policy, priority)); + trace_binder_set_priority(task->tgid, task->pid, task->normal_prio, + to_kernel_prio(policy, priority), + desired.prio); + /* Set the actual priority */ if (task->policy != policy || is_rt_policy(policy)) { struct sched_param params; diff --git a/drivers/android/binder_trace.h b/drivers/android/binder_trace.h index 76e3b9c8a8a2..b11dffc521e8 100644 --- a/drivers/android/binder_trace.h +++ b/drivers/android/binder_trace.h @@ -85,6 +85,30 @@ DEFINE_BINDER_FUNCTION_RETURN_EVENT(binder_ioctl_done); DEFINE_BINDER_FUNCTION_RETURN_EVENT(binder_write_done); DEFINE_BINDER_FUNCTION_RETURN_EVENT(binder_read_done); +TRACE_EVENT(binder_set_priority, + TP_PROTO(int proc, int thread, unsigned int old_prio, + unsigned int desired_prio, unsigned int new_prio), + TP_ARGS(proc, thread, old_prio, new_prio, desired_prio), + + TP_STRUCT__entry( + __field(int, proc) + __field(int, thread) + __field(unsigned int, old_prio) + __field(unsigned int, new_prio) + __field(unsigned int, desired_prio) + ), + TP_fast_assign( + __entry->proc = proc; + __entry->thread = thread; + __entry->old_prio = old_prio; + __entry->new_prio = new_prio; + __entry->desired_prio = desired_prio; + ), + TP_printk("proc=%d thread=%d old=%d => new=%d desired=%d", + __entry->proc, __entry->thread, __entry->old_prio, + __entry->new_prio, __entry->desired_prio) +); + TRACE_EVENT(binder_wait_for_work, TP_PROTO(bool proc_work, bool transaction_stack, bool thread_todo), TP_ARGS(proc_work, transaction_stack, thread_todo), From 3217cccb816110165a3d15cf6e03e0127e62741c Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Thu, 24 Aug 2017 15:23:36 +0200 Subject: [PATCH 148/555] ANDROID: binder: fix transaction leak. If a call to put_user() fails, we failed to properly free a transaction and send a failed reply (if necessary). Bug: 63117588 Test: binderLibTest Change-Id: Ia98db8cd82ce354a4cdc8811c969988d585c7e31 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 947d671dc257..256a1d5bcfc6 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2105,6 +2105,26 @@ static void binder_send_failed_reply(struct binder_transaction *t, } } +/** + * binder_cleanup_transaction() - cleans up undelivered transaction + * @t: transaction that needs to be cleaned up + * @reason: reason the transaction wasn't delivered + * @error_code: error to return to caller (if synchronous call) + */ +static void binder_cleanup_transaction(struct binder_transaction *t, + const char *reason, + uint32_t error_code) +{ + if (t->buffer->target_node && !(t->flags & TF_ONE_WAY)) { + binder_send_failed_reply(t, error_code); + } else { + binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, + "undelivered transaction %d, %s\n", + t->debug_id, reason); + binder_free_transaction(t); + } +} + /** * binder_validate_object() - checks for a valid metadata object in a buffer. * @buffer: binder_buffer that we're parsing. @@ -4187,12 +4207,20 @@ retry: if (put_user(cmd, (uint32_t __user *)ptr)) { if (t_from) binder_thread_dec_tmpref(t_from); + + binder_cleanup_transaction(t, "put_user failed", + BR_FAILED_REPLY); + return -EFAULT; } ptr += sizeof(uint32_t); if (copy_to_user(ptr, &tr, sizeof(tr))) { if (t_from) binder_thread_dec_tmpref(t_from); + + binder_cleanup_transaction(t, "copy_to_user failed", + BR_FAILED_REPLY); + return -EFAULT; } ptr += sizeof(tr); @@ -4262,15 +4290,9 @@ static void binder_release_work(struct binder_proc *proc, struct binder_transaction *t; t = container_of(w, struct binder_transaction, work); - if (t->buffer->target_node && - !(t->flags & TF_ONE_WAY)) { - binder_send_failed_reply(t, BR_DEAD_REPLY); - } else { - binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, - "undelivered transaction %d\n", - t->debug_id); - binder_free_transaction(t); - } + + binder_cleanup_transaction(t, "process died.", + BR_DEAD_REPLY); } break; case BINDER_WORK_RETURN_ERROR: { struct binder_error *e = container_of( From a7e410c1359f330a2a7627fb9ce5d53688be122a Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 5 Oct 2017 17:54:31 -0700 Subject: [PATCH 149/555] FROMLIST: tracing: Prepare to add preempt and irq trace events In preparation of adding irqsoff and preemptsoff enable and disable trace events, move required functions and code to make it easier to add these events in a later patch. This patch is just code movement and no functional change. Change-Id: I587d411da5efbc4959bcccd7a05c7a66c231e1e0 Cc: Steven Rostedt Cc: Peter Zijlstra Cc: kernel-team@android.com Link: https://patchwork.kernel.org/patch/9988159/ Signed-off-by: Joel Fernandes --- kernel/trace/trace_irqsoff.c | 100 ++++++++++++++++++++++++++--------- 1 file changed, 74 insertions(+), 26 deletions(-) diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 03cdff84d026..eee897b9e185 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -16,6 +16,7 @@ #include "trace.h" +#if defined(CONFIG_IRQSOFF_TRACER) || defined(CONFIG_PREEMPT_TRACER) static struct trace_array *irqsoff_trace __read_mostly; static int tracer_enabled __read_mostly; @@ -450,64 +451,44 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1) #else /* !CONFIG_PROVE_LOCKING */ -/* - * Stubs: - */ - -void trace_softirqs_on(unsigned long ip) -{ -} - -void trace_softirqs_off(unsigned long ip) -{ -} - -inline void print_irqtrace_events(struct task_struct *curr) -{ -} - /* * We are only interested in hardirq on/off events: */ -void trace_hardirqs_on(void) +static inline void tracer_hardirqs_on(void) { if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } -EXPORT_SYMBOL(trace_hardirqs_on); -void trace_hardirqs_off(void) +static inline void tracer_hardirqs_off(void) { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } -EXPORT_SYMBOL(trace_hardirqs_off); -__visible void trace_hardirqs_on_caller(unsigned long caller_addr) +static inline void tracer_hardirqs_on_caller(unsigned long caller_addr) { if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, caller_addr); } -EXPORT_SYMBOL(trace_hardirqs_on_caller); -__visible void trace_hardirqs_off_caller(unsigned long caller_addr) +static inline void tracer_hardirqs_off_caller(unsigned long caller_addr) { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, caller_addr); } -EXPORT_SYMBOL(trace_hardirqs_off_caller); #endif /* CONFIG_PROVE_LOCKING */ #endif /* CONFIG_IRQSOFF_TRACER */ #ifdef CONFIG_PREEMPT_TRACER -void trace_preempt_on(unsigned long a0, unsigned long a1) +static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { if (preempt_trace() && !irq_trace()) stop_critical_timing(a0, a1); } -void trace_preempt_off(unsigned long a0, unsigned long a1) +static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { if (preempt_trace() && !irq_trace()) start_critical_timing(a0, a1); @@ -769,3 +750,70 @@ __init static int init_irqsoff_tracer(void) return 0; } core_initcall(init_irqsoff_tracer); +#endif /* IRQSOFF_TRACER || PREEMPTOFF_TRACER */ + +#ifndef CONFIG_IRQSOFF_TRACER +static inline void tracer_hardirqs_on(void) { } +static inline void tracer_hardirqs_off(void) { } +static inline void tracer_hardirqs_on_caller(unsigned long caller_addr) { } +static inline void tracer_hardirqs_off_caller(unsigned long caller_addr) { } +#endif + +#ifndef CONFIG_PREEMPT_TRACER +static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { } +static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { } +#endif + +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PROVE_LOCKING) +void trace_hardirqs_on(void) +{ + tracer_hardirqs_on(); +} +EXPORT_SYMBOL(trace_hardirqs_on); + +void trace_hardirqs_off(void) +{ + tracer_hardirqs_off(); +} +EXPORT_SYMBOL(trace_hardirqs_off); + +__visible void trace_hardirqs_on_caller(unsigned long caller_addr) +{ + tracer_hardirqs_on_caller(caller_addr); +} +EXPORT_SYMBOL(trace_hardirqs_on_caller); + +__visible void trace_hardirqs_off_caller(unsigned long caller_addr) +{ + tracer_hardirqs_off_caller(caller_addr); +} +EXPORT_SYMBOL(trace_hardirqs_off_caller); + +/* + * Stubs: + */ + +void trace_softirqs_on(unsigned long ip) +{ +} + +void trace_softirqs_off(unsigned long ip) +{ +} + +inline void print_irqtrace_events(struct task_struct *curr) +{ +} +#endif + +#ifdef CONFIG_PREEMPT_TRACER +void trace_preempt_on(unsigned long a0, unsigned long a1) +{ + tracer_preempt_on(a0, a1); +} + +void trace_preempt_off(unsigned long a0, unsigned long a1) +{ + tracer_preempt_off(a0, a1); +} +#endif From 2b3a26c86b930d1efab55972052ab2e16336e034 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 5 Oct 2017 17:54:32 -0700 Subject: [PATCH 150/555] FROMLIST: tracing: Add support for preempt and irq enable/disable events Preempt and irq trace events can be used for tracing the start and end of an atomic section which can be used by a trace viewer like systrace to graphically view the start and end of an atomic section and correlate them with latencies and scheduling issues. This also serves as a prelude to using synthetic events or probes to rewrite the preempt and irqsoff tracers, along with numerous benefits of using trace events features for these events. Change-Id: I4f992714c0161a415b17a03aa14cce823d8ca3bb Cc: Steven Rostedt Cc: Peter Zilstra Cc: kernel-team@android.com Link: https://patchwork.kernel.org/patch/9988157/ Signed-off-by: Joel Fernandes --- include/linux/ftrace.h | 3 +- include/trace/events/preemptirq.h | 70 +++++++++++++++++++++++++++++++ kernel/trace/Kconfig | 11 +++++ kernel/trace/Makefile | 1 + kernel/trace/trace_irqsoff.c | 35 +++++++++++++++- 5 files changed, 118 insertions(+), 2 deletions(-) create mode 100644 include/trace/events/preemptirq.h diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index b3d34d3e0e7e..f4c0d36baffb 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -734,7 +734,8 @@ static inline unsigned long get_lock_parent_ip(void) static inline void time_hardirqs_off(unsigned long a0, unsigned long a1) { } #endif -#ifdef CONFIG_PREEMPT_TRACER +#if defined(CONFIG_PREEMPT_TRACER) || \ + (defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPTIRQ_EVENTS)) extern void trace_preempt_on(unsigned long a0, unsigned long a1); extern void trace_preempt_off(unsigned long a0, unsigned long a1); #else diff --git a/include/trace/events/preemptirq.h b/include/trace/events/preemptirq.h new file mode 100644 index 000000000000..f5024c560d8f --- /dev/null +++ b/include/trace/events/preemptirq.h @@ -0,0 +1,70 @@ +#ifdef CONFIG_PREEMPTIRQ_EVENTS + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM preemptirq + +#if !defined(_TRACE_PREEMPTIRQ_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_PREEMPTIRQ_H + +#include +#include +#include +#include + +DECLARE_EVENT_CLASS(preemptirq_template, + + TP_PROTO(unsigned long ip, unsigned long parent_ip), + + TP_ARGS(ip, parent_ip), + + TP_STRUCT__entry( + __field(u32, caller_offs) + __field(u32, parent_offs) + ), + + TP_fast_assign( + __entry->caller_offs = (u32)(ip - (unsigned long)_stext); + __entry->parent_offs = (u32)(parent_ip - (unsigned long)_stext); + ), + + TP_printk("caller=%pF parent=%pF", + (void *)((unsigned long)(_stext) + __entry->caller_offs), + (void *)((unsigned long)(_stext) + __entry->parent_offs)) +); + +#ifndef CONFIG_PROVE_LOCKING +DEFINE_EVENT(preemptirq_template, irq_disable, + TP_PROTO(unsigned long ip, unsigned long parent_ip), + TP_ARGS(ip, parent_ip)); + +DEFINE_EVENT(preemptirq_template, irq_enable, + TP_PROTO(unsigned long ip, unsigned long parent_ip), + TP_ARGS(ip, parent_ip)); +#endif + +#ifdef CONFIG_DEBUG_PREEMPT +DEFINE_EVENT(preemptirq_template, preempt_disable, + TP_PROTO(unsigned long ip, unsigned long parent_ip), + TP_ARGS(ip, parent_ip)); + +DEFINE_EVENT(preemptirq_template, preempt_enable, + TP_PROTO(unsigned long ip, unsigned long parent_ip), + TP_ARGS(ip, parent_ip)); +#endif + +#endif /* _TRACE_PREEMPTIRQ_H */ + +#include + +#else /* !CONFIG_PREEMPTIRQ_EVENTS */ + +#define trace_irq_enable(...) +#define trace_irq_disable(...) +#define trace_preempt_enable(...) +#define trace_preempt_disable(...) +#define trace_irq_enable_rcuidle(...) +#define trace_irq_disable_rcuidle(...) +#define trace_preempt_enable_rcuidle(...) +#define trace_preempt_disable_rcuidle(...) + +#endif diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index da5768901a0d..5771ce7f08b3 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -160,6 +160,17 @@ config FUNCTION_GRAPH_TRACER address on the current task structure into a stack of calls. +config PREEMPTIRQ_EVENTS + bool "Enable trace events for preempt and irq disable/enable" + select TRACE_IRQFLAGS + depends on DEBUG_PREEMPT || !PROVE_LOCKING + default n + help + Enable tracing of disable and enable events for preemption and irqs. + For tracing preempt disable/enable events, DEBUG_PREEMPT must be + enabled. For tracing irq disable/enable events, PROVE_LOCKING must + be disabled. + config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index b4eaf9c9c610..907c14446dbd 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -34,6 +34,7 @@ obj-$(CONFIG_TRACING) += trace_printk.o obj-$(CONFIG_TRACING_MAP) += tracing_map.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o +obj-$(CONFIG_PREEMPTIRQ_EVENTS) += trace_irqsoff.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index eee897b9e185..c180fe513eb2 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -16,6 +16,9 @@ #include "trace.h" +#define CREATE_TRACE_POINTS +#include + #if defined(CONFIG_IRQSOFF_TRACER) || defined(CONFIG_PREEMPT_TRACER) static struct trace_array *irqsoff_trace __read_mostly; static int tracer_enabled __read_mostly; @@ -764,27 +767,54 @@ static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { } static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { } #endif +/* Per-cpu variable to prevent redundant calls when IRQs already off */ +static DEFINE_PER_CPU(int, tracing_irq_cpu); + #if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PROVE_LOCKING) void trace_hardirqs_on(void) { + if (!this_cpu_read(tracing_irq_cpu)) + return; + + trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); tracer_hardirqs_on(); + + this_cpu_write(tracing_irq_cpu, 0); } EXPORT_SYMBOL(trace_hardirqs_on); void trace_hardirqs_off(void) { + if (this_cpu_read(tracing_irq_cpu)) + return; + + this_cpu_write(tracing_irq_cpu, 1); + + trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); tracer_hardirqs_off(); } EXPORT_SYMBOL(trace_hardirqs_off); __visible void trace_hardirqs_on_caller(unsigned long caller_addr) { + if (!this_cpu_read(tracing_irq_cpu)) + return; + + trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr); tracer_hardirqs_on_caller(caller_addr); + + this_cpu_write(tracing_irq_cpu, 0); } EXPORT_SYMBOL(trace_hardirqs_on_caller); __visible void trace_hardirqs_off_caller(unsigned long caller_addr) { + if (this_cpu_read(tracing_irq_cpu)) + return; + + this_cpu_write(tracing_irq_cpu, 1); + + trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr); tracer_hardirqs_off_caller(caller_addr); } EXPORT_SYMBOL(trace_hardirqs_off_caller); @@ -806,14 +836,17 @@ inline void print_irqtrace_events(struct task_struct *curr) } #endif -#ifdef CONFIG_PREEMPT_TRACER +#if defined(CONFIG_PREEMPT_TRACER) || \ + (defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPTIRQ_EVENTS)) void trace_preempt_on(unsigned long a0, unsigned long a1) { + trace_preempt_enable_rcuidle(a0, a1); tracer_preempt_on(a0, a1); } void trace_preempt_off(unsigned long a0, unsigned long a1) { + trace_preempt_disable_rcuidle(a0, a1); tracer_preempt_off(a0, a1); } #endif From 7df306f1063b11407352b61eb014a918a6809c96 Mon Sep 17 00:00:00 2001 From: "Kristian H. Kristensen" Date: Tue, 13 Dec 2016 11:27:52 -0800 Subject: [PATCH 151/555] drm_fourcc: Fix DRM_FORMAT_MOD_LINEAR #define [ Upstream commit af913418261d6d3e7a29f06cf35f04610ead667c ] We need to define DRM_FORMAT_MOD_VENDOR_NONE for the fourcc_mod_code() macro to work correctly. Signed-off-by: Kristian H. Kristensen Signed-off-by: Daniel Vetter Link: http://patchwork.freedesktop.org/patch/msgid/1481657272-25975-1-git-send-email-hoegsberg@google.com Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- include/uapi/drm/drm_fourcc.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h index a5890bf44c0a..d1601a621929 100644 --- a/include/uapi/drm/drm_fourcc.h +++ b/include/uapi/drm/drm_fourcc.h @@ -154,6 +154,7 @@ extern "C" { /* Vendor Ids: */ #define DRM_FORMAT_MOD_NONE 0 +#define DRM_FORMAT_MOD_VENDOR_NONE 0 #define DRM_FORMAT_MOD_VENDOR_INTEL 0x01 #define DRM_FORMAT_MOD_VENDOR_AMD 0x02 #define DRM_FORMAT_MOD_VENDOR_NV 0x03 From e236940a87f125de78f94efc44d4a6db043c26d6 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 13 Dec 2016 11:09:16 +0100 Subject: [PATCH 152/555] drm: bridge: add DT bindings for TI ths8135 [ Upstream commit 2e644be30fcc08c736f66b60f4898d274d4873ab ] THS8135 is a configurable video DAC. Add DT bindings for this chip. Signed-off-by: Bartosz Golaszewski Reviewed-by: Laurent Pinchart Acked-by: Rob Herring Signed-off-by: Archit Taneja Link: http://patchwork.freedesktop.org/patch/msgid/1481623759-12786-3-git-send-email-bgolaszewski@baylibre.com Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- .../bindings/display/bridge/ti,ths8135.txt | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 Documentation/devicetree/bindings/display/bridge/ti,ths8135.txt diff --git a/Documentation/devicetree/bindings/display/bridge/ti,ths8135.txt b/Documentation/devicetree/bindings/display/bridge/ti,ths8135.txt new file mode 100644 index 000000000000..6ec1a880ac18 --- /dev/null +++ b/Documentation/devicetree/bindings/display/bridge/ti,ths8135.txt @@ -0,0 +1,46 @@ +THS8135 Video DAC +----------------- + +This is the binding for Texas Instruments THS8135 Video DAC bridge. + +Required properties: + +- compatible: Must be "ti,ths8135" + +Required nodes: + +This device has two video ports. Their connections are modelled using the OF +graph bindings specified in Documentation/devicetree/bindings/graph.txt. + +- Video port 0 for RGB input +- Video port 1 for VGA output + +Example +------- + +vga-bridge { + compatible = "ti,ths8135"; + #address-cells = <1>; + #size-cells = <0>; + + ports { + #address-cells = <1>; + #size-cells = <0>; + + port@0 { + reg = <0>; + + vga_bridge_in: endpoint { + remote-endpoint = <&lcdc_out_vga>; + }; + }; + + port@1 { + reg = <1>; + + vga_bridge_out: endpoint { + remote-endpoint = <&vga_con_in>; + }; + }; + }; +}; From 97766c6a8e586308d89309591f73aa3bc5ce0643 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 14 Dec 2016 08:02:03 -0600 Subject: [PATCH 153/555] GFS2: Fix reference to ERR_PTR in gfs2_glock_iter_next [ Upstream commit 14d37564fa3dc4e5d4c6828afcd26ac14e6796c5 ] This patch fixes a place where function gfs2_glock_iter_next can reference an invalid error pointer. Signed-off-by: Dan Carpenter Signed-off-by: Bob Peterson Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/gfs2/glock.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index f7cae1629c6c..7a8b1d72e3d9 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1820,16 +1820,18 @@ void gfs2_glock_exit(void) static void gfs2_glock_iter_next(struct gfs2_glock_iter *gi) { - do { - gi->gl = rhashtable_walk_next(&gi->hti); + while ((gi->gl = rhashtable_walk_next(&gi->hti))) { if (IS_ERR(gi->gl)) { if (PTR_ERR(gi->gl) == -EAGAIN) continue; gi->gl = NULL; + return; } - /* Skip entries for other sb and dead entries */ - } while ((gi->gl) && ((gi->sdp != gi->gl->gl_name.ln_sbd) || - __lockref_is_dead(&gi->gl->gl_lockref))); + /* Skip entries for other sb and dead entries */ + if (gi->sdp == gi->gl->gl_name.ln_sbd && + !__lockref_is_dead(&gi->gl->gl_lockref)) + return; + } } static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) From 48167acb7f5b936b009eb824dfb3cfb7e1a42502 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Wed, 7 Dec 2016 19:28:06 +0200 Subject: [PATCH 154/555] drm/i915: Fix the overlay frontbuffer tracking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 58d09ebdb4edf5d3ab3a2aee851ab0168bc83ec6 ] Do the overlay frontbuffer tracking properly so that it matches the state of the overlay on/off/continue requests. One slight problem is that intel_frontbuffer_flip_complete() may get delayed by an arbitrarily liong time due to the fact that the overlay code likes to bail out when a signal occurs. So the flip may not get completed until the ioctl is restarted. But fixing that would require bigger surgery, so I decided to ignore it for now. Signed-off-by: Ville Syrjälä Link: http://patchwork.freedesktop.org/patch/msgid/1481131693-27993-5-git-send-email-ville.syrjala@linux.intel.com Reviewed-by: Chris Wilson Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/intel_overlay.c | 72 +++++++++++++++++----------- 1 file changed, 45 insertions(+), 27 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c index a2655cd5a84e..8ab6f30dc23c 100644 --- a/drivers/gpu/drm/i915/intel_overlay.c +++ b/drivers/gpu/drm/i915/intel_overlay.c @@ -272,8 +272,30 @@ static int intel_overlay_on(struct intel_overlay *overlay) return intel_overlay_do_wait_request(overlay, req, NULL); } +static void intel_overlay_flip_prepare(struct intel_overlay *overlay, + struct i915_vma *vma) +{ + enum pipe pipe = overlay->crtc->pipe; + + WARN_ON(overlay->old_vma); + + i915_gem_track_fb(overlay->vma ? overlay->vma->obj : NULL, + vma ? vma->obj : NULL, + INTEL_FRONTBUFFER_OVERLAY(pipe)); + + intel_frontbuffer_flip_prepare(overlay->i915, + INTEL_FRONTBUFFER_OVERLAY(pipe)); + + overlay->old_vma = overlay->vma; + if (vma) + overlay->vma = i915_vma_get(vma); + else + overlay->vma = NULL; +} + /* overlay needs to be enabled in OCMD reg */ static int intel_overlay_continue(struct intel_overlay *overlay, + struct i915_vma *vma, bool load_polyphase_filter) { struct drm_i915_private *dev_priv = overlay->i915; @@ -308,27 +330,35 @@ static int intel_overlay_continue(struct intel_overlay *overlay, intel_ring_emit(ring, flip_addr); intel_ring_advance(ring); + intel_overlay_flip_prepare(overlay, vma); + intel_overlay_submit_request(overlay, req, NULL); return 0; } +static void intel_overlay_release_old_vma(struct intel_overlay *overlay) +{ + struct i915_vma *vma; + + vma = fetch_and_zero(&overlay->old_vma); + if (WARN_ON(!vma)) + return; + + intel_frontbuffer_flip_complete(overlay->i915, + INTEL_FRONTBUFFER_OVERLAY(overlay->crtc->pipe)); + + i915_gem_object_unpin_from_display_plane(vma); + i915_vma_put(vma); +} + static void intel_overlay_release_old_vid_tail(struct i915_gem_active *active, struct drm_i915_gem_request *req) { struct intel_overlay *overlay = container_of(active, typeof(*overlay), last_flip); - struct i915_vma *vma; - vma = fetch_and_zero(&overlay->old_vma); - if (WARN_ON(!vma)) - return; - - i915_gem_track_fb(vma->obj, NULL, - INTEL_FRONTBUFFER_OVERLAY(overlay->crtc->pipe)); - - i915_gem_object_unpin_from_display_plane(vma); - i915_vma_put(vma); + intel_overlay_release_old_vma(overlay); } static void intel_overlay_off_tail(struct i915_gem_active *active, @@ -336,15 +366,8 @@ static void intel_overlay_off_tail(struct i915_gem_active *active, { struct intel_overlay *overlay = container_of(active, typeof(*overlay), last_flip); - struct i915_vma *vma; - /* never have the overlay hw on without showing a frame */ - vma = fetch_and_zero(&overlay->vma); - if (WARN_ON(!vma)) - return; - - i915_gem_object_unpin_from_display_plane(vma); - i915_vma_put(vma); + intel_overlay_release_old_vma(overlay); overlay->crtc->overlay = NULL; overlay->crtc = NULL; @@ -398,6 +421,8 @@ static int intel_overlay_off(struct intel_overlay *overlay) } intel_ring_advance(ring); + intel_overlay_flip_prepare(overlay, NULL); + return intel_overlay_do_wait_request(overlay, req, intel_overlay_off_tail); } @@ -836,18 +861,10 @@ static int intel_overlay_do_put_image(struct intel_overlay *overlay, intel_overlay_unmap_regs(overlay, regs); - ret = intel_overlay_continue(overlay, scale_changed); + ret = intel_overlay_continue(overlay, vma, scale_changed); if (ret) goto out_unpin; - i915_gem_track_fb(overlay->vma ? overlay->vma->obj : NULL, - vma->obj, INTEL_FRONTBUFFER_OVERLAY(pipe)); - - overlay->old_vma = overlay->vma; - overlay->vma = vma; - - intel_frontbuffer_flip(dev_priv, INTEL_FRONTBUFFER_OVERLAY(pipe)); - return 0; out_unpin: @@ -1215,6 +1232,7 @@ int intel_overlay_put_image_ioctl(struct drm_device *dev, void *data, mutex_unlock(&dev->struct_mutex); drm_modeset_unlock_all(dev); + i915_gem_object_put(new_bo); kfree(params); From 299b924c1f20fc428221bce79c5318684af79347 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Thu, 29 Dec 2016 14:36:51 +0100 Subject: [PATCH 155/555] ARM: dts: exynos: Add CPU OPPs for Exynos4412 Prime [ Upstream commit 80b7a2e2498bcffb1a79980dfbeb7a1275577b28 ] Add CPU operating points for Exynos4412 Prime (it supports additional 1704MHz & 1600MHz OPPs and 1500MHz OPP is just a regular non-turbo OPP on this SoC). Also update relevant cooling maps to account for new OPPs. ODROID-X2/U2/U3 boards use Exynos4412 Prime SoC version so update their board files accordingly. Based on Hardkernel's kernel for ODROID-X2/U2/U3 boards. Cc: Doug Anderson Cc: Andreas Faerber Cc: Thomas Abraham Cc: Tobias Jakobi Cc: Ben Gamari Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Krzysztof Kozlowski Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- .../boot/dts/exynos4412-odroid-common.dtsi | 4 +- arch/arm/boot/dts/exynos4412-odroidu3.dts | 5 ++- arch/arm/boot/dts/exynos4412-odroidx2.dts | 1 + arch/arm/boot/dts/exynos4412-prime.dtsi | 41 +++++++++++++++++++ arch/arm/boot/dts/exynos4412.dtsi | 2 +- 5 files changed, 48 insertions(+), 5 deletions(-) create mode 100644 arch/arm/boot/dts/exynos4412-prime.dtsi diff --git a/arch/arm/boot/dts/exynos4412-odroid-common.dtsi b/arch/arm/boot/dts/exynos4412-odroid-common.dtsi index 8aa19ba14436..5282d69e55bd 100644 --- a/arch/arm/boot/dts/exynos4412-odroid-common.dtsi +++ b/arch/arm/boot/dts/exynos4412-odroid-common.dtsi @@ -97,11 +97,11 @@ thermal-zones { cpu_thermal: cpu-thermal { cooling-maps { - map0 { + cooling_map0: map0 { /* Corresponds to 800MHz at freq_table */ cooling-device = <&cpu0 7 7>; }; - map1 { + cooling_map1: map1 { /* Corresponds to 200MHz at freq_table */ cooling-device = <&cpu0 13 13>; }; diff --git a/arch/arm/boot/dts/exynos4412-odroidu3.dts b/arch/arm/boot/dts/exynos4412-odroidu3.dts index 99634c54dca9..7504a5aa538e 100644 --- a/arch/arm/boot/dts/exynos4412-odroidu3.dts +++ b/arch/arm/boot/dts/exynos4412-odroidu3.dts @@ -13,6 +13,7 @@ /dts-v1/; #include "exynos4412-odroid-common.dtsi" +#include "exynos4412-prime.dtsi" / { model = "Hardkernel ODROID-U3 board based on Exynos4412"; @@ -47,11 +48,11 @@ cooling-maps { map0 { trip = <&cpu_alert1>; - cooling-device = <&cpu0 7 7>; + cooling-device = <&cpu0 9 9>; }; map1 { trip = <&cpu_alert2>; - cooling-device = <&cpu0 13 13>; + cooling-device = <&cpu0 15 15>; }; map2 { trip = <&cpu_alert0>; diff --git a/arch/arm/boot/dts/exynos4412-odroidx2.dts b/arch/arm/boot/dts/exynos4412-odroidx2.dts index 4d228858f172..d6e92ebc3874 100644 --- a/arch/arm/boot/dts/exynos4412-odroidx2.dts +++ b/arch/arm/boot/dts/exynos4412-odroidx2.dts @@ -12,6 +12,7 @@ */ #include "exynos4412-odroidx.dts" +#include "exynos4412-prime.dtsi" / { model = "Hardkernel ODROID-X2 board based on Exynos4412"; diff --git a/arch/arm/boot/dts/exynos4412-prime.dtsi b/arch/arm/boot/dts/exynos4412-prime.dtsi new file mode 100644 index 000000000000..e75bc170c89c --- /dev/null +++ b/arch/arm/boot/dts/exynos4412-prime.dtsi @@ -0,0 +1,41 @@ +/* + * Samsung's Exynos4412 Prime SoC device tree source + * + * Copyright (c) 2016 Samsung Electronics Co., Ltd. + * http://www.samsung.com + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* + * Exynos4412 Prime SoC revision supports higher CPU frequencies than + * non-Prime version. Therefore we need to update OPPs table and + * thermal maps accordingly. + */ + +&cpu0_opp_1500 { + /delete-property/turbo-mode; +}; + +&cpu0_opp_table { + opp@1600000000 { + opp-hz = /bits/ 64 <1600000000>; + opp-microvolt = <1350000>; + clock-latency-ns = <200000>; + }; + opp@1704000000 { + opp-hz = /bits/ 64 <1704000000>; + opp-microvolt = <1350000>; + clock-latency-ns = <200000>; + }; +}; + +&cooling_map0 { + cooling-device = <&cpu0 9 9>; +}; + +&cooling_map1 { + cooling-device = <&cpu0 15 15>; +}; diff --git a/arch/arm/boot/dts/exynos4412.dtsi b/arch/arm/boot/dts/exynos4412.dtsi index 40beede46e55..3ebdf01d814c 100644 --- a/arch/arm/boot/dts/exynos4412.dtsi +++ b/arch/arm/boot/dts/exynos4412.dtsi @@ -130,7 +130,7 @@ opp-microvolt = <1287500>; clock-latency-ns = <200000>; }; - opp@1500000000 { + cpu0_opp_1500: opp@1500000000 { opp-hz = /bits/ 64 <1500000000>; opp-microvolt = <1350000>; clock-latency-ns = <200000>; From aa07a2ccc80d4a0fbb402e9eb2ba3912a45af807 Mon Sep 17 00:00:00 2001 From: Ondrej Jirman Date: Fri, 25 Nov 2016 01:28:47 +0100 Subject: [PATCH 156/555] clk: sunxi-ng: fix PLL_CPUX adjusting on H3 [ Upstream commit a43c96427e713bea94e9ef50e8be1f493afc0691 ] When adjusting PLL_CPUX on H3, the PLL is temporarily driven too high, and the system becomes unstable (oopses or hangs). Add a notifier to avoid this situation by temporarily switching to a known stable 24 MHz oscillator. Signed-off-by: Ondrej Jirman Tested-by: Lutz Sammer Acked-by: Chen-Yu Tsai Signed-off-by: Maxime Ripard Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/clk/sunxi-ng/ccu-sun8i-h3.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/clk/sunxi-ng/ccu-sun8i-h3.c b/drivers/clk/sunxi-ng/ccu-sun8i-h3.c index 21c427d86f28..a26c8a19fe93 100644 --- a/drivers/clk/sunxi-ng/ccu-sun8i-h3.c +++ b/drivers/clk/sunxi-ng/ccu-sun8i-h3.c @@ -803,6 +803,13 @@ static const struct sunxi_ccu_desc sun8i_h3_ccu_desc = { .num_resets = ARRAY_SIZE(sun8i_h3_ccu_resets), }; +static struct ccu_mux_nb sun8i_h3_cpu_nb = { + .common = &cpux_clk.common, + .cm = &cpux_clk.mux, + .delay_us = 1, /* > 8 clock cycles at 24 MHz */ + .bypass_index = 1, /* index of 24 MHz oscillator */ +}; + static void __init sun8i_h3_ccu_setup(struct device_node *node) { void __iomem *reg; @@ -821,6 +828,9 @@ static void __init sun8i_h3_ccu_setup(struct device_node *node) writel(val | (3 << 16), reg + SUN8I_H3_PLL_AUDIO_REG); sunxi_ccu_probe(node, reg, &sun8i_h3_ccu_desc); + + ccu_mux_notifier_register(pll_cpux_clk.common.hw.clk, + &sun8i_h3_cpu_nb); } CLK_OF_DECLARE(sun8i_h3_ccu, "allwinner,sun8i-h3-ccu", sun8i_h3_ccu_setup); From 3311a304ec62c4bc466656638bf6ea947b4eb7ec Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Thu, 18 Feb 2016 20:06:47 -0800 Subject: [PATCH 157/555] RDS: RDMA: Fix the composite message user notification [ Upstream commit 941f8d55f6d613a460a5e080d25a38509f45eb75 ] When application sends an RDS RDMA composite message consist of RDMA transfer to be followed up by non RDMA payload, it expect to be notified *only* when the full message gets delivered. RDS RDMA notification doesn't behave this way though. Thanks to Venkat for debug and root casuing the issue where only first part of the message(RDMA) was successfully delivered but remainder payload delivery failed. In that case, application should not be notified with a false positive of message delivery success. Fix this case by making sure the user gets notified only after the full message delivery. Reviewed-by: Venkat Venkatsubra Signed-off-by: Santosh Shilimkar Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/rds/ib_send.c | 25 +++++++++++++++---------- net/rds/rdma.c | 10 ++++++++++ net/rds/rds.h | 1 + net/rds/send.c | 4 +++- 4 files changed, 29 insertions(+), 11 deletions(-) diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 84d90c97332f..191098173018 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -69,16 +69,6 @@ static void rds_ib_send_complete(struct rds_message *rm, complete(rm, notify_status); } -static void rds_ib_send_unmap_data(struct rds_ib_connection *ic, - struct rm_data_op *op, - int wc_status) -{ - if (op->op_nents) - ib_dma_unmap_sg(ic->i_cm_id->device, - op->op_sg, op->op_nents, - DMA_TO_DEVICE); -} - static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic, struct rm_rdma_op *op, int wc_status) @@ -139,6 +129,21 @@ static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic, rds_ib_stats_inc(s_ib_atomic_fadd); } +static void rds_ib_send_unmap_data(struct rds_ib_connection *ic, + struct rm_data_op *op, + int wc_status) +{ + struct rds_message *rm = container_of(op, struct rds_message, data); + + if (op->op_nents) + ib_dma_unmap_sg(ic->i_cm_id->device, + op->op_sg, op->op_nents, + DMA_TO_DEVICE); + + if (rm->rdma.op_active && rm->data.op_notify) + rds_ib_send_unmap_rdma(ic, &rm->rdma, wc_status); +} + /* * Unmap the resources associated with a struct send_work. * diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 4c93badeabf2..8d3a851a3476 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -626,6 +626,16 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, } op->op_notifier->n_user_token = args->user_token; op->op_notifier->n_status = RDS_RDMA_SUCCESS; + + /* Enable rmda notification on data operation for composite + * rds messages and make sure notification is enabled only + * for the data operation which follows it so that application + * gets notified only after full message gets delivered. + */ + if (rm->data.op_sg) { + rm->rdma.op_notify = 0; + rm->data.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); + } } /* The cookie contains the R_Key of the remote memory region, and diff --git a/net/rds/rds.h b/net/rds/rds.h index 67ba67c058b1..f107a968ddff 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -414,6 +414,7 @@ struct rds_message { } rdma; struct rm_data_op { unsigned int op_active:1; + unsigned int op_notify:1; unsigned int op_nents; unsigned int op_count; unsigned int op_dmasg; diff --git a/net/rds/send.c b/net/rds/send.c index 896626b9a0ef..f28651b6ae83 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -475,12 +475,14 @@ void rds_rdma_send_complete(struct rds_message *rm, int status) struct rm_rdma_op *ro; struct rds_notifier *notifier; unsigned long flags; + unsigned int notify = 0; spin_lock_irqsave(&rm->m_rs_lock, flags); + notify = rm->rdma.op_notify | rm->data.op_notify; ro = &rm->rdma; if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && - ro->op_active && ro->op_notify && ro->op_notifier) { + ro->op_active && notify && ro->op_notifier) { notifier = ro->op_notifier; rs = rm->m_rs; sock_hold(rds_rs_to_sk(rs)); From d1d3a78f3e8f2d2acff540d8576433eefa1ef622 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Tue, 20 Dec 2016 11:32:39 +0100 Subject: [PATCH 158/555] ARM: dts: r8a7790: Use R-Car Gen 2 fallback binding for msiof nodes [ Upstream commit 654450baf2afba86cf328e1849ccac61ec4630af ] Use recently added R-Car Gen 2 fallback binding for msiof nodes in DT for r8a7790 SoC. This has no run-time effect for the current driver as the initialisation sequence is the same for the SoC-specific binding for r8a7790 and the fallback binding for R-Car Gen 2. Signed-off-by: Simon Horman Reviewed-by: Geert Uytterhoeven Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/arm/boot/dts/r8a7790.dtsi | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/arm/boot/dts/r8a7790.dtsi b/arch/arm/boot/dts/r8a7790.dtsi index 351fcc2f87df..b6c6410ca384 100644 --- a/arch/arm/boot/dts/r8a7790.dtsi +++ b/arch/arm/boot/dts/r8a7790.dtsi @@ -1493,7 +1493,8 @@ }; msiof0: spi@e6e20000 { - compatible = "renesas,msiof-r8a7790"; + compatible = "renesas,msiof-r8a7790", + "renesas,rcar-gen2-msiof"; reg = <0 0xe6e20000 0 0x0064>; interrupts = ; clocks = <&mstp0_clks R8A7790_CLK_MSIOF0>; @@ -1507,7 +1508,8 @@ }; msiof1: spi@e6e10000 { - compatible = "renesas,msiof-r8a7790"; + compatible = "renesas,msiof-r8a7790", + "renesas,rcar-gen2-msiof"; reg = <0 0xe6e10000 0 0x0064>; interrupts = ; clocks = <&mstp2_clks R8A7790_CLK_MSIOF1>; @@ -1521,7 +1523,8 @@ }; msiof2: spi@e6e00000 { - compatible = "renesas,msiof-r8a7790"; + compatible = "renesas,msiof-r8a7790", + "renesas,rcar-gen2-msiof"; reg = <0 0xe6e00000 0 0x0064>; interrupts = ; clocks = <&mstp2_clks R8A7790_CLK_MSIOF2>; @@ -1535,7 +1538,8 @@ }; msiof3: spi@e6c90000 { - compatible = "renesas,msiof-r8a7790"; + compatible = "renesas,msiof-r8a7790", + "renesas,rcar-gen2-msiof"; reg = <0 0xe6c90000 0 0x0064>; interrupts = ; clocks = <&mstp2_clks R8A7790_CLK_MSIOF3>; From 84eaa74d734afb0f13bc6c3a9eb9339d105048d5 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Mon, 7 Nov 2016 11:52:19 +0000 Subject: [PATCH 159/555] MIPS: Ensure bss section ends on a long-aligned address [ Upstream commit 3f00f4d8f083bc61005d0a1ef592b149f5c88bbd ] When clearing the .bss section in kernel_entry we do so using LONG_S instructions, and branch whilst the current write address doesn't equal the end of the .bss section minus the size of a long integer. The .bss section always begins at a long-aligned address and we always increment the write pointer by the size of a long integer - we therefore rely upon the .bss section ending at a long-aligned address. If this is not the case then the long-aligned write address can never be equal to the non-long-aligned end address & we will continue to increment past the end of the .bss section, attempting to zero the rest of memory. Despite this requirement that .bss end at a long-aligned address we pass 0 as the end alignment requirement to the BSS_SECTION macro and thus don't guarantee any particular alignment, allowing us to hit the error condition described above. Fix this by instead passing 8 bytes as the end alignment argument to the BSS_SECTION macro, ensuring that the end of the .bss section is always at least long-aligned. Signed-off-by: Paul Burton Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/14526/ Signed-off-by: Ralf Baechle Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/mips/kernel/vmlinux.lds.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S index d5de67591735..f0a0e6d62be3 100644 --- a/arch/mips/kernel/vmlinux.lds.S +++ b/arch/mips/kernel/vmlinux.lds.S @@ -182,7 +182,7 @@ SECTIONS * Force .bss to 64K alignment so that .bss..swapper_pg_dir * gets that alignment. .sbss should be empty, so there will be * no holes after __init_end. */ - BSS_SECTION(0, 0x10000, 0) + BSS_SECTION(0, 0x10000, 8) _end = . ; From 0e22be793ad292d9a7e4dfb9d757151815df6c59 Mon Sep 17 00:00:00 2001 From: John Crispin Date: Tue, 20 Dec 2016 19:12:43 +0100 Subject: [PATCH 160/555] MIPS: ralink: Fix a typo in the pinmux setup. [ Upstream commit 58181a117d353427127a2e7afc7cf1ab44759828 ] There is a typo inside the pinmux setup code. The function is really called utif and not util. This was recently discovered when people were trying to make the UTIF interface work. Signed-off-by: John Crispin Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/14899/ Signed-off-by: Ralf Baechle Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/mips/ralink/mt7620.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/mips/ralink/mt7620.c b/arch/mips/ralink/mt7620.c index 3c7c9bf57bf3..6f892c1f3ad7 100644 --- a/arch/mips/ralink/mt7620.c +++ b/arch/mips/ralink/mt7620.c @@ -176,7 +176,7 @@ static struct rt2880_pmx_func spi_cs1_grp_mt7628[] = { static struct rt2880_pmx_func spis_grp_mt7628[] = { FUNC("pwm_uart2", 3, 14, 4), - FUNC("util", 2, 14, 4), + FUNC("utif", 2, 14, 4), FUNC("gpio", 1, 14, 4), FUNC("spis", 0, 14, 4), }; @@ -190,28 +190,28 @@ static struct rt2880_pmx_func gpio_grp_mt7628[] = { static struct rt2880_pmx_func p4led_kn_grp_mt7628[] = { FUNC("jtag", 3, 30, 1), - FUNC("util", 2, 30, 1), + FUNC("utif", 2, 30, 1), FUNC("gpio", 1, 30, 1), FUNC("p4led_kn", 0, 30, 1), }; static struct rt2880_pmx_func p3led_kn_grp_mt7628[] = { FUNC("jtag", 3, 31, 1), - FUNC("util", 2, 31, 1), + FUNC("utif", 2, 31, 1), FUNC("gpio", 1, 31, 1), FUNC("p3led_kn", 0, 31, 1), }; static struct rt2880_pmx_func p2led_kn_grp_mt7628[] = { FUNC("jtag", 3, 32, 1), - FUNC("util", 2, 32, 1), + FUNC("utif", 2, 32, 1), FUNC("gpio", 1, 32, 1), FUNC("p2led_kn", 0, 32, 1), }; static struct rt2880_pmx_func p1led_kn_grp_mt7628[] = { FUNC("jtag", 3, 33, 1), - FUNC("util", 2, 33, 1), + FUNC("utif", 2, 33, 1), FUNC("gpio", 1, 33, 1), FUNC("p1led_kn", 0, 33, 1), }; @@ -232,28 +232,28 @@ static struct rt2880_pmx_func wled_kn_grp_mt7628[] = { static struct rt2880_pmx_func p4led_an_grp_mt7628[] = { FUNC("jtag", 3, 39, 1), - FUNC("util", 2, 39, 1), + FUNC("utif", 2, 39, 1), FUNC("gpio", 1, 39, 1), FUNC("p4led_an", 0, 39, 1), }; static struct rt2880_pmx_func p3led_an_grp_mt7628[] = { FUNC("jtag", 3, 40, 1), - FUNC("util", 2, 40, 1), + FUNC("utif", 2, 40, 1), FUNC("gpio", 1, 40, 1), FUNC("p3led_an", 0, 40, 1), }; static struct rt2880_pmx_func p2led_an_grp_mt7628[] = { FUNC("jtag", 3, 41, 1), - FUNC("util", 2, 41, 1), + FUNC("utif", 2, 41, 1), FUNC("gpio", 1, 41, 1), FUNC("p2led_an", 0, 41, 1), }; static struct rt2880_pmx_func p1led_an_grp_mt7628[] = { FUNC("jtag", 3, 42, 1), - FUNC("util", 2, 42, 1), + FUNC("utif", 2, 42, 1), FUNC("gpio", 1, 42, 1), FUNC("p1led_an", 0, 42, 1), }; From 0cde56d3b6721e8a69260861d268a8adcc6fcd33 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 22 Dec 2016 23:52:58 +0000 Subject: [PATCH 161/555] MIPS: ralink: Fix incorrect assignment on ralink_soc [ Upstream commit 08d90c81b714482dceb5323d14f6617bcf55ee61 ] ralink_soc sould be assigned to RT3883_SOC, replace incorrect comparision with assignment. Signed-off-by: Colin Ian King Fixes: 418d29c87061 ("MIPS: ralink: Unify SoC id handling") Cc: John Crispin Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/14903/ Signed-off-by: Ralf Baechle Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/mips/ralink/rt3883.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/ralink/rt3883.c b/arch/mips/ralink/rt3883.c index 9e4631acfcb5..3e68e35daf21 100644 --- a/arch/mips/ralink/rt3883.c +++ b/arch/mips/ralink/rt3883.c @@ -145,5 +145,5 @@ void prom_soc_init(struct ralink_soc_info *soc_info) rt2880_pinmux_data = rt3883_pinmux_data; - ralink_soc == RT3883_SOC; + ralink_soc = RT3883_SOC; } From 36c56ac0f8977ac189d27a7aa401ea9dca678c5d Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 14 Dec 2016 17:38:50 +0100 Subject: [PATCH 162/555] power: supply: axp288_fuel_gauge: Fix fuel_gauge_reg_readb return on error [ Upstream commit 6f074bc878dc9b00c0df0bf3a8cb1d9e294cd881 ] If reading the register fails, return the actual error code, instead of the uninitialized val variable; Signed-off-by: Hans de Goede Signed-off-by: Sebastian Reichel Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/power/supply/axp288_fuel_gauge.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/power/supply/axp288_fuel_gauge.c b/drivers/power/supply/axp288_fuel_gauge.c index 5bdde692f724..f62f9dfea984 100644 --- a/drivers/power/supply/axp288_fuel_gauge.c +++ b/drivers/power/supply/axp288_fuel_gauge.c @@ -169,8 +169,10 @@ static int fuel_gauge_reg_readb(struct axp288_fg_info *info, int reg) break; } - if (ret < 0) + if (ret < 0) { dev_err(&info->pdev->dev, "axp288 reg read err:%d\n", ret); + return ret; + } return val; } From bc9ad17c7af2d04e376246f245a0c95c652db3ce Mon Sep 17 00:00:00 2001 From: Jitendra Bhivare Date: Tue, 13 Dec 2016 15:56:03 +0530 Subject: [PATCH 163/555] scsi: be2iscsi: Add checks to validate CID alloc/free [ Upstream commit 413f365657a8b9669bd0ba3628e9fde9ce63604e ] Set CID slot to 0xffff to indicate empty. Check if connection already exists in conn_table before binding. Check if endpoint already NULL before putting back CID. Break ep->conn link in free_ep to ignore completions after freeing. Signed-off-by: Jitendra Bhivare Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/be2iscsi/be_iscsi.c | 161 ++++++++++++++++--------------- drivers/scsi/be2iscsi/be_main.c | 7 +- drivers/scsi/be2iscsi/be_main.h | 1 + 3 files changed, 86 insertions(+), 83 deletions(-) diff --git a/drivers/scsi/be2iscsi/be_iscsi.c b/drivers/scsi/be2iscsi/be_iscsi.c index ba258217614e..963c732a3d24 100644 --- a/drivers/scsi/be2iscsi/be_iscsi.c +++ b/drivers/scsi/be2iscsi/be_iscsi.c @@ -165,33 +165,6 @@ beiscsi_conn_create(struct iscsi_cls_session *cls_session, u32 cid) return cls_conn; } -/** - * beiscsi_bindconn_cid - Bind the beiscsi_conn with phba connection table - * @beiscsi_conn: The pointer to beiscsi_conn structure - * @phba: The phba instance - * @cid: The cid to free - */ -static int beiscsi_bindconn_cid(struct beiscsi_hba *phba, - struct beiscsi_conn *beiscsi_conn, - unsigned int cid) -{ - uint16_t cri_index = BE_GET_CRI_FROM_CID(cid); - - if (phba->conn_table[cri_index]) { - beiscsi_log(phba, KERN_ERR, BEISCSI_LOG_CONFIG, - "BS_%d : Connection table already occupied. Detected clash\n"); - - return -EINVAL; - } else { - beiscsi_log(phba, KERN_INFO, BEISCSI_LOG_CONFIG, - "BS_%d : phba->conn_table[%d]=%p(beiscsi_conn)\n", - cri_index, beiscsi_conn); - - phba->conn_table[cri_index] = beiscsi_conn; - } - return 0; -} - /** * beiscsi_conn_bind - Binds iscsi session/connection with TCP connection * @cls_session: pointer to iscsi cls session @@ -212,6 +185,7 @@ int beiscsi_conn_bind(struct iscsi_cls_session *cls_session, struct hwi_wrb_context *pwrb_context; struct beiscsi_endpoint *beiscsi_ep; struct iscsi_endpoint *ep; + uint16_t cri_index; ep = iscsi_lookup_endpoint(transport_fd); if (!ep) @@ -229,20 +203,34 @@ int beiscsi_conn_bind(struct iscsi_cls_session *cls_session, return -EEXIST; } - - pwrb_context = &phwi_ctrlr->wrb_context[BE_GET_CRI_FROM_CID( - beiscsi_ep->ep_cid)]; + cri_index = BE_GET_CRI_FROM_CID(beiscsi_ep->ep_cid); + if (phba->conn_table[cri_index]) { + if (beiscsi_conn != phba->conn_table[cri_index] || + beiscsi_ep != phba->conn_table[cri_index]->ep) { + __beiscsi_log(phba, KERN_ERR, + "BS_%d : conn_table not empty at %u: cid %u conn %p:%p\n", + cri_index, + beiscsi_ep->ep_cid, + beiscsi_conn, + phba->conn_table[cri_index]); + return -EINVAL; + } + } beiscsi_conn->beiscsi_conn_cid = beiscsi_ep->ep_cid; beiscsi_conn->ep = beiscsi_ep; beiscsi_ep->conn = beiscsi_conn; + /** + * Each connection is associated with a WRBQ kept in wrb_context. + * Store doorbell offset for transmit path. + */ + pwrb_context = &phwi_ctrlr->wrb_context[cri_index]; beiscsi_conn->doorbell_offset = pwrb_context->doorbell_offset; - beiscsi_log(phba, KERN_INFO, BEISCSI_LOG_CONFIG, - "BS_%d : beiscsi_conn=%p conn=%p ep_cid=%d\n", - beiscsi_conn, conn, beiscsi_ep->ep_cid); - - return beiscsi_bindconn_cid(phba, beiscsi_conn, beiscsi_ep->ep_cid); + "BS_%d : cid %d phba->conn_table[%u]=%p\n", + beiscsi_ep->ep_cid, cri_index, beiscsi_conn); + phba->conn_table[cri_index] = beiscsi_conn; + return 0; } static int beiscsi_iface_create_ipv4(struct beiscsi_hba *phba) @@ -973,9 +961,9 @@ int beiscsi_conn_start(struct iscsi_cls_conn *cls_conn) */ static int beiscsi_get_cid(struct beiscsi_hba *phba) { - unsigned short cid = 0xFFFF, cid_from_ulp; - struct ulp_cid_info *cid_info = NULL; uint16_t cid_avlbl_ulp0, cid_avlbl_ulp1; + unsigned short cid, cid_from_ulp; + struct ulp_cid_info *cid_info; /* Find the ULP which has more CID available */ cid_avlbl_ulp0 = (phba->cid_array_info[BEISCSI_ULP0]) ? @@ -984,20 +972,27 @@ static int beiscsi_get_cid(struct beiscsi_hba *phba) BEISCSI_ULP1_AVLBL_CID(phba) : 0; cid_from_ulp = (cid_avlbl_ulp0 > cid_avlbl_ulp1) ? BEISCSI_ULP0 : BEISCSI_ULP1; + /** + * If iSCSI protocol is loaded only on ULP 0, and when cid_avlbl_ulp + * is ZERO for both, ULP 1 is returned. + * Check if ULP is loaded before getting new CID. + */ + if (!test_bit(cid_from_ulp, (void *)&phba->fw_config.ulp_supported)) + return BE_INVALID_CID; - if (test_bit(cid_from_ulp, (void *)&phba->fw_config.ulp_supported)) { - cid_info = phba->cid_array_info[cid_from_ulp]; - if (!cid_info->avlbl_cids) - return cid; - - cid = cid_info->cid_array[cid_info->cid_alloc++]; - - if (cid_info->cid_alloc == BEISCSI_GET_CID_COUNT( - phba, cid_from_ulp)) - cid_info->cid_alloc = 0; - - cid_info->avlbl_cids--; + cid_info = phba->cid_array_info[cid_from_ulp]; + cid = cid_info->cid_array[cid_info->cid_alloc]; + if (!cid_info->avlbl_cids || cid == BE_INVALID_CID) { + __beiscsi_log(phba, KERN_ERR, + "BS_%d : failed to get cid: available %u:%u\n", + cid_info->avlbl_cids, cid_info->cid_free); + return BE_INVALID_CID; } + /* empty the slot */ + cid_info->cid_array[cid_info->cid_alloc++] = BE_INVALID_CID; + if (cid_info->cid_alloc == BEISCSI_GET_CID_COUNT(phba, cid_from_ulp)) + cid_info->cid_alloc = 0; + cid_info->avlbl_cids--; return cid; } @@ -1008,22 +1003,28 @@ static int beiscsi_get_cid(struct beiscsi_hba *phba) */ static void beiscsi_put_cid(struct beiscsi_hba *phba, unsigned short cid) { - uint16_t cid_post_ulp; - struct hwi_controller *phwi_ctrlr; - struct hwi_wrb_context *pwrb_context; - struct ulp_cid_info *cid_info = NULL; uint16_t cri_index = BE_GET_CRI_FROM_CID(cid); + struct hwi_wrb_context *pwrb_context; + struct hwi_controller *phwi_ctrlr; + struct ulp_cid_info *cid_info; + uint16_t cid_post_ulp; phwi_ctrlr = phba->phwi_ctrlr; pwrb_context = &phwi_ctrlr->wrb_context[cri_index]; cid_post_ulp = pwrb_context->ulp_num; cid_info = phba->cid_array_info[cid_post_ulp]; - cid_info->avlbl_cids++; - + /* fill only in empty slot */ + if (cid_info->cid_array[cid_info->cid_free] != BE_INVALID_CID) { + __beiscsi_log(phba, KERN_ERR, + "BS_%d : failed to put cid %u: available %u:%u\n", + cid, cid_info->avlbl_cids, cid_info->cid_free); + return; + } cid_info->cid_array[cid_info->cid_free++] = cid; if (cid_info->cid_free == BEISCSI_GET_CID_COUNT(phba, cid_post_ulp)) cid_info->cid_free = 0; + cid_info->avlbl_cids++; } /** @@ -1037,8 +1038,8 @@ static void beiscsi_free_ep(struct beiscsi_endpoint *beiscsi_ep) beiscsi_put_cid(phba, beiscsi_ep->ep_cid); beiscsi_ep->phba = NULL; - phba->ep_array[BE_GET_CRI_FROM_CID - (beiscsi_ep->ep_cid)] = NULL; + /* clear this to track freeing in beiscsi_ep_disconnect */ + phba->ep_array[BE_GET_CRI_FROM_CID(beiscsi_ep->ep_cid)] = NULL; /** * Check if any connection resource allocated by driver @@ -1049,6 +1050,11 @@ static void beiscsi_free_ep(struct beiscsi_endpoint *beiscsi_ep) return; beiscsi_conn = beiscsi_ep->conn; + /** + * Break ep->conn link here so that completions after + * this are ignored. + */ + beiscsi_ep->conn = NULL; if (beiscsi_conn->login_in_progress) { beiscsi_free_mgmt_task_handles(beiscsi_conn, beiscsi_conn->task); @@ -1079,7 +1085,7 @@ static int beiscsi_open_conn(struct iscsi_endpoint *ep, "BS_%d : In beiscsi_open_conn\n"); beiscsi_ep->ep_cid = beiscsi_get_cid(phba); - if (beiscsi_ep->ep_cid == 0xFFFF) { + if (beiscsi_ep->ep_cid == BE_INVALID_CID) { beiscsi_log(phba, KERN_ERR, BEISCSI_LOG_CONFIG, "BS_%d : No free cid available\n"); return ret; @@ -1284,26 +1290,6 @@ static int beiscsi_close_conn(struct beiscsi_endpoint *beiscsi_ep, int flag) return ret; } -/** - * beiscsi_unbind_conn_to_cid - Unbind the beiscsi_conn from phba conn table - * @phba: The phba instance - * @cid: The cid to free - */ -static int beiscsi_unbind_conn_to_cid(struct beiscsi_hba *phba, - unsigned int cid) -{ - uint16_t cri_index = BE_GET_CRI_FROM_CID(cid); - - if (phba->conn_table[cri_index]) - phba->conn_table[cri_index] = NULL; - else { - beiscsi_log(phba, KERN_INFO, BEISCSI_LOG_CONFIG, - "BS_%d : Connection table Not occupied.\n"); - return -EINVAL; - } - return 0; -} - /** * beiscsi_ep_disconnect - Tears down the TCP connection * @ep: endpoint to be used @@ -1318,13 +1304,23 @@ void beiscsi_ep_disconnect(struct iscsi_endpoint *ep) unsigned int tag; uint8_t mgmt_invalidate_flag, tcp_upload_flag; unsigned short savecfg_flag = CMD_ISCSI_SESSION_SAVE_CFG_ON_FLASH; + uint16_t cri_index; beiscsi_ep = ep->dd_data; phba = beiscsi_ep->phba; beiscsi_log(phba, KERN_INFO, BEISCSI_LOG_CONFIG, - "BS_%d : In beiscsi_ep_disconnect for ep_cid = %d\n", + "BS_%d : In beiscsi_ep_disconnect for ep_cid = %u\n", beiscsi_ep->ep_cid); + cri_index = BE_GET_CRI_FROM_CID(beiscsi_ep->ep_cid); + if (!phba->ep_array[cri_index]) { + __beiscsi_log(phba, KERN_ERR, + "BS_%d : ep_array at %u cid %u empty\n", + cri_index, + beiscsi_ep->ep_cid); + return; + } + if (beiscsi_ep->conn) { beiscsi_conn = beiscsi_ep->conn; iscsi_suspend_queue(beiscsi_conn->conn); @@ -1356,7 +1352,12 @@ void beiscsi_ep_disconnect(struct iscsi_endpoint *ep) free_ep: msleep(BEISCSI_LOGOUT_SYNC_DELAY); beiscsi_free_ep(beiscsi_ep); - beiscsi_unbind_conn_to_cid(phba, beiscsi_ep->ep_cid); + if (!phba->conn_table[cri_index]) + __beiscsi_log(phba, KERN_ERR, + "BS_%d : conn_table empty at %u: cid %u\n", + cri_index, + beiscsi_ep->ep_cid); + phba->conn_table[cri_index] = NULL; iscsi_destroy_endpoint(beiscsi_ep->openiscsi_ep); } diff --git a/drivers/scsi/be2iscsi/be_main.c b/drivers/scsi/be2iscsi/be_main.c index d9239c2d49b1..741cc96379cb 100644 --- a/drivers/scsi/be2iscsi/be_main.c +++ b/drivers/scsi/be2iscsi/be_main.c @@ -4085,9 +4085,10 @@ static int hba_setup_cid_tbls(struct beiscsi_hba *phba) } /* Allocate memory for CID array */ - ptr_cid_info->cid_array = kzalloc(sizeof(void *) * - BEISCSI_GET_CID_COUNT(phba, - ulp_num), GFP_KERNEL); + ptr_cid_info->cid_array = + kcalloc(BEISCSI_GET_CID_COUNT(phba, ulp_num), + sizeof(*ptr_cid_info->cid_array), + GFP_KERNEL); if (!ptr_cid_info->cid_array) { beiscsi_log(phba, KERN_ERR, BEISCSI_LOG_INIT, "BM_%d : Failed to allocate memory" diff --git a/drivers/scsi/be2iscsi/be_main.h b/drivers/scsi/be2iscsi/be_main.h index 6376657e45f7..02d00ab6d981 100644 --- a/drivers/scsi/be2iscsi/be_main.h +++ b/drivers/scsi/be2iscsi/be_main.h @@ -358,6 +358,7 @@ struct beiscsi_hba { unsigned int age; struct list_head hba_queue; #define BE_MAX_SESSION 2048 +#define BE_INVALID_CID 0xffff #define BE_SET_CID_TO_CRI(cri_index, cid) \ (phba->cid_to_cri_map[cid] = cri_index) #define BE_GET_CRI_FROM_CID(cid) (phba->cid_to_cri_map[cid]) From 484e3e7934491fc0a2e72951d3a6eec24d0ad0da Mon Sep 17 00:00:00 2001 From: Marcin Niestroj Date: Fri, 9 Dec 2016 12:33:27 +0100 Subject: [PATCH 164/555] ARM: dts: am335x-chilisom: Wakeup from RTC-only state by power on event [ Upstream commit ca244a83ecc7f0a9242ee2116e622cb6d7ec2a90 ] On chiliSOM TPS65217 nWAKEUP pin is connected to AM335x internal RTC EXT_WAKEUP input. In RTC-only state TPS65217 is notifying about power on events (such as power buton presses) by setting nWAKEUP output low. After that it waits 5s for proper device boot. Currently it doesn't happen, as the processor doesn't listen for such events. Consequently TPS65217 changes state from SLEEP (RTC-only state) to OFF. Enable EXT_WAKEUP input of AM335x's RTC, so the processor can properly detect power on events and recover immediately from RTC-only states, without powering off RTC and losing time. Signed-off-by: Marcin Niestroj Signed-off-by: Tony Lindgren Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/arm/boot/dts/am335x-chilisom.dtsi | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm/boot/dts/am335x-chilisom.dtsi b/arch/arm/boot/dts/am335x-chilisom.dtsi index f9ee5859c154..1b43ebd08b38 100644 --- a/arch/arm/boot/dts/am335x-chilisom.dtsi +++ b/arch/arm/boot/dts/am335x-chilisom.dtsi @@ -124,6 +124,14 @@ &rtc { system-power-controller; + + pinctrl-0 = <&ext_wakeup>; + pinctrl-names = "default"; + + ext_wakeup: ext-wakeup { + pins = "ext_wakeup0"; + input-enable; + }; }; /* NAND Flash */ From bc438831606acd445bad9f7df20ac6ce87d5b490 Mon Sep 17 00:00:00 2001 From: Guilherme G Piccoli Date: Thu, 10 Nov 2016 16:46:43 -0200 Subject: [PATCH 165/555] igb: re-assign hw address pointer on reset after PCI error [ Upstream commit 69b97cf6dbce7403845a28bbc75d57f5be7b12ac ] Whenever the igb driver detects the result of a read operation returns a value composed only by F's (like 0xFFFFFFFF), it will detach the net_device, clear the hw_addr pointer and warn to the user that adapter's link is lost - those steps happen on igb_rd32(). In case a PCI error happens on Power architecture, there's a recovery mechanism called EEH, that will reset the PCI slot and call driver's handlers to reset the adapter and network functionality as well. We observed that once hw_addr is NULL after the error is detected on igb_rd32(), it's never assigned back, so in the process of resetting the network functionality we got a NULL pointer dereference in both igb_configure_tx_ring() and igb_configure_rx_ring(). In order to avoid such bug, this patch re-assigns the hw_addr value in the slot_reset handler. Reported-by: Anthony H Thai Reported-by: Harsha Thyagaraja Signed-off-by: Guilherme G Piccoli Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/igb/igb_main.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 9affd7c198bd..6a62447fe377 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -7882,6 +7882,11 @@ static pci_ers_result_t igb_io_slot_reset(struct pci_dev *pdev) pci_enable_wake(pdev, PCI_D3hot, 0); pci_enable_wake(pdev, PCI_D3cold, 0); + /* In case of PCI error, adapter lose its HW address + * so we should re-assign it here. + */ + hw->hw_addr = adapter->io_addr; + igb_reset(adapter); wr32(E1000_WUS, ~0); result = PCI_ERS_RESULT_RECOVERED; From 6798f079b0a54caf69e1bde89cf997a2ade38f2e Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 19 Dec 2016 01:13:11 +0100 Subject: [PATCH 166/555] extcon: axp288: Use vbus-valid instead of -present to determine cable presence [ Upstream commit 5757aca10146061befd168dab37fb0db1ccd8f73 ] The vbus-present bit in the power status register also gets set to 1 when a usb-host cable (id-pin shorted to ground) is plugged in and a 5v boost converter is supplying 5v to the otg usb bus. This causes a "disconnect or unknown or ID event" warning in dmesg as well as the extcon device to report the last detected charger cable type as being connected even though none is connected. This commit switches to checking the vbus-valid bit instead, which is only 1 when both vbus is present and the vbus-path is enabled in the vbus-path control register (the vbus-path gets disabled when a usb-host cable is detected, to avoid the pmic drawing power from the 5v boost converter). Signed-off-by: Hans de Goede Acked-by: Chanwoo Choi Signed-off-by: Chanwoo Choi Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/extcon/extcon-axp288.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/extcon/extcon-axp288.c b/drivers/extcon/extcon-axp288.c index 42f41e808292..27f67c28e700 100644 --- a/drivers/extcon/extcon-axp288.c +++ b/drivers/extcon/extcon-axp288.c @@ -168,7 +168,7 @@ static int axp288_handle_chrg_det_event(struct axp288_extcon_info *info) return ret; } - vbus_attach = (pwr_stat & PS_STAT_VBUS_PRESENT); + vbus_attach = (pwr_stat & PS_STAT_VBUS_VALID); if (!vbus_attach) goto notify_otg; From bed7533196b2844088004975279c3674c7c424c5 Mon Sep 17 00:00:00 2001 From: Jiancheng Xue Date: Wed, 30 Nov 2016 09:03:32 +0800 Subject: [PATCH 167/555] reset: ti_syscon: fix a ti_syscon_reset_status issue [ Upstream commit 5987b4bf512101137fa60c5c0ccac3db51541221 ] If STATUS_SET was not set, ti_syscon_reset_status would always return 0 no matter whether the status_bit was set or not. Signed-off-by: Jiancheng Xue Fixes: cc7c2bb1493c ("reset: add TI SYSCON based reset driver") Signed-off-by: Philipp Zabel Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/reset/reset-ti-syscon.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/reset/reset-ti-syscon.c b/drivers/reset/reset-ti-syscon.c index 47f0ffd3b013..1799fd423901 100644 --- a/drivers/reset/reset-ti-syscon.c +++ b/drivers/reset/reset-ti-syscon.c @@ -154,8 +154,8 @@ static int ti_syscon_reset_status(struct reset_controller_dev *rcdev, if (ret) return ret; - return (reset_state & BIT(control->status_bit)) && - (control->flags & STATUS_SET); + return !(reset_state & BIT(control->status_bit)) == + !(control->flags & STATUS_SET); } static struct reset_control_ops ti_syscon_reset_ops = { From 81c96182466200aa403fa863ff503af0b275a93f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20S=C3=B6derlund?= Date: Mon, 9 Jan 2017 16:34:04 +0100 Subject: [PATCH 168/555] sh_eth: use correct name for ECMR_MPDE bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 6dcf45e514974a1ff10755015b5e06746a033e5f ] This bit was wrongly named due to a typo, Sergei checked the SH7734/63 manuals and this bit should be named MPDE. Suggested-by: Sergei Shtylyov Signed-off-by: Niklas Söderlund Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/renesas/sh_eth.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/renesas/sh_eth.h b/drivers/net/ethernet/renesas/sh_eth.h index d050f37f3e0f..5024280f5af2 100644 --- a/drivers/net/ethernet/renesas/sh_eth.h +++ b/drivers/net/ethernet/renesas/sh_eth.h @@ -339,7 +339,7 @@ enum FELIC_MODE_BIT { ECMR_DPAD = 0x00200000, ECMR_RZPF = 0x00100000, ECMR_ZPF = 0x00080000, ECMR_PFR = 0x00040000, ECMR_RXF = 0x00020000, ECMR_TXF = 0x00010000, ECMR_MCT = 0x00002000, ECMR_PRCEF = 0x00001000, - ECMR_PMDE = 0x00000200, ECMR_RE = 0x00000040, ECMR_TE = 0x00000020, + ECMR_MPDE = 0x00000200, ECMR_RE = 0x00000040, ECMR_TE = 0x00000020, ECMR_RTM = 0x00000010, ECMR_ILB = 0x00000008, ECMR_ELB = 0x00000004, ECMR_DM = 0x00000002, ECMR_PRM = 0x00000001, }; From 952d3c52bd8528e5a3ab91c261c37fd9da789f98 Mon Sep 17 00:00:00 2001 From: Jose Abreu Date: Mon, 12 Dec 2016 11:08:55 +0000 Subject: [PATCH 169/555] clk/axs10x: Clear init field in driver probe [ Upstream commit 6205406cf6f282d622f31de25036e6d1ab3a2ff5 ] Init field must be cleared in driver probe as this structure is not dinamically allocated. If not, wrong flags can be passed to core. Signed-off-by: Jose Abreu Cc: Carlos Palminha Cc: Stephen Boyd Cc: Michael Turquette Cc: linux-clk@vger.kernel.org Cc: linux-kernel@vger.kernel.org Fixes: 923587aafc2c ("clk/axs10x: Add I2S PLL clock driver") Signed-off-by: Michael Turquette Link: lkml.kernel.org/r/040cc9afdfa0e95ce7a01c406ff427ef7dc0c0fd.1481540717.git.joabreu@synopsys.com Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/clk/axs10x/i2s_pll_clock.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/clk/axs10x/i2s_pll_clock.c b/drivers/clk/axs10x/i2s_pll_clock.c index 411310d29581..02d3bcd6216c 100644 --- a/drivers/clk/axs10x/i2s_pll_clock.c +++ b/drivers/clk/axs10x/i2s_pll_clock.c @@ -182,6 +182,7 @@ static int i2s_pll_clk_probe(struct platform_device *pdev) if (IS_ERR(pll_clk->base)) return PTR_ERR(pll_clk->base); + memset(&init, 0, sizeof(init)); clk_name = node->name; init.name = clk_name; init.ops = &i2s_pll_ops; From d74f860528fbd0cde7127b617d7695bbc68d8e0f Mon Sep 17 00:00:00 2001 From: John Crispin Date: Tue, 20 Dec 2016 19:08:58 +0100 Subject: [PATCH 170/555] usb: make the MTK XHCI driver compile for older MIPS SoCs [ Upstream commit 808cf33d4817c730008de9b2736b357708a3d7f6 ] The MIPS based MT7621 shares the same XHCI core as the newer generation of ARM based SoCs. The driver works out of the box and we only need to make it buildable in Kconfig. Signed-off-by: John Crispin Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/host/Kconfig b/drivers/usb/host/Kconfig index 0b80cee30da4..eb121b2a55d4 100644 --- a/drivers/usb/host/Kconfig +++ b/drivers/usb/host/Kconfig @@ -45,9 +45,9 @@ config USB_XHCI_PLATFORM If unsure, say N. config USB_XHCI_MTK - tristate "xHCI support for Mediatek MT65xx" + tristate "xHCI support for Mediatek MT65xx/MT7621" select MFD_SYSCON - depends on ARCH_MEDIATEK || COMPILE_TEST + depends on (MIPS && SOC_MT7621) || ARCH_MEDIATEK || COMPILE_TEST ---help--- Say 'Y' to enable the support for the xHCI host controller found in Mediatek MT65xx SoCs. From 772384d7ec40c6a32d04b30143796a51944f56fd Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 27 Dec 2016 14:15:07 -0800 Subject: [PATCH 171/555] hwmon: (gl520sm) Fix overflows and crash seen when writing into limit attributes [ Upstream commit 87cdfa9d60f4f40e6d71b04b10b36d9df3c89282 ] Writes into limit attributes can overflow due to multplications and additions with unbound input values. Writing into fan limit attributes can result in a crash with a division by zero if very large values are written and the fan divider is larger than 1. Signed-off-by: Guenter Roeck Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/hwmon/gl520sm.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/drivers/hwmon/gl520sm.c b/drivers/hwmon/gl520sm.c index dee93ec87d02..84e0994aafdd 100644 --- a/drivers/hwmon/gl520sm.c +++ b/drivers/hwmon/gl520sm.c @@ -208,11 +208,13 @@ static ssize_t get_cpu_vid(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR(cpu0_vid, S_IRUGO, get_cpu_vid, NULL); -#define VDD_FROM_REG(val) (((val) * 95 + 2) / 4) -#define VDD_TO_REG(val) clamp_val((((val) * 4 + 47) / 95), 0, 255) +#define VDD_FROM_REG(val) DIV_ROUND_CLOSEST((val) * 95, 4) +#define VDD_CLAMP(val) clamp_val(val, 0, 255 * 95 / 4) +#define VDD_TO_REG(val) DIV_ROUND_CLOSEST(VDD_CLAMP(val) * 4, 95) -#define IN_FROM_REG(val) ((val) * 19) -#define IN_TO_REG(val) clamp_val((((val) + 9) / 19), 0, 255) +#define IN_FROM_REG(val) ((val) * 19) +#define IN_CLAMP(val) clamp_val(val, 0, 255 * 19) +#define IN_TO_REG(val) DIV_ROUND_CLOSEST(IN_CLAMP(val), 19) static ssize_t get_in_input(struct device *dev, struct device_attribute *attr, char *buf) @@ -349,8 +351,13 @@ static SENSOR_DEVICE_ATTR(in4_max, S_IRUGO | S_IWUSR, #define DIV_FROM_REG(val) (1 << (val)) #define FAN_FROM_REG(val, div) ((val) == 0 ? 0 : (480000 / ((val) << (div)))) -#define FAN_TO_REG(val, div) ((val) <= 0 ? 0 : \ - clamp_val((480000 + ((val) << ((div)-1))) / ((val) << (div)), 1, 255)) + +#define FAN_BASE(div) (480000 >> (div)) +#define FAN_CLAMP(val, div) clamp_val(val, FAN_BASE(div) / 255, \ + FAN_BASE(div)) +#define FAN_TO_REG(val, div) ((val) == 0 ? 0 : \ + DIV_ROUND_CLOSEST(480000, \ + FAN_CLAMP(val, div) << (div))) static ssize_t get_fan_input(struct device *dev, struct device_attribute *attr, char *buf) @@ -513,9 +520,9 @@ static SENSOR_DEVICE_ATTR(fan2_div, S_IRUGO | S_IWUSR, static DEVICE_ATTR(fan1_off, S_IRUGO | S_IWUSR, get_fan_off, set_fan_off); -#define TEMP_FROM_REG(val) (((val) - 130) * 1000) -#define TEMP_TO_REG(val) clamp_val(((((val) < 0 ? \ - (val) - 500 : (val) + 500) / 1000) + 130), 0, 255) +#define TEMP_FROM_REG(val) (((val) - 130) * 1000) +#define TEMP_CLAMP(val) clamp_val(val, -130000, 125000) +#define TEMP_TO_REG(val) (DIV_ROUND_CLOSEST(TEMP_CLAMP(val), 1000) + 130) static ssize_t get_temp_input(struct device *dev, struct device_attribute *attr, char *buf) From 259f317db758ed4e3f325ea1caa950db5a9cd4ce Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Mon, 2 Jan 2017 09:45:45 -0300 Subject: [PATCH 172/555] iio: adc: imx25-gcq: Fix module autoload [ Upstream commit 8f0d7daf53972da0004f7a5a4d938c85333db300 ] If the driver is built as a module, autoload won't work because the module alias information is not filled. So user-space can't match the registered device with the corresponding module. Export the module alias information using the MODULE_DEVICE_TABLE() macro. Before this patch: $ modinfo drivers/iio/adc/fsl-imx25-gcq.ko | grep alias $ After this patch: $ modinfo drivers/iio/adc/fsl-imx25-gcq.ko | grep alias alias: of:N*T*Cfsl,imx25-gcqC* alias: of:N*T*Cfsl,imx25-gcq Signed-off-by: Javier Martinez Canillas Signed-off-by: Jonathan Cameron Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/iio/adc/fsl-imx25-gcq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iio/adc/fsl-imx25-gcq.c b/drivers/iio/adc/fsl-imx25-gcq.c index 72b32c1ab257..ea264fa9e567 100644 --- a/drivers/iio/adc/fsl-imx25-gcq.c +++ b/drivers/iio/adc/fsl-imx25-gcq.c @@ -401,6 +401,7 @@ static const struct of_device_id mx25_gcq_ids[] = { { .compatible = "fsl,imx25-gcq", }, { /* Sentinel */ } }; +MODULE_DEVICE_TABLE(of, mx25_gcq_ids); static struct platform_driver mx25_gcq_driver = { .driver = { From ff9b56037dd7f0c6d46186122473388f1904e445 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 14 Dec 2016 14:55:25 +0100 Subject: [PATCH 173/555] iio: adc: axp288: Drop bogus AXP288_ADC_TS_PIN_CTRL register modifications [ Upstream commit fa2849e9649b5180ffc4cb3c3b005261c403093a ] For some reason the axp288_adc driver was modifying the AXP288_ADC_TS_PIN_CTRL register, changing bits 0-1 depending on whether the GP_ADC channel or another channel was written. These bits control when a bias current is send to the TS_PIN, the GP_ADC has its own pin and a separate bit in another register to control the bias current. Not only does changing when to enable the TS_PIN bias current (always or only when sampling) when reading the GP_ADC make no sense at all, the code is modifying these bits is writing the entire register, assuming that all the other bits have their default value. So if the firmware has configured a different bias-current for either pin, then that change gets clobbered by the write, likewise if the firmware has set bit 2 to indicate that the battery has no thermal sensor, this will get clobbered by the write. This commit fixes all this, by simply removing all writes to the AXP288_ADC_TS_PIN_CTRL register, they are not needed to read the GP_ADC pin, and can actually be harmful. Signed-off-by: Hans de Goede Acked-by: Chen-Yu Tsai Signed-off-by: Jonathan Cameron Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/iio/adc/axp288_adc.c | 32 +------------------------------- 1 file changed, 1 insertion(+), 31 deletions(-) diff --git a/drivers/iio/adc/axp288_adc.c b/drivers/iio/adc/axp288_adc.c index 7fd24949c0c1..64799ad7ebad 100644 --- a/drivers/iio/adc/axp288_adc.c +++ b/drivers/iio/adc/axp288_adc.c @@ -28,8 +28,6 @@ #include #define AXP288_ADC_EN_MASK 0xF1 -#define AXP288_ADC_TS_PIN_GPADC 0xF2 -#define AXP288_ADC_TS_PIN_ON 0xF3 enum axp288_adc_id { AXP288_ADC_TS, @@ -123,16 +121,6 @@ static int axp288_adc_read_channel(int *val, unsigned long address, return IIO_VAL_INT; } -static int axp288_adc_set_ts(struct regmap *regmap, unsigned int mode, - unsigned long address) -{ - /* channels other than GPADC do not need to switch TS pin */ - if (address != AXP288_GP_ADC_H) - return 0; - - return regmap_write(regmap, AXP288_ADC_TS_PIN_CTRL, mode); -} - static int axp288_adc_read_raw(struct iio_dev *indio_dev, struct iio_chan_spec const *chan, int *val, int *val2, long mask) @@ -143,16 +131,7 @@ static int axp288_adc_read_raw(struct iio_dev *indio_dev, mutex_lock(&indio_dev->mlock); switch (mask) { case IIO_CHAN_INFO_RAW: - if (axp288_adc_set_ts(info->regmap, AXP288_ADC_TS_PIN_GPADC, - chan->address)) { - dev_err(&indio_dev->dev, "GPADC mode\n"); - ret = -EINVAL; - break; - } ret = axp288_adc_read_channel(val, chan->address, info->regmap); - if (axp288_adc_set_ts(info->regmap, AXP288_ADC_TS_PIN_ON, - chan->address)) - dev_err(&indio_dev->dev, "TS pin restore\n"); break; default: ret = -EINVAL; @@ -162,15 +141,6 @@ static int axp288_adc_read_raw(struct iio_dev *indio_dev, return ret; } -static int axp288_adc_set_state(struct regmap *regmap) -{ - /* ADC should be always enabled for internal FG to function */ - if (regmap_write(regmap, AXP288_ADC_TS_PIN_CTRL, AXP288_ADC_TS_PIN_ON)) - return -EIO; - - return regmap_write(regmap, AXP20X_ADC_EN1, AXP288_ADC_EN_MASK); -} - static const struct iio_info axp288_adc_iio_info = { .read_raw = &axp288_adc_read_raw, .driver_module = THIS_MODULE, @@ -199,7 +169,7 @@ static int axp288_adc_probe(struct platform_device *pdev) * Set ADC to enabled state at all time, including system suspend. * otherwise internal fuel gauge functionality may be affected. */ - ret = axp288_adc_set_state(axp20x->regmap); + ret = regmap_write(info->regmap, AXP20X_ADC_EN1, AXP288_ADC_EN_MASK); if (ret) { dev_err(&pdev->dev, "unable to enable ADC device\n"); return ret; From 2b7aec8839dfe6482b8f6e0b8be0951fc990e964 Mon Sep 17 00:00:00 2001 From: Andreas Klinger Date: Thu, 5 Jan 2017 18:51:36 +0100 Subject: [PATCH 174/555] iio: adc: hx711: Add DT binding for avia,hx711 [ Upstream commit ff1293f67734da68e23fecb6ecdae7112b8c43f9 ] Add DT bindings for avia,hx711 Add vendor avia to vendor list Signed-off-by: Andreas Klinger Acked-by: Rob Herring Signed-off-by: Jonathan Cameron Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- .../devicetree/bindings/iio/adc/avia-hx711.txt | 18 ++++++++++++++++++ .../devicetree/bindings/vendor-prefixes.txt | 1 + 2 files changed, 19 insertions(+) create mode 100644 Documentation/devicetree/bindings/iio/adc/avia-hx711.txt diff --git a/Documentation/devicetree/bindings/iio/adc/avia-hx711.txt b/Documentation/devicetree/bindings/iio/adc/avia-hx711.txt new file mode 100644 index 000000000000..b3629405f568 --- /dev/null +++ b/Documentation/devicetree/bindings/iio/adc/avia-hx711.txt @@ -0,0 +1,18 @@ +* AVIA HX711 ADC chip for weight cells + Bit-banging driver + +Required properties: + - compatible: Should be "avia,hx711" + - sck-gpios: Definition of the GPIO for the clock + - dout-gpios: Definition of the GPIO for data-out + See Documentation/devicetree/bindings/gpio/gpio.txt + - avdd-supply: Definition of the regulator used as analog supply + +Example: +weight@0 { + compatible = "avia,hx711"; + sck-gpios = <&gpio3 10 GPIO_ACTIVE_HIGH>; + dout-gpios = <&gpio0 7 GPIO_ACTIVE_HIGH>; + avdd-suppy = <&avdd>; +}; + diff --git a/Documentation/devicetree/bindings/vendor-prefixes.txt b/Documentation/devicetree/bindings/vendor-prefixes.txt index f0a48ea78659..bceffffb7502 100644 --- a/Documentation/devicetree/bindings/vendor-prefixes.txt +++ b/Documentation/devicetree/bindings/vendor-prefixes.txt @@ -38,6 +38,7 @@ atmel Atmel Corporation auo AU Optronics Corporation auvidea Auvidea GmbH avago Avago Technologies +avia avia semiconductor avic Shanghai AVIC Optoelectronics Co., Ltd. axis Axis Communications AB boe BOE Technology Group Co., Ltd. From 0081b9e7fcf75e65496f85896d2c3bb42e2a4dd7 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 10 Jan 2017 11:15:48 -0800 Subject: [PATCH 175/555] IB/rxe: Add a runtime check in alloc_index() [ Upstream commit 642c7cbcaf2ffc1e27f67eda3dc47347ac5aff37 ] Since index values equal to or above 'range' can trigger memory corruption, complain if index >= range. Signed-off-by: Bart Van Assche Reviewed-by: Andrew Boyer Cc: Moni Shoua Signed-off-by: Doug Ledford Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/sw/rxe/rxe_pool.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c index 6bac0717c540..ee26a1b1b4ed 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.c +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -274,6 +274,7 @@ static u32 alloc_index(struct rxe_pool *pool) if (index >= range) index = find_first_zero_bit(pool->table, range); + WARN_ON_ONCE(index >= range); set_bit(index, pool->table); pool->last = index; return index + pool->min_index; From e1c355c244b7045c60510e12429a66a1ae5763df Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 10 Jan 2017 11:15:51 -0800 Subject: [PATCH 176/555] IB/rxe: Fix a MR reference leak in check_rkey() [ Upstream commit b3a459961014b14c267544c327db033669493295 ] Avoid that calling check_rkey() for mem->state == RXE_MEM_STATE_FREE triggers an MR reference leak. Signed-off-by: Bart Van Assche Reviewed-by: Andrew Boyer Cc: Moni Shoua Signed-off-by: Doug Ledford Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/sw/rxe/rxe_resp.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index ccf624763565..69ed4e0d7a0d 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -418,7 +418,7 @@ static enum resp_states check_length(struct rxe_qp *qp, static enum resp_states check_rkey(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { - struct rxe_mem *mem; + struct rxe_mem *mem = NULL; u64 va; u32 rkey; u32 resid; @@ -452,38 +452,38 @@ static enum resp_states check_rkey(struct rxe_qp *qp, mem = lookup_mem(qp->pd, access, rkey, lookup_remote); if (!mem) { state = RESPST_ERR_RKEY_VIOLATION; - goto err1; + goto err; } if (unlikely(mem->state == RXE_MEM_STATE_FREE)) { state = RESPST_ERR_RKEY_VIOLATION; - goto err1; + goto err; } if (mem_check_range(mem, va, resid)) { state = RESPST_ERR_RKEY_VIOLATION; - goto err2; + goto err; } if (pkt->mask & RXE_WRITE_MASK) { if (resid > mtu) { if (pktlen != mtu || bth_pad(pkt)) { state = RESPST_ERR_LENGTH; - goto err2; + goto err; } qp->resp.resid = mtu; } else { if (pktlen != resid) { state = RESPST_ERR_LENGTH; - goto err2; + goto err; } if ((bth_pad(pkt) != (0x3 & (-resid)))) { /* This case may not be exactly that * but nothing else fits. */ state = RESPST_ERR_LENGTH; - goto err2; + goto err; } } } @@ -493,9 +493,9 @@ static enum resp_states check_rkey(struct rxe_qp *qp, qp->resp.mr = mem; return RESPST_EXECUTE; -err2: - rxe_drop_ref(mem); -err1: +err: + if (mem) + rxe_drop_ref(mem); return state; } From e92dca6f5a14310a950b9ca040188d0ce2ba08c9 Mon Sep 17 00:00:00 2001 From: Afzal Mohammed Date: Sat, 7 Jan 2017 17:48:10 +0100 Subject: [PATCH 177/555] ARM: 8635/1: nommu: allow enabling REMAP_VECTORS_TO_RAM [ Upstream commit 8a792e9afbce84a0fdaf213fe42bb97382487094 ] REMAP_VECTORS_TO_RAM depends on DRAM_BASE, but since DRAM_BASE is a hex, REMAP_VECTORS_TO_RAM could never get enabled. Also depending on DRAM_BASE is redundant as whenever REMAP_VECTORS_TO_RAM makes itself available to Kconfig, DRAM_BASE also is available as the Kconfig gets sourced on !MMU. Signed-off-by: Afzal Mohammed Reviewed-by: Vladimir Murzin Signed-off-by: Russell King Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/arm/Kconfig-nommu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm/Kconfig-nommu b/arch/arm/Kconfig-nommu index aed66d5df7f1..b7576349528c 100644 --- a/arch/arm/Kconfig-nommu +++ b/arch/arm/Kconfig-nommu @@ -34,8 +34,7 @@ config PROCESSOR_ID used instead of the auto-probing which utilizes the register. config REMAP_VECTORS_TO_RAM - bool 'Install vectors to the beginning of RAM' if DRAM_BASE - depends on DRAM_BASE + bool 'Install vectors to the beginning of RAM' help The kernel needs to change the hardware exception vectors. In nommu mode, the hardware exception vectors are normally From d976d68e17269767a33b4646add0c41a40e7eed5 Mon Sep 17 00:00:00 2001 From: "Nagaraju, Vathsala" Date: Tue, 10 Jan 2017 12:32:26 +0530 Subject: [PATCH 178/555] drm/i915/psr: disable psr2 for resolution greater than 32X20 [ Upstream commit acf45d11050abd751dcec986ab121cb2367dcbba ] PSR2 is restricted to work with panel resolutions upto 3200x2000, move the check to intel_psr_match_conditions and fully block psr. Cc: Rodrigo Vivi Cc: Jim Bride Suggested-by: Rodrigo Vivi Signed-off-by: Vathsala Nagaraju Reviewed-by: Rodrigo Vivi Signed-off-by: Rodrigo Vivi Link: http://patchwork.freedesktop.org/patch/msgid/1484031746-20874-1-git-send-email-vathsala.nagaraju@intel.com Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/intel_psr.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_psr.c b/drivers/gpu/drm/i915/intel_psr.c index 9b307cee3008..dff478498f05 100644 --- a/drivers/gpu/drm/i915/intel_psr.c +++ b/drivers/gpu/drm/i915/intel_psr.c @@ -387,6 +387,13 @@ static bool intel_psr_match_conditions(struct intel_dp *intel_dp) return false; } + /* PSR2 is restricted to work with panel resolutions upto 3200x2000 */ + if (intel_crtc->config->pipe_src_w > 3200 || + intel_crtc->config->pipe_src_h > 2000) { + dev_priv->psr.psr2_support = false; + return false; + } + dev_priv->psr.source_ok = true; return true; } @@ -425,7 +432,6 @@ void intel_psr_enable(struct intel_dp *intel_dp) struct intel_digital_port *intel_dig_port = dp_to_dig_port(intel_dp); struct drm_device *dev = intel_dig_port->base.base.dev; struct drm_i915_private *dev_priv = to_i915(dev); - struct intel_crtc *crtc = to_intel_crtc(intel_dig_port->base.base.crtc); if (!HAS_PSR(dev)) { DRM_DEBUG_KMS("PSR not supported on this platform\n"); @@ -452,12 +458,7 @@ void intel_psr_enable(struct intel_dp *intel_dp) hsw_psr_setup_vsc(intel_dp); if (dev_priv->psr.psr2_support) { - /* PSR2 is restricted to work with panel resolutions upto 3200x2000 */ - if (crtc->config->pipe_src_w > 3200 || - crtc->config->pipe_src_h > 2000) - dev_priv->psr.psr2_support = false; - else - skl_psr_setup_su_vsc(intel_dp); + skl_psr_setup_su_vsc(intel_dp); } /* From 5d29957578aea3a5d84baf2e5d863f4f5be24222 Mon Sep 17 00:00:00 2001 From: "Matwey V. Kornilov" Date: Thu, 29 Dec 2016 21:48:51 +0300 Subject: [PATCH 179/555] serial: 8250: moxa: Store num_ports in brd [ Upstream commit 9c4b60fe5313c125b1bf68ef04b0010512c27f2d ] When struct moxa8250_board is allocated, then num_ports should be initialized in order to use it later in moxa8250_remove. Signed-off-by: Matwey V. Kornilov Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_moxa.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/tty/serial/8250/8250_moxa.c b/drivers/tty/serial/8250/8250_moxa.c index 26eb5393a263..d5069b2d4d79 100644 --- a/drivers/tty/serial/8250/8250_moxa.c +++ b/drivers/tty/serial/8250/8250_moxa.c @@ -68,6 +68,7 @@ static int moxa8250_probe(struct pci_dev *pdev, const struct pci_device_id *id) sizeof(unsigned int) * nr_ports, GFP_KERNEL); if (!brd) return -ENOMEM; + brd->num_ports = nr_ports; memset(&uart, 0, sizeof(struct uart_8250_port)); From ca3e4e77201a8e4bb93b2fef4847f2b59ee23f7a Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 9 Jan 2017 01:26:37 +0100 Subject: [PATCH 180/555] tty: goldfish: Fix a parameter of a call to free_irq [ Upstream commit 1a5c2d1de7d35f5eb9793266237903348989502b ] 'request_irq()' and 'free_irq()' should be called with the same dev_id. Signed-off-by: Christophe JAILLET Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/tty/goldfish.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/goldfish.c b/drivers/tty/goldfish.c index 3fc912373adf..996bd473dd03 100644 --- a/drivers/tty/goldfish.c +++ b/drivers/tty/goldfish.c @@ -300,7 +300,7 @@ static int goldfish_tty_probe(struct platform_device *pdev) return 0; err_tty_register_device_failed: - free_irq(irq, pdev); + free_irq(irq, qtty); err_request_irq_failed: goldfish_tty_current_line_count--; if (goldfish_tty_current_line_count == 0) From 6c25cbaff1e9d68e808818526cfabe14aa1d6a50 Mon Sep 17 00:00:00 2001 From: Alexey Brodkin Date: Tue, 10 Jan 2017 18:11:29 +0300 Subject: [PATCH 181/555] serial: 8250_port: Remove dangerous pr_debug() [ Upstream commit 699a11ba7ec869b006623182881f2f1f5b4aea53 ] With CONFIG_DYNAMIC_DEBUG if dyndbg enables debug output in 8250_port.c deadlock happens inevitably on UART IRQ handling. That's the problematic execution path: ---------------------------->8------------------------ UART IRQ: serial8250_interrupt() -> serial8250_handle_irq(): lock "port->lock" -> pr_debug() -> serial8250_console_write(): bump in locked "port->lock". OR (if above pr_debug() gets removed): serial8250_tx_chars() -> pr_debug() -> serial8250_console_write(): bump in locked "port->lock". ---------------------------->8------------------------ So let's get rid of those not that much useful debug entries. Discussed problem could be easily reproduced with QEMU for x86_64. As well as this fix could be mimicked with muting of dynamic debug for the problematic lines as simple as: ---------------------------->8------------------------ dyndbg="+p; file 8250_port.c line 1756 -p; file 8250_port.c line 1822 -p" ---------------------------->8------------------------ Signed-off-by: Alexey Brodkin Cc: Jiri Slaby Cc: Peter Hurley Cc: Phillip Raffeck Cc: Anton Wuerfel Cc: "Matwey V. Kornilov" Cc: Yegor Yefremov Cc: Thor Thayer Reviewed-by: Andy Shevchenko Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_port.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index f24d3030b98c..1ef31e3ee4a1 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -1751,8 +1751,6 @@ void serial8250_tx_chars(struct uart_8250_port *up) if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) uart_write_wakeup(port); - pr_debug("%s: THRE\n", __func__); - /* * With RPM enabled, we have to wait until the FIFO is empty before the * HW can go idle. So we get here once again with empty FIFO and disable @@ -1817,8 +1815,6 @@ int serial8250_handle_irq(struct uart_port *port, unsigned int iir) status = serial_port_in(port, UART_LSR); - pr_debug("%s: status = %x\n", __func__, status); - if (status & (UART_LSR_DR | UART_LSR_BI)) { if (!up->dma || handle_rx_dma(up, iir)) status = serial8250_rx_chars(up, status); From e384bbd585ee1b39c9b4da9141bc7c7215d9c224 Mon Sep 17 00:00:00 2001 From: Feras Daoud Date: Wed, 28 Dec 2016 14:47:22 +0200 Subject: [PATCH 182/555] IB/ipoib: Fix deadlock over vlan_mutex [ Upstream commit 1c3098cdb05207e740715857df7b0998e372f527 ] This patch fixes Deadlock while executing ipoib_vlan_delete. The function takes the vlan_rwsem semaphore and calls unregister_netdevice. The later function calls ipoib_mcast_stop_thread that cause workqueue flush. When the queue has one of the ipoib_ib_dev_flush_xxx events, a deadlock occur because these events also tries to catch the same vlan_rwsem semaphore. To fix, unregister_netdevice should be called after releasing the semaphore. Fixes: cbbe1efa4972 ("IPoIB: Fix deadlock between ipoib_open() and child interface create") Signed-off-by: Feras Daoud Signed-off-by: Erez Shitrit Reviewed-by: Alex Vesker Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/ulp/ipoib/ipoib_vlan.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index 57eadd2b7a71..8151d1a6acb1 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -193,7 +193,6 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) list_for_each_entry_safe(priv, tpriv, &ppriv->child_intfs, list) { if (priv->pkey == pkey && priv->child_type == IPOIB_LEGACY_CHILD) { - unregister_netdevice(priv->dev); list_del(&priv->list); dev = priv->dev; break; @@ -201,6 +200,11 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) } up_write(&ppriv->vlan_rwsem); + if (dev) { + ipoib_dbg(ppriv, "delete child vlan %s\n", dev->name); + unregister_netdevice(dev); + } + rtnl_unlock(); if (dev) { From e335016d1f627926a0b0cbeedca059040e7ba64b Mon Sep 17 00:00:00 2001 From: Feras Daoud Date: Wed, 28 Dec 2016 14:47:24 +0200 Subject: [PATCH 183/555] IB/ipoib: rtnl_unlock can not come after free_netdev [ Upstream commit 89a3987ab7a923c047c6dec008e60ad6f41fac22 ] The ipoib_vlan_add function calls rtnl_unlock after free_netdev, rtnl_unlock not only releases the lock, but also calls netdev_run_todo. The latter function browses the net_todo_list array and completes the unregistration of all its net_device instances. If we call free_netdev before rtnl_unlock, then netdev_run_todo call over the freed device causes panic. To fix, move rtnl_unlock call before free_netdev call. Fixes: 9baa0b036410 ("IB/ipoib: Add rtnl_link_ops support") Cc: Or Gerlitz Signed-off-by: Feras Daoud Signed-off-by: Erez Shitrit Reviewed-by: Yuval Shaia Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/ulp/ipoib/ipoib_vlan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index 8151d1a6acb1..93b50be14438 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -165,11 +165,11 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) out: up_write(&ppriv->vlan_rwsem); + rtnl_unlock(); + if (result) free_netdev(priv->dev); - rtnl_unlock(); - return result; } From b2e7d1f72b09d18479defc0ab9e709c4b3d21caf Mon Sep 17 00:00:00 2001 From: Feras Daoud Date: Wed, 28 Dec 2016 14:47:27 +0200 Subject: [PATCH 184/555] IB/ipoib: Replace list_del of the neigh->list with list_del_init [ Upstream commit c586071d1dc8227a7182179b8e50ee92cc43f6d2 ] In order to resolve a situation where a few process delete the same list element in sequence and cause panic, list_del is replaced with list_del_init. In this case if the first process that calls list_del releases the lock before acquiring it again, other processes who can acquire the lock will call list_del_init. Fixes: b63b70d87741 ("IPoIB: Use a private hash table for path lookup") Signed-off-by: Feras Daoud Signed-off-by: Erez Shitrit Reviewed-by: Alex Vesker Signed-off-by: Leon Romanovsky Reviewed-by: Yuval Shaia Signed-off-by: Doug Ledford Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 08c4b0287304..183db0cd849e 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1302,7 +1302,7 @@ static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv) rcu_dereference_protected(neigh->hnext, lockdep_is_held(&priv->lock))); /* remove from path/mc list */ - list_del(&neigh->list); + list_del_init(&neigh->list); call_rcu(&neigh->rcu, ipoib_neigh_reclaim); } else { np = &neigh->hnext; @@ -1466,7 +1466,7 @@ void ipoib_neigh_free(struct ipoib_neigh *neigh) rcu_dereference_protected(neigh->hnext, lockdep_is_held(&priv->lock))); /* remove from parent list */ - list_del(&neigh->list); + list_del_init(&neigh->list); call_rcu(&neigh->rcu, ipoib_neigh_reclaim); return; } else { @@ -1551,7 +1551,7 @@ void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid) rcu_dereference_protected(neigh->hnext, lockdep_is_held(&priv->lock))); /* remove from parent list */ - list_del(&neigh->list); + list_del_init(&neigh->list); call_rcu(&neigh->rcu, ipoib_neigh_reclaim); } else { np = &neigh->hnext; @@ -1593,7 +1593,7 @@ static void ipoib_flush_neighs(struct ipoib_dev_priv *priv) rcu_dereference_protected(neigh->hnext, lockdep_is_held(&priv->lock))); /* remove from path/mc list */ - list_del(&neigh->list); + list_del_init(&neigh->list); call_rcu(&neigh->rcu, ipoib_neigh_reclaim); } } From 68b94d6c4edb74bd71d539dffc0bd0ca8a16dea4 Mon Sep 17 00:00:00 2001 From: James Liao Date: Wed, 28 Dec 2016 13:46:45 +0800 Subject: [PATCH 185/555] arm: dts: mt2701: Add subsystem clock controller device nodes [ Upstream commit f235c7e7a75325f28a33559a71f25a0eca6112db ] Add MT2701 subsystem clock controllers, inlcude mmsys, imgsys, vdecsys, hifsys, ethsys and bdpsys. Signed-off-by: James Liao Signed-off-by: Matthias Brugger Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/arm/boot/dts/mt2701.dtsi | 36 +++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/arch/arm/boot/dts/mt2701.dtsi b/arch/arm/boot/dts/mt2701.dtsi index 18596a2c58a1..77c6b931dc24 100644 --- a/arch/arm/boot/dts/mt2701.dtsi +++ b/arch/arm/boot/dts/mt2701.dtsi @@ -174,4 +174,40 @@ clocks = <&uart_clk>; status = "disabled"; }; + + mmsys: syscon@14000000 { + compatible = "mediatek,mt2701-mmsys", "syscon"; + reg = <0 0x14000000 0 0x1000>; + #clock-cells = <1>; + }; + + imgsys: syscon@15000000 { + compatible = "mediatek,mt2701-imgsys", "syscon"; + reg = <0 0x15000000 0 0x1000>; + #clock-cells = <1>; + }; + + vdecsys: syscon@16000000 { + compatible = "mediatek,mt2701-vdecsys", "syscon"; + reg = <0 0x16000000 0 0x1000>; + #clock-cells = <1>; + }; + + hifsys: syscon@1a000000 { + compatible = "mediatek,mt2701-hifsys", "syscon"; + reg = <0 0x1a000000 0 0x1000>; + #clock-cells = <1>; + }; + + ethsys: syscon@1b000000 { + compatible = "mediatek,mt2701-ethsys", "syscon"; + reg = <0 0x1b000000 0 0x1000>; + #clock-cells = <1>; + }; + + bdpsys: syscon@1c000000 { + compatible = "mediatek,mt2701-bdpsys", "syscon"; + reg = <0 0x1c000000 0 0x1000>; + #clock-cells = <1>; + }; }; From 09831a9577669fa16f4369a934e88d8ff644cfe0 Mon Sep 17 00:00:00 2001 From: Pan Bian Date: Thu, 1 Dec 2016 16:10:42 +0800 Subject: [PATCH 186/555] drm/amdkfd: fix improper return value on error [ Upstream commit 8bf793883da213864efc50c274d2b38ec0ca58b2 ] In function kfd_wait_on_events(), when the call to copy_from_user() fails, the value of return variable ret is 0. 0 indicates success, which is inconsistent with the execution status. This patch fixes the bug by assigning "-EFAULT" to ret when copy_from_user() returns an unexpected value. Signed-off-by: Pan Bian Signed-off-by: Oded Gabbay Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/amdkfd/kfd_events.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index a6a4b2b1c0d9..6a3470f84998 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -739,8 +739,10 @@ int kfd_wait_on_events(struct kfd_process *p, struct kfd_event_data event_data; if (copy_from_user(&event_data, &events[i], - sizeof(struct kfd_event_data))) + sizeof(struct kfd_event_data))) { + ret = -EFAULT; goto fail; + } ret = init_event_waiter(p, &event_waiters[i], event_data.event_id, i); From 9553708eb98dd81357eb9be51b6317387d2e7ac4 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 12 Jan 2017 14:56:17 +0100 Subject: [PATCH 187/555] USB: serial: mos7720: fix control-message error handling [ Upstream commit 0d130367abf582e7cbf60075c2a7ab53817b1d14 ] Make sure to log an error on short transfers when reading a device register. Also clear the provided buffer (which if often an uninitialised automatic variable) on errors as the driver currently does not bother to check for errors. Reviewed-by: Greg Kroah-Hartman Signed-off-by: Johan Hovold Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/mos7720.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/usb/serial/mos7720.c b/drivers/usb/serial/mos7720.c index 136ff5e1b7c1..135eb04368f9 100644 --- a/drivers/usb/serial/mos7720.c +++ b/drivers/usb/serial/mos7720.c @@ -234,11 +234,16 @@ static int read_mos_reg(struct usb_serial *serial, unsigned int serial_portnum, status = usb_control_msg(usbdev, pipe, request, requesttype, value, index, buf, 1, MOS_WDR_TIMEOUT); - if (status == 1) + if (status == 1) { *data = *buf; - else if (status < 0) + } else { dev_err(&usbdev->dev, "mos7720: usb_control_msg() failed: %d\n", status); + if (status >= 0) + status = -EIO; + *data = 0; + } + kfree(buf); return status; From c6d263e6b30af4040a61bcceef2fdae33711ad0d Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 12 Jan 2017 14:56:18 +0100 Subject: [PATCH 188/555] USB: serial: mos7840: fix control-message error handling [ Upstream commit cd8db057e93ddaacbec025b567490555d2bca280 ] Make sure to detect short transfers when reading a device register. The modem-status handling had sufficient error checks in place, but move handling of short transfers into the register accessor function itself for consistency. Reviewed-by: Greg Kroah-Hartman Signed-off-by: Johan Hovold Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/mos7840.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/usb/serial/mos7840.c b/drivers/usb/serial/mos7840.c index 5c4fc3abf6a7..6baacf64b21e 100644 --- a/drivers/usb/serial/mos7840.c +++ b/drivers/usb/serial/mos7840.c @@ -285,9 +285,15 @@ static int mos7840_get_reg_sync(struct usb_serial_port *port, __u16 reg, ret = usb_control_msg(dev, usb_rcvctrlpipe(dev, 0), MCS_RDREQ, MCS_RD_RTYPE, 0, reg, buf, VENDOR_READ_LENGTH, MOS_WDR_TIMEOUT); + if (ret < VENDOR_READ_LENGTH) { + if (ret >= 0) + ret = -EIO; + goto out; + } + *val = buf[0]; dev_dbg(&port->dev, "%s offset is %x, return val %x\n", __func__, reg, *val); - +out: kfree(buf); return ret; } @@ -353,8 +359,13 @@ static int mos7840_get_uart_reg(struct usb_serial_port *port, __u16 reg, ret = usb_control_msg(dev, usb_rcvctrlpipe(dev, 0), MCS_RDREQ, MCS_RD_RTYPE, Wval, reg, buf, VENDOR_READ_LENGTH, MOS_WDR_TIMEOUT); + if (ret < VENDOR_READ_LENGTH) { + if (ret >= 0) + ret = -EIO; + goto out; + } *val = buf[0]; - +out: kfree(buf); return ret; } @@ -1490,10 +1501,10 @@ static int mos7840_tiocmget(struct tty_struct *tty) return -ENODEV; status = mos7840_get_uart_reg(port, MODEM_STATUS_REGISTER, &msr); - if (status != 1) + if (status < 0) return -EIO; status = mos7840_get_uart_reg(port, MODEM_CONTROL_REGISTER, &mcr); - if (status != 1) + if (status < 0) return -EIO; result = ((mcr & MCR_DTR) ? TIOCM_DTR : 0) | ((mcr & MCR_RTS) ? TIOCM_RTS : 0) From eaf9616e406c05c30bb7366be4aae631a715c7ae Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Fri, 13 Jan 2017 21:20:29 +0000 Subject: [PATCH 189/555] sfc: get PIO buffer size from the NIC [ Upstream commit c634700f7eec3c0da46e299cd0a0ae8b594f9b55 ] The 8000 series SFC NICs have 4K PIO buffers, rather than the 2K of the 7000 series. Rather than having a hard-coded PIO buffer size (ER_DZ_TX_PIOBUF_SIZE), read it from the GET_CAPABILITIES_V2 MCDI response. Signed-off-by: Edward Cree Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/sfc/ef10.c | 16 ++++++++++------ drivers/net/ethernet/sfc/nic.h | 2 ++ drivers/net/ethernet/sfc/tx.c | 1 - 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c index c4ada7227f40..1d85109cb8ed 100644 --- a/drivers/net/ethernet/sfc/ef10.c +++ b/drivers/net/ethernet/sfc/ef10.c @@ -197,11 +197,15 @@ static int efx_ef10_init_datapath_caps(struct efx_nic *efx) nic_data->datapath_caps = MCDI_DWORD(outbuf, GET_CAPABILITIES_OUT_FLAGS1); - if (outlen >= MC_CMD_GET_CAPABILITIES_V2_OUT_LEN) + if (outlen >= MC_CMD_GET_CAPABILITIES_V2_OUT_LEN) { nic_data->datapath_caps2 = MCDI_DWORD(outbuf, GET_CAPABILITIES_V2_OUT_FLAGS2); - else + nic_data->piobuf_size = MCDI_WORD(outbuf, + GET_CAPABILITIES_V2_OUT_SIZE_PIO_BUFF); + } else { nic_data->datapath_caps2 = 0; + nic_data->piobuf_size = ER_DZ_TX_PIOBUF_SIZE; + } /* record the DPCPU firmware IDs to determine VEB vswitching support. */ @@ -825,8 +829,8 @@ static int efx_ef10_link_piobufs(struct efx_nic *efx) offset = ((efx->tx_channel_offset + efx->n_tx_channels - tx_queue->channel->channel - 1) * efx_piobuf_size); - index = offset / ER_DZ_TX_PIOBUF_SIZE; - offset = offset % ER_DZ_TX_PIOBUF_SIZE; + index = offset / nic_data->piobuf_size; + offset = offset % nic_data->piobuf_size; /* When the host page size is 4K, the first * host page in the WC mapping may be within @@ -1161,11 +1165,11 @@ static int efx_ef10_dimension_resources(struct efx_nic *efx) * functions of the controller. */ if (efx_piobuf_size != 0 && - ER_DZ_TX_PIOBUF_SIZE / efx_piobuf_size * EF10_TX_PIOBUF_COUNT >= + nic_data->piobuf_size / efx_piobuf_size * EF10_TX_PIOBUF_COUNT >= efx->n_tx_channels) { unsigned int n_piobufs = DIV_ROUND_UP(efx->n_tx_channels, - ER_DZ_TX_PIOBUF_SIZE / efx_piobuf_size); + nic_data->piobuf_size / efx_piobuf_size); rc = efx_ef10_alloc_piobufs(efx, n_piobufs); if (rc) diff --git a/drivers/net/ethernet/sfc/nic.h b/drivers/net/ethernet/sfc/nic.h index 73bee7ea332a..73028f21a2d7 100644 --- a/drivers/net/ethernet/sfc/nic.h +++ b/drivers/net/ethernet/sfc/nic.h @@ -500,6 +500,7 @@ enum { * @pio_write_base: Base address for writing PIO buffers * @pio_write_vi_base: Relative VI number for @pio_write_base * @piobuf_handle: Handle of each PIO buffer allocated + * @piobuf_size: size of a single PIO buffer * @must_restore_piobufs: Flag: PIO buffers have yet to be restored after MC * reboot * @rx_rss_context: Firmware handle for our RSS context @@ -537,6 +538,7 @@ struct efx_ef10_nic_data { void __iomem *wc_membase, *pio_write_base; unsigned int pio_write_vi_base; unsigned int piobuf_handle[EF10_TX_PIOBUF_COUNT]; + u16 piobuf_size; bool must_restore_piobufs; u32 rx_rss_context; bool rx_rss_context_exclusive; diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c index 233778911557..6f26acd0aa61 100644 --- a/drivers/net/ethernet/sfc/tx.c +++ b/drivers/net/ethernet/sfc/tx.c @@ -27,7 +27,6 @@ #ifdef EFX_USE_PIO -#define EFX_PIOBUF_SIZE_MAX ER_DZ_TX_PIOBUF_SIZE #define EFX_PIOBUF_SIZE_DEF ALIGN(256, L1_CACHE_BYTES) unsigned int efx_piobuf_size __read_mostly = EFX_PIOBUF_SIZE_DEF; From 0fc89de6ee777cf48a291505164da2c5eef17311 Mon Sep 17 00:00:00 2001 From: Alden Tondettar Date: Sun, 15 Jan 2017 15:31:56 -0700 Subject: [PATCH 190/555] partitions/efi: Fix integer overflow in GPT size calculation [ Upstream commit c5082b70adfe8e1ea1cf4a8eff92c9f260e364d2 ] If a GUID Partition Table claims to have more than 2**25 entries, the calculation of the partition table size in alloc_read_gpt_entries() will overflow a 32-bit integer and not enough space will be allocated for the table. Nothing seems to get written out of bounds, but later efi_partition() will read up to 32768 bytes from a 128 byte buffer, possibly OOPSing or exposing information to /proc/partitions and uevents. The problem exists on both 64-bit and 32-bit platforms. Fix the overflow and also print a meaningful debug message if the table size is too large. Signed-off-by: Alden Tondettar Acked-by: Ard Biesheuvel Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- block/partitions/efi.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/block/partitions/efi.c b/block/partitions/efi.c index bcd86e5cd546..39f70d968754 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -293,7 +293,7 @@ static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state, if (!gpt) return NULL; - count = le32_to_cpu(gpt->num_partition_entries) * + count = (size_t)le32_to_cpu(gpt->num_partition_entries) * le32_to_cpu(gpt->sizeof_partition_entry); if (!count) return NULL; @@ -352,7 +352,7 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba, gpt_header **gpt, gpt_entry **ptes) { u32 crc, origcrc; - u64 lastlba; + u64 lastlba, pt_size; if (!ptes) return 0; @@ -434,13 +434,20 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba, goto fail; } + /* Sanity check partition table size */ + pt_size = (u64)le32_to_cpu((*gpt)->num_partition_entries) * + le32_to_cpu((*gpt)->sizeof_partition_entry); + if (pt_size > KMALLOC_MAX_SIZE) { + pr_debug("GUID Partition Table is too large: %llu > %lu bytes\n", + (unsigned long long)pt_size, KMALLOC_MAX_SIZE); + goto fail; + } + if (!(*ptes = alloc_read_gpt_entries(state, *gpt))) goto fail; /* Check the GUID Partition Entry Array CRC */ - crc = efi_crc32((const unsigned char *) (*ptes), - le32_to_cpu((*gpt)->num_partition_entries) * - le32_to_cpu((*gpt)->sizeof_partition_entry)); + crc = efi_crc32((const unsigned char *) (*ptes), pt_size); if (crc != le32_to_cpu((*gpt)->partition_entry_array_crc32)) { pr_debug("GUID Partition Entry Array CRC check failed.\n"); From de415c812ec9e8d44d2548f304e6da018121c000 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 13 Jan 2017 10:23:52 +0100 Subject: [PATCH 191/555] ASoC: dapm: handle probe deferrals [ Upstream commit 37e1df8c95e2c8a57c77eafc097648f6e40a60ff ] This starts to handle probe deferrals on regulators and clocks on the ASoC DAPM. I came to this patch after audio stopped working on Ux500 ages ago and I finally looked into it to see what is wrong. I had messages like this in the console since a while back: ab8500-codec.0: ASoC: Failed to request audioclk: -517 ab8500-codec.0: ASoC: Failed to create DAPM control audioclk ab8500-codec.0: Failed to create new controls -12 snd-soc-mop500.0: ASoC: failed to instantiate card -12 snd-soc-mop500.0: Error: snd_soc_register_card failed (-12)! snd-soc-mop500: probe of snd-soc-mop500.0 failed with error -12 Apparently because the widget table for the codec looks like this (sound/soc/codecs/ab8500-codec.c): static const struct snd_soc_dapm_widget ab8500_dapm_widgets[] = { /* Clocks */ SND_SOC_DAPM_CLOCK_SUPPLY("audioclk"), /* Regulators */ SND_SOC_DAPM_REGULATOR_SUPPLY("V-AUD", 0, 0), SND_SOC_DAPM_REGULATOR_SUPPLY("V-AMIC1", 0, 0), SND_SOC_DAPM_REGULATOR_SUPPLY("V-AMIC2", 0, 0), SND_SOC_DAPM_REGULATOR_SUPPLY("V-DMIC", 0, 0), So when we call snd_soc_register_codec() and any of these widgets get a deferred probe we do not get an -EPROBE_DEFER (-517) back as we should and instead we just fail. Apparently the code assumes that clocks and regulators must be available at this point and not defer. After this patch it rather looks like this: ab8500-codec.0: Failed to create new controls -517 snd-soc-mop500.0: ASoC: failed to instantiate card -517 snd-soc-mop500.0: Error: snd_soc_register_card failed (-517)! (...) abx500-clk.0: registered clocks for ab850x snd-soc-mop500.0: ab8500-codec-dai.0 <-> ux500-msp-i2s.1 mapping ok snd-soc-mop500.0: ab8500-codec-dai.1 <-> ux500-msp-i2s.3 mapping ok I'm pretty happy about the patch as it it, but I'm a bit uncertain on how to proceed: there are a lot of users of the external functions snd_soc_dapm_new_control() (111 sites) and that will now return an occassional error pointer, which is not handled in the calling sites. I want an indication from the maintainers whether I should just go in and augment all these call sites, or if deferred probe is frowned upon when it leads to this much overhead. Signed-off-by: Linus Walleij Signed-off-by: Mark Brown Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- sound/soc/soc-dapm.c | 42 ++++++++++++++++++++++++++++++++++++++++ sound/soc/soc-topology.c | 9 +++++++++ 2 files changed, 51 insertions(+) diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index 3bbe32ee4630..411f7574dd0b 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -358,6 +358,10 @@ static int dapm_kcontrol_data_alloc(struct snd_soc_dapm_widget *widget, snd_soc_dapm_new_control_unlocked(widget->dapm, &template); kfree(name); + if (IS_ERR(data->widget)) { + ret = PTR_ERR(data->widget); + goto err_data; + } if (!data->widget) { ret = -ENOMEM; goto err_data; @@ -392,6 +396,10 @@ static int dapm_kcontrol_data_alloc(struct snd_soc_dapm_widget *widget, data->widget = snd_soc_dapm_new_control_unlocked( widget->dapm, &template); kfree(name); + if (IS_ERR(data->widget)) { + ret = PTR_ERR(data->widget); + goto err_data; + } if (!data->widget) { ret = -ENOMEM; goto err_data; @@ -3311,11 +3319,22 @@ snd_soc_dapm_new_control(struct snd_soc_dapm_context *dapm, mutex_lock_nested(&dapm->card->dapm_mutex, SND_SOC_DAPM_CLASS_RUNTIME); w = snd_soc_dapm_new_control_unlocked(dapm, widget); + /* Do not nag about probe deferrals */ + if (IS_ERR(w)) { + int ret = PTR_ERR(w); + + if (ret != -EPROBE_DEFER) + dev_err(dapm->dev, + "ASoC: Failed to create DAPM control %s (%d)\n", + widget->name, ret); + goto out_unlock; + } if (!w) dev_err(dapm->dev, "ASoC: Failed to create DAPM control %s\n", widget->name); +out_unlock: mutex_unlock(&dapm->card->dapm_mutex); return w; } @@ -3338,6 +3357,8 @@ snd_soc_dapm_new_control_unlocked(struct snd_soc_dapm_context *dapm, w->regulator = devm_regulator_get(dapm->dev, w->name); if (IS_ERR(w->regulator)) { ret = PTR_ERR(w->regulator); + if (ret == -EPROBE_DEFER) + return ERR_PTR(ret); dev_err(dapm->dev, "ASoC: Failed to request %s: %d\n", w->name, ret); return NULL; @@ -3356,6 +3377,8 @@ snd_soc_dapm_new_control_unlocked(struct snd_soc_dapm_context *dapm, w->clk = devm_clk_get(dapm->dev, w->name); if (IS_ERR(w->clk)) { ret = PTR_ERR(w->clk); + if (ret == -EPROBE_DEFER) + return ERR_PTR(ret); dev_err(dapm->dev, "ASoC: Failed to request %s: %d\n", w->name, ret); return NULL; @@ -3474,6 +3497,16 @@ int snd_soc_dapm_new_controls(struct snd_soc_dapm_context *dapm, mutex_lock_nested(&dapm->card->dapm_mutex, SND_SOC_DAPM_CLASS_INIT); for (i = 0; i < num; i++) { w = snd_soc_dapm_new_control_unlocked(dapm, widget); + if (IS_ERR(w)) { + ret = PTR_ERR(w); + /* Do not nag about probe deferrals */ + if (ret == -EPROBE_DEFER) + break; + dev_err(dapm->dev, + "ASoC: Failed to create DAPM control %s (%d)\n", + widget->name, ret); + break; + } if (!w) { dev_err(dapm->dev, "ASoC: Failed to create DAPM control %s\n", @@ -3750,6 +3783,15 @@ int snd_soc_dapm_new_pcm(struct snd_soc_card *card, dev_dbg(card->dev, "ASoC: adding %s widget\n", link_name); w = snd_soc_dapm_new_control_unlocked(&card->dapm, &template); + if (IS_ERR(w)) { + ret = PTR_ERR(w); + /* Do not nag about probe deferrals */ + if (ret != -EPROBE_DEFER) + dev_err(card->dev, + "ASoC: Failed to create %s widget (%d)\n", + link_name, ret); + goto outfree_kcontrol_news; + } if (!w) { dev_err(card->dev, "ASoC: Failed to create %s widget\n", link_name); diff --git a/sound/soc/soc-topology.c b/sound/soc/soc-topology.c index 6b05047a4134..8a758c994506 100644 --- a/sound/soc/soc-topology.c +++ b/sound/soc/soc-topology.c @@ -1473,6 +1473,15 @@ widget: widget = snd_soc_dapm_new_control(dapm, &template); else widget = snd_soc_dapm_new_control_unlocked(dapm, &template); + if (IS_ERR(widget)) { + ret = PTR_ERR(widget); + /* Do not nag about probe deferrals */ + if (ret != -EPROBE_DEFER) + dev_err(tplg->dev, + "ASoC: failed to create widget %s controls (%d)\n", + w->name, ret); + goto hdr_err; + } if (widget == NULL) { dev_err(tplg->dev, "ASoC: failed to create widget %s controls\n", w->name); From 91e66498a96a1cfac4b7c0c82b0027232856623c Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Tue, 17 Jan 2017 11:07:15 -0500 Subject: [PATCH 192/555] audit: log 32-bit socketcalls [ Upstream commit 62bc306e2083436675e33b5bdeb6a77907d35971 ] 32-bit socketcalls were not being logged by audit on x86_64 systems. Log them. This is basically a duplicate of the call from net/socket.c:sys_socketcall(), but it addresses the impedance mismatch between 32-bit userspace process and 64-bit kernel audit. See: https://github.com/linux-audit/audit-kernel/issues/14 Signed-off-by: Richard Guy Briggs Acked-by: David S. Miller Signed-off-by: Paul Moore Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- include/linux/audit.h | 20 ++++++++++++++++++++ net/compat.c | 17 ++++++++++++++--- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/include/linux/audit.h b/include/linux/audit.h index 9d4443f93db6..2be99b276d29 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -387,6 +387,20 @@ static inline int audit_socketcall(int nargs, unsigned long *args) return __audit_socketcall(nargs, args); return 0; } + +static inline int audit_socketcall_compat(int nargs, u32 *args) +{ + unsigned long a[AUDITSC_ARGS]; + int i; + + if (audit_dummy_context()) + return 0; + + for (i = 0; i < nargs; i++) + a[i] = (unsigned long)args[i]; + return __audit_socketcall(nargs, a); +} + static inline int audit_sockaddr(int len, void *addr) { if (unlikely(!audit_dummy_context())) @@ -513,6 +527,12 @@ static inline int audit_socketcall(int nargs, unsigned long *args) { return 0; } + +static inline int audit_socketcall_compat(int nargs, u32 *args) +{ + return 0; +} + static inline void audit_fd_pair(int fd1, int fd2) { } static inline int audit_sockaddr(int len, void *addr) diff --git a/net/compat.c b/net/compat.c index 1cd2ec046164..a96fd2f3507b 100644 --- a/net/compat.c +++ b/net/compat.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -781,14 +782,24 @@ COMPAT_SYSCALL_DEFINE5(recvmmsg, int, fd, struct compat_mmsghdr __user *, mmsg, COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args) { - int ret; - u32 a[6]; + u32 a[AUDITSC_ARGS]; + unsigned int len; u32 a0, a1; + int ret; if (call < SYS_SOCKET || call > SYS_SENDMMSG) return -EINVAL; - if (copy_from_user(a, args, nas[call])) + len = nas[call]; + if (len > sizeof(a)) + return -EINVAL; + + if (copy_from_user(a, args, len)) return -EFAULT; + + ret = audit_socketcall_compat(len / sizeof(a[0]), a); + if (ret) + return ret; + a0 = a[0]; a1 = a[1]; From 874b5acede7892b24cb883b61c1aee9d8842fd0f Mon Sep 17 00:00:00 2001 From: Michal Kazior Date: Thu, 12 Jan 2017 16:14:30 +0100 Subject: [PATCH 193/555] ath10k: prevent sta pointer rcu violation [ Upstream commit 0a744d927406389e00687560d9ce3c5ab0e58db9 ] Station pointers are RCU protected so driver must be extra careful if it tries to store them internally for later use outside of the RCU section it obtained it in. It was possible for station teardown to race with some htt events. The possible outcome could be a use-after-free and a crash. Only peer-flow-control capable firmware was affected (so hardware-wise qca99x0 and qca4019). This could be done in sta_state() itself via explicit synchronize_net() call but there's already a convenient sta_pre_rcu_remove() op that can be hooked up to avoid extra rcu stall. The peer->sta pointer itself can't be set to NULL/ERR_PTR because it is later used in sta_state() for extra sanity checks. Signed-off-by: Michal Kazior Signed-off-by: Kalle Valo Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/ath/ath10k/core.h | 1 + drivers/net/wireless/ath/ath10k/mac.c | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/drivers/net/wireless/ath/ath10k/core.h b/drivers/net/wireless/ath/ath10k/core.h index be5b527472f9..90c0c4a7175d 100644 --- a/drivers/net/wireless/ath/ath10k/core.h +++ b/drivers/net/wireless/ath/ath10k/core.h @@ -314,6 +314,7 @@ struct ath10k_peer { struct ieee80211_vif *vif; struct ieee80211_sta *sta; + bool removed; int vdev_id; u8 addr[ETH_ALEN]; DECLARE_BITMAP(peer_ids, ATH10K_MAX_NUM_PEER_IDS); diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index f2e85eb22afe..30e98afa2e68 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -3738,6 +3738,9 @@ struct ieee80211_txq *ath10k_mac_txq_lookup(struct ath10k *ar, if (!peer) return NULL; + if (peer->removed) + return NULL; + if (peer->sta) return peer->sta->txq[tid]; else if (peer->vif) @@ -7422,6 +7425,20 @@ ath10k_mac_op_switch_vif_chanctx(struct ieee80211_hw *hw, return 0; } +static void ath10k_mac_op_sta_pre_rcu_remove(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_sta *sta) +{ + struct ath10k *ar; + struct ath10k_peer *peer; + + ar = hw->priv; + + list_for_each_entry(peer, &ar->peers, list) + if (peer->sta == sta) + peer->removed = true; +} + static const struct ieee80211_ops ath10k_ops = { .tx = ath10k_mac_op_tx, .wake_tx_queue = ath10k_mac_op_wake_tx_queue, @@ -7462,6 +7479,7 @@ static const struct ieee80211_ops ath10k_ops = { .assign_vif_chanctx = ath10k_mac_op_assign_vif_chanctx, .unassign_vif_chanctx = ath10k_mac_op_unassign_vif_chanctx, .switch_vif_chanctx = ath10k_mac_op_switch_vif_chanctx, + .sta_pre_rcu_remove = ath10k_mac_op_sta_pre_rcu_remove, CFG80211_TESTMODE_CMD(ath10k_tm_cmd) From 4af5e6136d766aef735f9b65534a0c4c4ca35ad4 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Thu, 19 Jan 2017 16:25:21 +0200 Subject: [PATCH 194/555] spi: pxa2xx: Add support for Intel Gemini Lake [ Upstream commit e18a80acd1365e91e3efcd69942d9073936cf851 ] Gemini Lake reuses the same LPSS SPI configuration as Broxton Signed-off-by: David E. Box Signed-off-by: Jarkko Nikula Signed-off-by: Mark Brown Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/spi/spi-pxa2xx.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/spi/spi-pxa2xx.c b/drivers/spi/spi-pxa2xx.c index d6239fa718be..3f3751e2b521 100644 --- a/drivers/spi/spi-pxa2xx.c +++ b/drivers/spi/spi-pxa2xx.c @@ -1458,6 +1458,10 @@ static const struct pci_device_id pxa2xx_spi_pci_compound_match[] = { { PCI_VDEVICE(INTEL, 0x1ac2), LPSS_BXT_SSP }, { PCI_VDEVICE(INTEL, 0x1ac4), LPSS_BXT_SSP }, { PCI_VDEVICE(INTEL, 0x1ac6), LPSS_BXT_SSP }, + /* GLK */ + { PCI_VDEVICE(INTEL, 0x31c2), LPSS_BXT_SSP }, + { PCI_VDEVICE(INTEL, 0x31c4), LPSS_BXT_SSP }, + { PCI_VDEVICE(INTEL, 0x31c6), LPSS_BXT_SSP }, /* APL */ { PCI_VDEVICE(INTEL, 0x5ac2), LPSS_BXT_SSP }, { PCI_VDEVICE(INTEL, 0x5ac4), LPSS_BXT_SSP }, From 75d1888ddce9dbe8774755735ee235dd03b2dd33 Mon Sep 17 00:00:00 2001 From: Sricharan R Date: Fri, 6 Jan 2017 18:58:15 +0530 Subject: [PATCH 195/555] iommu/arm-smmu: Set privileged attribute to 'default' instead of 'unprivileged' [ Upstream commit e19898077cfb642fe151ba22981e795c74d9e114 ] Currently the driver sets all the device transactions privileges to UNPRIVILEGED, but there are cases where the iommu masters wants to isolate privileged supervisor and unprivileged user. So don't override the privileged setting to unprivileged, instead set it to default as incoming and let it be controlled by the pagetable settings. Acked-by: Will Deacon Signed-off-by: Sricharan R Signed-off-by: Will Deacon Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/iommu/arm-smmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c index 8f7281444551..5a9a4416f467 100644 --- a/drivers/iommu/arm-smmu.c +++ b/drivers/iommu/arm-smmu.c @@ -1211,7 +1211,7 @@ static int arm_smmu_domain_add_master(struct arm_smmu_domain *smmu_domain, continue; s2cr[idx].type = type; - s2cr[idx].privcfg = S2CR_PRIVCFG_UNPRIV; + s2cr[idx].privcfg = S2CR_PRIVCFG_DEFAULT; s2cr[idx].cbndx = cbndx; arm_smmu_write_s2cr(smmu, idx); } From 3f22900466a1e51df3da33c4ca338aa3d0632274 Mon Sep 17 00:00:00 2001 From: Peter Chen Date: Wed, 19 Oct 2016 15:32:58 +0800 Subject: [PATCH 196/555] usb: chipidea: vbus event may exist before starting gadget [ Upstream commit c3b674a04b8ab62a1d35e86714d466af0a0ecc18 ] At some situations, the vbus may already be there before starting gadget. So we need to check vbus event after switching to gadget in order to handle missing vbus event. The typical use cases are plugging vbus cable before driver load or the vbus has already been there after stopping host but before starting gadget. Signed-off-by: Peter Chen Tested-by: Stephen Boyd Reported-by: Stephen Boyd Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/usb/chipidea/otg.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/usb/chipidea/otg.c b/drivers/usb/chipidea/otg.c index 0cf149edddd8..f36a1ac3bfbd 100644 --- a/drivers/usb/chipidea/otg.c +++ b/drivers/usb/chipidea/otg.c @@ -134,9 +134,9 @@ void ci_handle_vbus_change(struct ci_hdrc *ci) if (!ci->is_otg) return; - if (hw_read_otgsc(ci, OTGSC_BSV)) + if (hw_read_otgsc(ci, OTGSC_BSV) && !ci->vbus_active) usb_gadget_vbus_connect(&ci->gadget); - else + else if (!hw_read_otgsc(ci, OTGSC_BSV) && ci->vbus_active) usb_gadget_vbus_disconnect(&ci->gadget); } @@ -175,14 +175,21 @@ static void ci_handle_id_switch(struct ci_hdrc *ci) ci_role_stop(ci); - if (role == CI_ROLE_GADGET) + if (role == CI_ROLE_GADGET && + IS_ERR(ci->platdata->vbus_extcon.edev)) /* - * wait vbus lower than OTGSC_BSV before connecting - * to host + * Wait vbus lower than OTGSC_BSV before connecting + * to host. If connecting status is from an external + * connector instead of register, we don't need to + * care vbus on the board, since it will not affect + * external connector status. */ hw_wait_vbus_lower_bsv(ci); ci_role_start(ci, role); + /* vbus change may have already occurred */ + if (role == CI_ROLE_GADGET) + ci_handle_vbus_change(ci); } } /** From 4302bc4f40b1ed8328e0cb358d1d34f56b6b47c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Axel=20K=C3=B6llhofer?= Date: Tue, 17 Jan 2017 18:18:55 -0500 Subject: [PATCH 197/555] rtl8xxxu: Add additional USB IDs for rtl8192eu devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 5407fd7de69f3352aed659244d4bef18e3cabf5c ] These IDs originate from the vendor driver Signed-off-by: Axel Köllhofer Signed-off-by: Jes Sorensen Signed-off-by: Kalle Valo Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c index 82d949ede294..4e725d165aa6 100644 --- a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c +++ b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c @@ -6316,6 +6316,13 @@ static struct usb_device_id dev_table[] = { .driver_info = (unsigned long)&rtl8192cu_fops}, {USB_DEVICE_AND_INTERFACE_INFO(0x7392, 0x7822, 0xff, 0xff, 0xff), .driver_info = (unsigned long)&rtl8192cu_fops}, +/* found in rtl8192eu vendor driver */ +{USB_DEVICE_AND_INTERFACE_INFO(0x2357, 0x0107, 0xff, 0xff, 0xff), + .driver_info = (unsigned long)&rtl8192eu_fops}, +{USB_DEVICE_AND_INTERFACE_INFO(0x2019, 0xab33, 0xff, 0xff, 0xff), + .driver_info = (unsigned long)&rtl8192eu_fops}, +{USB_DEVICE_AND_INTERFACE_INFO(USB_VENDOR_ID_REALTEK, 0x818c, 0xff, 0xff, 0xff), + .driver_info = (unsigned long)&rtl8192eu_fops}, #endif { } }; From e2d1a42ed06e98fee10fc3d5be30f9cb06a6dfc0 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 20 Jan 2017 14:07:52 +0100 Subject: [PATCH 198/555] ASoC: dapm: fix some pointer error handling [ Upstream commit 639467c8f26d834c934215e8b59129ce442475fe ] commit 66feeec9322132689d42723df2537d60f96f8e44 "RFC: ASoC: dapm: handle probe deferrals" forgot a to update some two sites where the call was used. The static codechecks quickly found them. Reported-by: Dan Carpenter Fixes: 66feeec93221 ("RFC: ASoC: dapm: handle probe deferrals") Signed-off-by: Linus Walleij Signed-off-by: Mark Brown Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- sound/soc/soc-dapm.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index 411f7574dd0b..6780eba55ec2 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -3843,6 +3843,16 @@ int snd_soc_dapm_new_dai_widgets(struct snd_soc_dapm_context *dapm, template.name); w = snd_soc_dapm_new_control_unlocked(dapm, &template); + if (IS_ERR(w)) { + int ret = PTR_ERR(w); + + /* Do not nag about probe deferrals */ + if (ret != -EPROBE_DEFER) + dev_err(dapm->dev, + "ASoC: Failed to create %s widget (%d)\n", + dai->driver->playback.stream_name, ret); + return ret; + } if (!w) { dev_err(dapm->dev, "ASoC: Failed to create %s widget\n", dai->driver->playback.stream_name); @@ -3862,6 +3872,16 @@ int snd_soc_dapm_new_dai_widgets(struct snd_soc_dapm_context *dapm, template.name); w = snd_soc_dapm_new_control_unlocked(dapm, &template); + if (IS_ERR(w)) { + int ret = PTR_ERR(w); + + /* Do not nag about probe deferrals */ + if (ret != -EPROBE_DEFER) + dev_err(dapm->dev, + "ASoC: Failed to create %s widget (%d)\n", + dai->driver->playback.stream_name, ret); + return ret; + } if (!w) { dev_err(dapm->dev, "ASoC: Failed to create %s widget\n", dai->driver->capture.stream_name); From c6737116517005b7b4d38821b21841a92125effb Mon Sep 17 00:00:00 2001 From: Brian Starkey Date: Wed, 7 Dec 2016 13:17:21 +0000 Subject: [PATCH 199/555] drm: mali-dp: Fix destination size handling when rotating [ Upstream commit edabb3c4cd2d035bc93a3d67b25a304ea6217301 ] The destination rectangle provided by userspace in the CRTC_X/Y/W/H properties is already expressed as the dimensions after rotation. This means we shouldn't swap the width and height ourselves when a 90/270 degree rotation is requested, so remove the code doing the swap. Fixes: ad49f8602fe8 ("drm/arm: Add support for Mali Display Processors") Signed-off-by: Brian Starkey Signed-off-by: Liviu Dudau Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/arm/malidp_planes.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/arm/malidp_planes.c b/drivers/gpu/drm/arm/malidp_planes.c index 82c193e5e0d6..a6bdd91d6c9e 100644 --- a/drivers/gpu/drm/arm/malidp_planes.c +++ b/drivers/gpu/drm/arm/malidp_planes.c @@ -150,13 +150,8 @@ static void malidp_de_plane_update(struct drm_plane *plane, /* convert src values from Q16 fixed point to integer */ src_w = plane->state->src_w >> 16; src_h = plane->state->src_h >> 16; - if (plane->state->rotation & MALIDP_ROTATED_MASK) { - dest_w = plane->state->crtc_h; - dest_h = plane->state->crtc_w; - } else { - dest_w = plane->state->crtc_w; - dest_h = plane->state->crtc_h; - } + dest_w = plane->state->crtc_w; + dest_h = plane->state->crtc_h; malidp_hw_write(mp->hwdev, format_id, mp->layer->base); From d621f970fd71d30a84199f1890cddcbd503829bc Mon Sep 17 00:00:00 2001 From: Brian Starkey Date: Wed, 7 Dec 2016 13:20:28 +0000 Subject: [PATCH 200/555] drm: mali-dp: Fix transposed horizontal/vertical flip [ Upstream commit 7916efe5b57505080b3cebf5bdb228b4eda008ea ] The horizontal and vertical flip flags were the wrong way around, causing reflect-x to result in reflect-y being applied and vice-versa. Fix them. Fixes: ad49f8602fe8 ("drm/arm: Add support for Mali Display Processors") Signed-off-by: Brian Starkey Signed-off-by: Liviu Dudau Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/arm/malidp_planes.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/arm/malidp_planes.c b/drivers/gpu/drm/arm/malidp_planes.c index a6bdd91d6c9e..afe0480d95c9 100644 --- a/drivers/gpu/drm/arm/malidp_planes.c +++ b/drivers/gpu/drm/arm/malidp_planes.c @@ -184,9 +184,9 @@ static void malidp_de_plane_update(struct drm_plane *plane, if (plane->state->rotation & DRM_ROTATE_MASK) val = ilog2(plane->state->rotation & DRM_ROTATE_MASK) << LAYER_ROT_OFFSET; if (plane->state->rotation & DRM_REFLECT_X) - val |= LAYER_V_FLIP; - if (plane->state->rotation & DRM_REFLECT_Y) val |= LAYER_H_FLIP; + if (plane->state->rotation & DRM_REFLECT_Y) + val |= LAYER_V_FLIP; /* set the 'enable layer' bit */ val |= LAYER_ENABLE; From fd9597d6ea2803f1929bc9e91ea159e5a9be8480 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Fri, 20 Jan 2017 16:20:11 +0100 Subject: [PATCH 201/555] HID: wacom: release the resources before leaving despite devm [ Upstream commit 5b779fc52020ac6f5beea31c5eafc3d25cf70dc1 ] In the general case, the resources are properly released by devm without needing to do anything. However, when unplugging the wireless receiver, the kernel segfaults from time to time while calling devres_release_all(). I think in that case the resources attempt to access hid_get_drvdata(hdev) which has been set to null while leaving wacom_remove(). Signed-off-by: Benjamin Tissoires Acked-by: Jason Gerecke Signed-off-by: Jiri Kosina Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/hid/wacom_sys.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/hid/wacom_sys.c b/drivers/hid/wacom_sys.c index 0c535d0f3b95..53ac19b3727a 100644 --- a/drivers/hid/wacom_sys.c +++ b/drivers/hid/wacom_sys.c @@ -2433,6 +2433,8 @@ static void wacom_remove(struct hid_device *hdev) if (hdev->bus == BUS_BLUETOOTH) device_remove_file(&hdev->dev, &dev_attr_speed); + wacom_release_resources(wacom); + hid_set_drvdata(hdev, NULL); } From 30a0220a5b0bd343e1c55a3e861e33607aef787b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 17 Jan 2017 16:18:40 +0100 Subject: [PATCH 202/555] MIPS: Lantiq: Fix another request_mem_region() return code check [ Upstream commit 98ea51cb0c8ce009d9da1fd7b48f0ff1d7a9bbb0 ] Hauke already fixed a couple of them, but one instance remains that checks for a negative integer when it should check for a NULL pointer: arch/mips/lantiq/xway/sysctrl.c: In function 'ltq_soc_init': arch/mips/lantiq/xway/sysctrl.c:473:19: error: ordered comparison of pointer with integer zero [-Werror=extra] Fixes: 6e807852676a ("MIPS: Lantiq: Fix check for return value of request_mem_region()") Signed-off-by: Arnd Bergmann Cc: John Crispin Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/15043/ Signed-off-by: Ralf Baechle Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/mips/lantiq/xway/sysctrl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/lantiq/xway/sysctrl.c b/arch/mips/lantiq/xway/sysctrl.c index 90565477dfbd..95bec460b651 100644 --- a/arch/mips/lantiq/xway/sysctrl.c +++ b/arch/mips/lantiq/xway/sysctrl.c @@ -469,8 +469,8 @@ void __init ltq_soc_init(void) panic("Failed to load xbar nodes from devicetree"); if (of_address_to_resource(np_xbar, 0, &res_xbar)) panic("Failed to get xbar resources"); - if (request_mem_region(res_xbar.start, resource_size(&res_xbar), - res_xbar.name) < 0) + if (!request_mem_region(res_xbar.start, resource_size(&res_xbar), + res_xbar.name)) panic("Failed to get xbar resources"); ltq_xbar_membase = ioremap_nocache(res_xbar.start, From 6329973bee29d1f35f72b520076151c72c05d0b4 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Mon, 2 Jan 2017 15:18:21 +0530 Subject: [PATCH 203/555] mips: ath79: clock:- Unmap region obtained by of_iomap [ Upstream commit b3d91db3f71d5f70ea60d900425a3f96aeb3d065 ] Free memory mapping, if ath79_clocks_init_dt_ng is not successful. Signed-off-by: Arvind Yadav Fixes: 3bdf1071ba7d ("MIPS: ath79: update devicetree clock support for AR9132") Cc: antonynpavlov@gmail.com Cc: albeu@free.fr Cc: hackpascal@gmail.com Cc: sboyd@codeaurora.org Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/14915/ Signed-off-by: Ralf Baechle Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/mips/ath79/clock.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/mips/ath79/clock.c b/arch/mips/ath79/clock.c index cc3a1e33a600..7e2bb12b64ea 100644 --- a/arch/mips/ath79/clock.c +++ b/arch/mips/ath79/clock.c @@ -508,16 +508,19 @@ static void __init ath79_clocks_init_dt_ng(struct device_node *np) ar9330_clk_init(ref_clk, pll_base); else { pr_err("%s: could not find any appropriate clk_init()\n", dnfn); - goto err_clk; + goto err_iounmap; } if (of_clk_add_provider(np, of_clk_src_onecell_get, &clk_data)) { pr_err("%s: could not register clk provider\n", dnfn); - goto err_clk; + goto err_iounmap; } return; +err_iounmap: + iounmap(pll_base); + err_clk: clk_put(ref_clk); From c593091cfc1bee31b56b555c83dc5f0ed715d7ed Mon Sep 17 00:00:00 2001 From: Juerg Haefliger Date: Thu, 19 Jan 2017 11:40:13 +0100 Subject: [PATCH 204/555] lkdtm: Fix Oops when unloading the module [ Upstream commit 9ba60573638e2006170ebcc5489fb1e068afbc8f ] No jprobe is registered when the module is loaded without specifying a crashpoint that uses a jprobe. At the moment, we unconditionally try to unregister the jprobe on module unload which results in an Oops. Add a check to fix this. Signed-off-by: Juerg Haefliger Acked-by: Kees Cook Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/misc/lkdtm_core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/misc/lkdtm_core.c b/drivers/misc/lkdtm_core.c index f9154b8d67f6..b2989f2d3126 100644 --- a/drivers/misc/lkdtm_core.c +++ b/drivers/misc/lkdtm_core.c @@ -533,7 +533,9 @@ static void __exit lkdtm_module_exit(void) /* Handle test-specific clean-up. */ lkdtm_usercopy_exit(); - unregister_jprobe(lkdtm_jprobe); + if (lkdtm_jprobe != NULL) + unregister_jprobe(lkdtm_jprobe); + pr_info("Crash point unregistered\n"); } From 1dee03af7325f8a04954f9114cb76945fddb950c Mon Sep 17 00:00:00 2001 From: Myungho Jung Date: Tue, 25 Apr 2017 11:58:15 -0700 Subject: [PATCH 205/555] net: core: Prevent from dereferencing null pointer when releasing SKB [ Upstream commit 9899886d5e8ec5b343b1efe44f185a0e68dc6454 ] Added NULL check to make __dev_kfree_skb_irq consistent with kfree family of functions. Link: https://bugzilla.kernel.org/show_bug.cgi?id=195289 Signed-off-by: Myungho Jung Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/core/dev.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/core/dev.c b/net/core/dev.c index ba7b8121a414..7f2caad46a3d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2355,6 +2355,9 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) { unsigned long flags; + if (unlikely(!skb)) + return; + if (likely(atomic_read(&skb->users) == 1)) { smp_rmb(); atomic_set(&skb->users, 0); From 897e8c528529f5b53d1a63bbac3802adf5e09170 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Tue, 25 Apr 2017 18:51:46 +0200 Subject: [PATCH 206/555] net/packet: check length in getsockopt() called with PACKET_HDRLEN [ Upstream commit fd2c83b35752f0a8236b976978ad4658df14a59f ] In the case getsockopt() is called with PACKET_HDRLEN and optlen < 4 |val| remains uninitialized and the syscall may behave differently depending on its value, and even copy garbage to userspace on certain architectures. To fix this we now return -EINVAL if optlen is too small. This bug has been detected with KMSAN. Signed-off-by: Alexander Potapenko Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/packet/af_packet.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 35ba4b60d927..9c92c6cb6a4f 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3884,6 +3884,8 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, case PACKET_HDRLEN: if (len > sizeof(int)) len = sizeof(int); + if (len < sizeof(int)) + return -EINVAL; if (copy_from_user(&val, optval, len)) return -EFAULT; switch (val) { From 45eacc855552c4f227f396fa877c83e2a7837669 Mon Sep 17 00:00:00 2001 From: Pan Bian Date: Mon, 24 Apr 2017 18:29:16 +0800 Subject: [PATCH 207/555] team: fix memory leaks [ Upstream commit 72ec0bc64b9a5d8e0efcb717abfc757746b101b7 ] In functions team_nl_send_port_list_get() and team_nl_send_options_get(), pointer skb keeps the return value of nlmsg_new(). When the call to genlmsg_put() fails, the memory is not freed(). This will result in memory leak bugs. Fixes: 9b00cf2d1024 ("team: implement multipart netlink messages for options transfers") Signed-off-by: Pan Bian Acked-by: Jiri Pirko Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/team/team.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index a380649bf6b5..26681707fc7a 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -2366,8 +2366,10 @@ start_again: hdr = genlmsg_put(skb, portid, seq, &team_nl_family, flags | NLM_F_MULTI, TEAM_CMD_OPTIONS_GET); - if (!hdr) + if (!hdr) { + nlmsg_free(skb); return -EMSGSIZE; + } if (nla_put_u32(skb, TEAM_ATTR_TEAM_IFINDEX, team->dev->ifindex)) goto nla_put_failure; @@ -2639,8 +2641,10 @@ start_again: hdr = genlmsg_put(skb, portid, seq, &team_nl_family, flags | NLM_F_MULTI, TEAM_CMD_PORT_LIST_GET); - if (!hdr) + if (!hdr) { + nlmsg_free(skb); return -EMSGSIZE; + } if (nla_put_u32(skb, TEAM_ATTR_TEAM_IFINDEX, team->dev->ifindex)) goto nla_put_failure; From 65a7a7ce7ffd91f148c2fa0b60c797005918152f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Roman=20Spycha=C5=82a?= Date: Thu, 20 Apr 2017 12:04:10 +0200 Subject: [PATCH 208/555] usb: plusb: Add support for PL-27A1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 6f2aee0c0de65013333bbc26fe50c9c7b09a37f7 ] This patch adds support for the PL-27A1 by adding the appropriate USB ID's. This chip is used in the goobay Active USB 3.0 Data Link and Unitek Y-3501 cables. Signed-off-by: Roman Spychała Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/usb/Kconfig | 2 +- drivers/net/usb/plusb.c | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/drivers/net/usb/Kconfig b/drivers/net/usb/Kconfig index cdde59089f72..3a7286256db0 100644 --- a/drivers/net/usb/Kconfig +++ b/drivers/net/usb/Kconfig @@ -364,7 +364,7 @@ config USB_NET_NET1080 optionally with LEDs that indicate traffic config USB_NET_PLUSB - tristate "Prolific PL-2301/2302/25A1 based cables" + tristate "Prolific PL-2301/2302/25A1/27A1 based cables" # if the handshake/init/reset problems, from original 'plusb', # are ever resolved ... then remove "experimental" depends on USB_USBNET diff --git a/drivers/net/usb/plusb.c b/drivers/net/usb/plusb.c index 22e1a9a99a7d..6fe59373cba9 100644 --- a/drivers/net/usb/plusb.c +++ b/drivers/net/usb/plusb.c @@ -102,7 +102,7 @@ static int pl_reset(struct usbnet *dev) } static const struct driver_info prolific_info = { - .description = "Prolific PL-2301/PL-2302/PL-25A1", + .description = "Prolific PL-2301/PL-2302/PL-25A1/PL-27A1", .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT, /* some PL-2302 versions seem to fail usb_set_interface() */ .reset = pl_reset, @@ -139,6 +139,17 @@ static const struct usb_device_id products [] = { * Host-to-Host Cable */ .driver_info = (unsigned long) &prolific_info, + +}, + +/* super speed cables */ +{ + USB_DEVICE(0x067b, 0x27a1), /* PL-27A1, no eeprom + * also: goobay Active USB 3.0 + * Data Link, + * Unitek Y-3501 + */ + .driver_info = (unsigned long) &prolific_info, }, { }, // END @@ -158,5 +169,5 @@ static struct usb_driver plusb_driver = { module_usb_driver(plusb_driver); MODULE_AUTHOR("David Brownell"); -MODULE_DESCRIPTION("Prolific PL-2301/2302/25A1 USB Host to Host Link Driver"); +MODULE_DESCRIPTION("Prolific PL-2301/2302/25A1/27A1 USB Host to Host Link Driver"); MODULE_LICENSE("GPL"); From affd26096a590f6a58e6ee8629fcde43cfbed3f6 Mon Sep 17 00:00:00 2001 From: Ansis Atteka Date: Fri, 21 Apr 2017 15:23:05 -0700 Subject: [PATCH 209/555] udp: disable inner UDP checksum offloads in IPsec case [ Upstream commit b40c5f4fde22fb98eff205b3aece05b471c24eed ] Otherwise, UDP checksum offloads could corrupt ESP packets by attempting to calculate UDP checksum when this inner UDP packet is already protected by IPsec. One way to reproduce this bug is to have a VM with virtio_net driver (UFO set to ON in the guest VM); and then encapsulate all guest's Ethernet frames in Geneve; and then further encrypt Geneve with IPsec. In this case following symptoms are observed: 1. If using ixgbe NIC, then it will complain with following error message: ixgbe 0000:01:00.1: partial checksum but l4 proto=32! 2. Receiving IPsec stack will drop all the corrupted ESP packets and increase XfrmInStateProtoError counter in /proc/net/xfrm_stat. 3. iperf UDP test from the VM with packet sizes above MTU will not work at all. 4. iperf TCP test from the VM will get ridiculously low performance because. Signed-off-by: Ansis Atteka Co-authored-by: Steffen Klassert Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/ipv4/udp_offload.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 6de016f80f17..0932c85b42af 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -29,6 +29,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, u16 mac_len = skb->mac_len; int udp_offset, outer_hlen; __wsum partial; + bool need_ipsec; if (unlikely(!pskb_may_pull(skb, tnl_hlen))) goto out; @@ -62,8 +63,10 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, ufo = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); + need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb)); /* Try to offload checksum if possible */ offload_csum = !!(need_csum && + !need_ipsec && (skb->dev->features & (is_ipv6 ? (NETIF_F_HW_CSUM | NETIF_F_IPV6_CSUM) : (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM)))); From f06316859ce6b812bd791c2092da6351cc6829ab Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 24 Apr 2017 14:27:21 -0700 Subject: [PATCH 210/555] net: dsa: b53: Include IMP/CPU port in dumb forwarding mode [ Upstream commit a424f0de61638cbb5047e0a888c54da9cf471f90 ] Since Broadcom tags are not enabled in b53 (DSA_PROTO_TAG_NONE), we need to make sure that the IMP/CPU port is included in the forwarding decision. Without this change, switching between non-management ports would work, but not between management ports and non-management ports thus breaking the default state in which DSA switch are brought up. Fixes: 967dd82ffc52 ("net: dsa: b53: Add support for Broadcom RoboSwitch") Reported-by: Eric Anholt Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/dsa/b53/b53_common.c | 10 ++++++++++ drivers/net/dsa/b53/b53_regs.h | 4 ++++ 2 files changed, 14 insertions(+) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 3ec573c13dac..c26debc531ee 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -326,6 +326,7 @@ static void b53_get_vlan_entry(struct b53_device *dev, u16 vid, static void b53_set_forwarding(struct b53_device *dev, int enable) { + struct dsa_switch *ds = dev->ds; u8 mgmt; b53_read8(dev, B53_CTRL_PAGE, B53_SWITCH_MODE, &mgmt); @@ -336,6 +337,15 @@ static void b53_set_forwarding(struct b53_device *dev, int enable) mgmt &= ~SM_SW_FWD_EN; b53_write8(dev, B53_CTRL_PAGE, B53_SWITCH_MODE, mgmt); + + /* Include IMP port in dumb forwarding mode when no tagging protocol is + * set + */ + if (ds->ops->get_tag_protocol(ds) == DSA_TAG_PROTO_NONE) { + b53_read8(dev, B53_CTRL_PAGE, B53_SWITCH_CTRL, &mgmt); + mgmt |= B53_MII_DUMB_FWDG_EN; + b53_write8(dev, B53_CTRL_PAGE, B53_SWITCH_CTRL, mgmt); + } } static void b53_enable_vlan(struct b53_device *dev, bool enable) diff --git a/drivers/net/dsa/b53/b53_regs.h b/drivers/net/dsa/b53/b53_regs.h index dac0af4e2cd0..81044000ce75 100644 --- a/drivers/net/dsa/b53/b53_regs.h +++ b/drivers/net/dsa/b53/b53_regs.h @@ -104,6 +104,10 @@ #define B53_UC_FWD_EN BIT(6) #define B53_MC_FWD_EN BIT(7) +/* Switch control (8 bit) */ +#define B53_SWITCH_CTRL 0x22 +#define B53_MII_DUMB_FWDG_EN BIT(6) + /* (16 bit) */ #define B53_UC_FLOOD_MASK 0x32 #define B53_MC_FLOOD_MASK 0x34 From 8f9bd136b50b15e1ddf12281b34802e948023619 Mon Sep 17 00:00:00 2001 From: "sudarsana.kalluru@cavium.com" Date: Wed, 19 Apr 2017 03:19:54 -0700 Subject: [PATCH 211/555] qed: Fix possible system hang in the dcbnl-getdcbx() path. [ Upstream commit 62289ba27558553871fd047baadaaeda886c6a63 ] qed_dcbnl_get_dcbx() API uses kmalloc in GFT_KERNEL mode. The API gets invoked in the interrupt context by qed_dcbnl_getdcbx callback. Need to invoke this kmalloc in atomic mode. Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/qlogic/qed/qed_dcbx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c index a4789a93b692..9d59cb85c012 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c @@ -1222,7 +1222,7 @@ static struct qed_dcbx_get *qed_dcbnl_get_dcbx(struct qed_hwfn *hwfn, { struct qed_dcbx_get *dcbx_info; - dcbx_info = kzalloc(sizeof(*dcbx_info), GFP_KERNEL); + dcbx_info = kmalloc(sizeof(*dcbx_info), GFP_ATOMIC); if (!dcbx_info) return NULL; From 727a153435fa431c3c99cf5718976bdb73702d06 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 29 Mar 2017 20:54:37 +0200 Subject: [PATCH 212/555] mmc: sdio: fix alignment issue in struct sdio_func [ Upstream commit 5ef1ecf060f28ecef313b5723f1fd39bf5a35f56 ] Certain 64-bit systems (e.g. Amlogic Meson GX) require buffers to be used for DMA to be 8-byte-aligned. struct sdio_func has an embedded small DMA buffer not meeting this requirement. When testing switching to descriptor chain mode in meson-gx driver SDIO is broken therefore. Fix this by allocating the small DMA buffer separately as kmalloc ensures that the returned memory area is properly aligned for every basic data type. Signed-off-by: Heiner Kallweit Tested-by: Helmut Klein Signed-off-by: Ulf Hansson Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/core/sdio_bus.c | 12 +++++++++++- include/linux/mmc/sdio_func.h | 2 +- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/core/sdio_bus.c b/drivers/mmc/core/sdio_bus.c index 86f5b3223aae..d56a3b6c2fb9 100644 --- a/drivers/mmc/core/sdio_bus.c +++ b/drivers/mmc/core/sdio_bus.c @@ -266,7 +266,7 @@ static void sdio_release_func(struct device *dev) sdio_free_func_cis(func); kfree(func->info); - + kfree(func->tmpbuf); kfree(func); } @@ -281,6 +281,16 @@ struct sdio_func *sdio_alloc_func(struct mmc_card *card) if (!func) return ERR_PTR(-ENOMEM); + /* + * allocate buffer separately to make sure it's properly aligned for + * DMA usage (incl. 64 bit DMA) + */ + func->tmpbuf = kmalloc(4, GFP_KERNEL); + if (!func->tmpbuf) { + kfree(func); + return ERR_PTR(-ENOMEM); + } + func->card = card; device_initialize(&func->dev); diff --git a/include/linux/mmc/sdio_func.h b/include/linux/mmc/sdio_func.h index aab032a6ae61..97ca105347a6 100644 --- a/include/linux/mmc/sdio_func.h +++ b/include/linux/mmc/sdio_func.h @@ -53,7 +53,7 @@ struct sdio_func { unsigned int state; /* function state */ #define SDIO_STATE_PRESENT (1<<0) /* present in sysfs */ - u8 tmpbuf[4]; /* DMA:able scratch buffer */ + u8 *tmpbuf; /* DMA:able scratch buffer */ unsigned num_info; /* number of info strings */ const char **info; /* info strings */ From 7e2a755497f3f8625fbbbc671f8628d769c0cb36 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 10 Apr 2017 14:59:28 +0300 Subject: [PATCH 213/555] bridge: netlink: register netdevice before executing changelink [ Upstream commit 5b8d5429daa05bebef6ffd3297df3b502cc6f184 ] Peter reported a kernel oops when executing the following command: $ ip link add name test type bridge vlan_default_pvid 1 [13634.939408] BUG: unable to handle kernel NULL pointer dereference at 0000000000000190 [13634.939436] IP: __vlan_add+0x73/0x5f0 [...] [13634.939783] Call Trace: [13634.939791] ? pcpu_next_unpop+0x3b/0x50 [13634.939801] ? pcpu_alloc+0x3d2/0x680 [13634.939810] ? br_vlan_add+0x135/0x1b0 [13634.939820] ? __br_vlan_set_default_pvid.part.28+0x204/0x2b0 [13634.939834] ? br_changelink+0x120/0x4e0 [13634.939844] ? br_dev_newlink+0x50/0x70 [13634.939854] ? rtnl_newlink+0x5f5/0x8a0 [13634.939864] ? rtnl_newlink+0x176/0x8a0 [13634.939874] ? mem_cgroup_commit_charge+0x7c/0x4e0 [13634.939886] ? rtnetlink_rcv_msg+0xe1/0x220 [13634.939896] ? lookup_fast+0x52/0x370 [13634.939905] ? rtnl_newlink+0x8a0/0x8a0 [13634.939915] ? netlink_rcv_skb+0xa1/0xc0 [13634.939925] ? rtnetlink_rcv+0x24/0x30 [13634.939934] ? netlink_unicast+0x177/0x220 [13634.939944] ? netlink_sendmsg+0x2fe/0x3b0 [13634.939954] ? _copy_from_user+0x39/0x40 [13634.939964] ? sock_sendmsg+0x30/0x40 [13634.940159] ? ___sys_sendmsg+0x29d/0x2b0 [13634.940326] ? __alloc_pages_nodemask+0xdf/0x230 [13634.940478] ? mem_cgroup_commit_charge+0x7c/0x4e0 [13634.940592] ? mem_cgroup_try_charge+0x76/0x1a0 [13634.940701] ? __handle_mm_fault+0xdb9/0x10b0 [13634.940809] ? __sys_sendmsg+0x51/0x90 [13634.940917] ? entry_SYSCALL_64_fastpath+0x1e/0xad The problem is that the bridge's VLAN group is created after setting the default PVID, when registering the netdevice and executing its ndo_init(). Fix this by changing the order of both operations, so that br_changelink() is only processed after the netdevice is registered, when the VLAN group is already initialized. Fixes: b6677449dff6 ("bridge: netlink: call br_changelink() during br_dev_newlink()") Signed-off-by: Nikolay Aleksandrov Signed-off-by: Ido Schimmel Reported-by: Peter V. Saveliev Tested-by: Peter V. Saveliev Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/bridge/br_netlink.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 7625ec8458de..5d4006e589cb 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1098,11 +1098,14 @@ static int br_dev_newlink(struct net *src_net, struct net_device *dev, spin_unlock_bh(&br->lock); } - err = br_changelink(dev, tb, data); + err = register_netdevice(dev); if (err) return err; - return register_netdevice(dev); + err = br_changelink(dev, tb, data); + if (err) + unregister_netdevice(dev); + return err; } static size_t br_get_size(const struct net_device *brdev) From c17acd24c682322563c9c35bd7ed94f13fed3664 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 7 Apr 2017 13:11:10 -0700 Subject: [PATCH 214/555] Btrfs: fix segmentation fault when doing dio read [ Upstream commit 97bf5a5589aa3a59c60aa775fc12ec0483fc5002 ] Commit 2dabb3248453 ("Btrfs: Direct I/O read: Work on sectorsized blocks") introduced this bug during iterating bio pages in dio read's endio hook, and it could end up with segment fault of the dio reading task. So the reason is 'if (nr_sectors--)', and it makes the code assume that there is one more block in the same page, so page offset is increased and the bio which is created to repair the bad block then has an incorrect bvec.bv_offset, and a later access of the page content would throw a segmentation fault. This also adds ASSERT to check page offset against page size. Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/inode.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8a05fa7e2152..f089d7d8afe7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8050,8 +8050,10 @@ next_block_or_try_again: start += sectorsize; - if (nr_sectors--) { + nr_sectors--; + if (nr_sectors) { pgoff += sectorsize; + ASSERT(pgoff < PAGE_SIZE); goto next_block_or_try_again; } } @@ -8157,8 +8159,10 @@ next: ASSERT(nr_sectors); - if (--nr_sectors) { + nr_sectors--; + if (nr_sectors) { pgoff += sectorsize; + ASSERT(pgoff < PAGE_SIZE); goto next_block; } } From 26899ca9cc6fc5e29c1f489d76a246bcad745f2d Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 10 Apr 2017 12:36:26 -0700 Subject: [PATCH 215/555] Btrfs: fix potential use-after-free for cloned bio [ Upstream commit a967efb30b3afa3d858edd6a17f544f9e9e46eea ] KASAN reports that there is a use-after-free case of bio in btrfs_map_bio. If we need to submit IOs to several disks at a time, the original bio would get cloned and mapped to the destination disk, but we really should use the original bio instead of a cloned bio to do the sanity check because cloned bios are likely to be freed by its endio. Reported-by: Diego Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 71a60cc01451..06a77e47957d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6226,7 +6226,7 @@ int btrfs_map_bio(struct btrfs_root *root, struct bio *bio, for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { dev = bbio->stripes[dev_nr].dev; if (!dev || !dev->bdev || - (bio_op(bio) == REQ_OP_WRITE && !dev->writeable)) { + (bio_op(first_bio) == REQ_OP_WRITE && !dev->writeable)) { bbio_error(bbio, first_bio, logical); continue; } From c533c11d8f7a4c93d2f11deeeaf0d69a3a76412c Mon Sep 17 00:00:00 2001 From: Ondrej Zary Date: Fri, 31 Mar 2017 20:35:42 +0200 Subject: [PATCH 216/555] sata_via: Enable hotplug only on VT6421 [ Upstream commit 3cf864520e877505158f09075794a08abab11bbe ] Commit 57e5568fda27 ("sata_via: Implement hotplug for VT6421") adds hotplug IRQ handler for VT6421 but enables hotplug on all chips. This is a bug because it causes "irq xx: nobody cared" error on VT6420 when hot-(un)plugging a drive: [ 381.839948] irq 20: nobody cared (try booting with the "irqpoll" option) [ 381.840014] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.10.0-rc5+ #148 [ 381.840066] Hardware name: P4VM800/P4VM800, BIOS P1.60 05/29/2006 [ 381.840117] Call Trace: [ 381.840167] [ 381.840225] ? dump_stack+0x44/0x58 [ 381.840278] ? __report_bad_irq+0x14/0x97 [ 381.840327] ? handle_edge_irq+0xa5/0xa5 [ 381.840376] ? note_interrupt+0x155/0x1cf [ 381.840426] ? handle_edge_irq+0xa5/0xa5 [ 381.840474] ? handle_irq_event_percpu+0x32/0x38 [ 381.840524] ? handle_irq_event+0x1f/0x38 [ 381.840573] ? handle_fasteoi_irq+0x69/0xb8 [ 381.840625] ? handle_irq+0x4f/0x5d [ 381.840672] [ 381.840726] ? do_IRQ+0x2e/0x8b [ 381.840782] ? common_interrupt+0x2c/0x34 [ 381.840836] ? mwait_idle+0x60/0x82 [ 381.840892] ? arch_cpu_idle+0x6/0x7 [ 381.840949] ? do_idle+0x96/0x18e [ 381.841002] ? cpu_startup_entry+0x16/0x1a [ 381.841057] ? start_kernel+0x319/0x31c [ 381.841111] ? startup_32_smp+0x166/0x168 [ 381.841165] handlers: [ 381.841219] [] ata_bmdma_interrupt [ 381.841274] Disabling IRQ #20 Seems that VT6420 can do hotplug too (there's no documentation) but the comments say that SCR register access (required for detecting hotplug events) can cause problems on these chips. For now, just keep hotplug disabled on anything other than VT6421. Signed-off-by: Ondrej Zary Signed-off-by: Tejun Heo Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/ata/sata_via.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/ata/sata_via.c b/drivers/ata/sata_via.c index 0636d84fbefe..f3f538eec7b3 100644 --- a/drivers/ata/sata_via.c +++ b/drivers/ata/sata_via.c @@ -644,14 +644,16 @@ static void svia_configure(struct pci_dev *pdev, int board_id, pci_write_config_byte(pdev, SATA_NATIVE_MODE, tmp8); } - /* enable IRQ on hotplug */ - pci_read_config_byte(pdev, SVIA_MISC_3, &tmp8); - if ((tmp8 & SATA_HOTPLUG) != SATA_HOTPLUG) { - dev_dbg(&pdev->dev, - "enabling SATA hotplug (0x%x)\n", - (int) tmp8); - tmp8 |= SATA_HOTPLUG; - pci_write_config_byte(pdev, SVIA_MISC_3, tmp8); + if (board_id == vt6421) { + /* enable IRQ on hotplug */ + pci_read_config_byte(pdev, SVIA_MISC_3, &tmp8); + if ((tmp8 & SATA_HOTPLUG) != SATA_HOTPLUG) { + dev_dbg(&pdev->dev, + "enabling SATA hotplug (0x%x)\n", + (int) tmp8); + tmp8 |= SATA_HOTPLUG; + pci_write_config_byte(pdev, SVIA_MISC_3, tmp8); + } } /* From dd9640717f3f6abef2b16ba3cbc6eeb203172124 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 31 Mar 2017 15:12:01 -0700 Subject: [PATCH 217/555] hugetlbfs: initialize shared policy as part of inode allocation [ Upstream commit 4742a35d9de745e867405b4311e1aac412f0ace1 ] Any time after inode allocation, destroy_inode can be called. The hugetlbfs inode contains a shared_policy structure, and mpol_free_shared_policy is unconditionally called as part of hugetlbfs_destroy_inode. Initialize the policy as part of inode allocation so that any quick (error path) calls to destroy_inode will be handed an initialized policy. syzkaller fuzzer found this bug, that resulted in the following: BUG: KASAN: user-memory-access in atomic_inc include/asm-generic/atomic-instrumented.h:87 [inline] at addr 000000131730bd7a BUG: KASAN: user-memory-access in __lock_acquire+0x21a/0x3a80 kernel/locking/lockdep.c:3239 at addr 000000131730bd7a Write of size 4 by task syz-executor6/14086 CPU: 3 PID: 14086 Comm: syz-executor6 Not tainted 4.11.0-rc3+ #364 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: atomic_inc include/asm-generic/atomic-instrumented.h:87 [inline] __lock_acquire+0x21a/0x3a80 kernel/locking/lockdep.c:3239 lock_acquire+0x1ee/0x590 kernel/locking/lockdep.c:3762 __raw_write_lock include/linux/rwlock_api_smp.h:210 [inline] _raw_write_lock+0x33/0x50 kernel/locking/spinlock.c:295 mpol_free_shared_policy+0x43/0xb0 mm/mempolicy.c:2536 hugetlbfs_destroy_inode+0xca/0x120 fs/hugetlbfs/inode.c:952 alloc_inode+0x10d/0x180 fs/inode.c:216 new_inode_pseudo+0x69/0x190 fs/inode.c:889 new_inode+0x1c/0x40 fs/inode.c:918 hugetlbfs_get_inode+0x40/0x420 fs/hugetlbfs/inode.c:734 hugetlb_file_setup+0x329/0x9f0 fs/hugetlbfs/inode.c:1282 newseg+0x422/0xd30 ipc/shm.c:575 ipcget_new ipc/util.c:285 [inline] ipcget+0x21e/0x580 ipc/util.c:639 SYSC_shmget ipc/shm.c:673 [inline] SyS_shmget+0x158/0x230 ipc/shm.c:657 entry_SYSCALL_64_fastpath+0x1f/0xc2 Analysis provided by Tetsuo Handa Link: http://lkml.kernel.org/r/1490477850-7944-1-git-send-email-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reported-by: Dmitry Vyukov Acked-by: Hillf Danton Cc: Tetsuo Handa Cc: Michal Hocko Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/hugetlbfs/inode.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 704fa0b17309..2c2f182cde03 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -695,14 +695,11 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb, inode = new_inode(sb); if (inode) { - struct hugetlbfs_inode_info *info; inode->i_ino = get_next_ino(); inode->i_mode = S_IFDIR | config->mode; inode->i_uid = config->uid; inode->i_gid = config->gid; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); - info = HUGETLBFS_I(inode); - mpol_shared_policy_init(&info->policy, NULL); inode->i_op = &hugetlbfs_dir_inode_operations; inode->i_fop = &simple_dir_operations; /* directory inodes start off with i_nlink == 2 (for "." entry) */ @@ -733,7 +730,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, inode = new_inode(sb); if (inode) { - struct hugetlbfs_inode_info *info; inode->i_ino = get_next_ino(); inode_init_owner(inode, dir, mode); lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, @@ -741,15 +737,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, inode->i_mapping->a_ops = &hugetlbfs_aops; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); inode->i_mapping->private_data = resv_map; - info = HUGETLBFS_I(inode); - /* - * The policy is initialized here even if we are creating a - * private inode because initialization simply creates an - * an empty rb tree and calls rwlock_init(), later when we - * call mpol_free_shared_policy() it will just return because - * the rb tree will still be empty. - */ - mpol_shared_policy_init(&info->policy, NULL); switch (mode & S_IFMT) { default: init_special_inode(inode, mode, dev); @@ -937,6 +924,18 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) hugetlbfs_inc_free_inodes(sbinfo); return NULL; } + + /* + * Any time after allocation, hugetlbfs_destroy_inode can be called + * for the inode. mpol_free_shared_policy is unconditionally called + * as part of hugetlbfs_destroy_inode. So, initialize policy here + * in case of a quick call to destroy. + * + * Note that the policy is initialized even if we are creating a + * private inode. This simplifies hugetlbfs_destroy_inode. + */ + mpol_shared_policy_init(&p->policy, NULL); + return &p->vfs_inode; } From 5435e4823d817d0af3381a046d60a8862d1072aa Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 31 Mar 2017 15:12:12 -0700 Subject: [PATCH 218/555] kasan: do not sanitize kexec purgatory [ Upstream commit 13a6798e4a03096b11bf402a063786a7be55d426 ] Fixes this: kexec: Undefined symbol: __asan_load8_noabort kexec-bzImage64: Loading purgatory failed Link: http://lkml.kernel.org/r/1489672155.4458.7.camel@gmx.de Signed-off-by: Mike Galbraith Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/x86/purgatory/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 555b9fa0ad43..7dbdb780264d 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -8,6 +8,7 @@ PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y)) LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined -nostdlib -z nodefaultlib targets += purgatory.ro +KASAN_SANITIZE := n KCOV_INSTRUMENT := n # Default KBUILD_CFLAGS can have -pg option set when FTRACE is enabled. That From 07b653405e3ad2a9f4ff8e654898a72069fbc27a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 31 Mar 2017 15:12:10 -0700 Subject: [PATCH 219/555] drivers/rapidio/devices/tsi721.c: make module parameter variable name unique MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 4785603bd05b0b029c647080937674d9991600f9 ] kbuild test robot reported a non-static variable name collision between a staging driver and a RapidIO driver, with a generic variable name of 'dbg_level'. Both drivers should be changed so that they don't use this generic public variable name. This patch fixes the RapidIO driver but does not change the user interface (name) for the module parameter. drivers/staging/built-in.o:(.bss+0x109d0): multiple definition of `dbg_level' drivers/rapidio/built-in.o:(.bss+0x16c): first defined here Link: http://lkml.kernel.org/r/ab527fc5-aa3c-4b07-5d48-eef5de703192@infradead.org Signed-off-by: Randy Dunlap Reported-by: kbuild test robot Cc: Greg Kroah-Hartman Cc: Matt Porter Cc: Alexandre Bounine Cc: Jérémy Lefaure Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/rapidio/devices/tsi721.c | 4 ++-- drivers/rapidio/devices/tsi721.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/rapidio/devices/tsi721.c b/drivers/rapidio/devices/tsi721.c index 9d19b9a62011..315a4be8dc1e 100644 --- a/drivers/rapidio/devices/tsi721.c +++ b/drivers/rapidio/devices/tsi721.c @@ -37,8 +37,8 @@ #include "tsi721.h" #ifdef DEBUG -u32 dbg_level; -module_param(dbg_level, uint, S_IWUSR | S_IRUGO); +u32 tsi_dbg_level; +module_param_named(dbg_level, tsi_dbg_level, uint, S_IWUSR | S_IRUGO); MODULE_PARM_DESC(dbg_level, "Debugging output level (default 0 = none)"); #endif diff --git a/drivers/rapidio/devices/tsi721.h b/drivers/rapidio/devices/tsi721.h index 5941437cbdd1..957eadc58150 100644 --- a/drivers/rapidio/devices/tsi721.h +++ b/drivers/rapidio/devices/tsi721.h @@ -40,11 +40,11 @@ enum { }; #ifdef DEBUG -extern u32 dbg_level; +extern u32 tsi_dbg_level; #define tsi_debug(level, dev, fmt, arg...) \ do { \ - if (DBG_##level & dbg_level) \ + if (DBG_##level & tsi_dbg_level) \ dev_dbg(dev, "%s: " fmt "\n", __func__, ##arg); \ } while (0) #else From 146561a3f1c8d7f9ef9b77964c846da48c62a6ac Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sat, 25 Mar 2017 08:53:12 +0800 Subject: [PATCH 220/555] netfilter: invoke synchronize_rcu after set the _hook_ to NULL [ Upstream commit 3b7dabf029478bb80507a6c4500ca94132a2bc0b ] Otherwise, another CPU may access the invalid pointer. For example: CPU0 CPU1 - rcu_read_lock(); - pfunc = _hook_; _hook_ = NULL; - mod unload - - pfunc(); // invalid, panic - rcu_read_unlock(); So we must call synchronize_rcu() to wait the rcu reader to finish. Also note, in nf_nat_snmp_basic_fini, synchronize_rcu() will be invoked by later nf_conntrack_helper_unregister, but I'm inclined to add a explicit synchronize_rcu after set the nf_nat_snmp_hook to NULL. Depend on such obscure assumptions is not a good idea. Last, in nfnetlink_cttimeout, we use kfree_rcu to free the time object, so in cttimeout_exit, invoking rcu_barrier() is not necessary at all, remove it too. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/ipv4/netfilter/nf_nat_snmp_basic.c | 1 + net/netfilter/nf_conntrack_ecache.c | 2 ++ net/netfilter/nf_conntrack_netlink.c | 1 + net/netfilter/nf_nat_core.c | 2 ++ net/netfilter/nfnetlink_cttimeout.c | 2 +- 5 files changed, 7 insertions(+), 1 deletion(-) diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index c9b52c361da2..5a8f7c360887 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -1304,6 +1304,7 @@ static int __init nf_nat_snmp_basic_init(void) static void __exit nf_nat_snmp_basic_fini(void) { RCU_INIT_POINTER(nf_nat_snmp_hook, NULL); + synchronize_rcu(); nf_conntrack_helper_unregister(&snmp_trap_helper); } diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index da9df2d56e66..22fc32143e9c 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -290,6 +290,7 @@ void nf_conntrack_unregister_notifier(struct net *net, BUG_ON(notify != new); RCU_INIT_POINTER(net->ct.nf_conntrack_event_cb, NULL); mutex_unlock(&nf_ct_ecache_mutex); + /* synchronize_rcu() is called from ctnetlink_exit. */ } EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier); @@ -326,6 +327,7 @@ void nf_ct_expect_unregister_notifier(struct net *net, BUG_ON(notify != new); RCU_INIT_POINTER(net->ct.nf_expect_event_cb, NULL); mutex_unlock(&nf_ct_ecache_mutex); + /* synchronize_rcu() is called from ctnetlink_exit. */ } EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 04111c1c3988..d5caed5bcfb1 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -3413,6 +3413,7 @@ static void __exit ctnetlink_exit(void) #ifdef CONFIG_NETFILTER_NETLINK_GLUE_CT RCU_INIT_POINTER(nfnl_ct_hook, NULL); #endif + synchronize_rcu(); } module_init(ctnetlink_init); diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index dde64c4565d2..2916f4815c9c 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -892,6 +892,8 @@ static void __exit nf_nat_cleanup(void) #ifdef CONFIG_XFRM RCU_INIT_POINTER(nf_nat_decode_session_hook, NULL); #endif + synchronize_rcu(); + for (i = 0; i < NFPROTO_NUMPROTO; i++) kfree(nf_nat_l4protos[i]); diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 139e0867e56e..47d6656c9119 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -646,8 +646,8 @@ static void __exit cttimeout_exit(void) #ifdef CONFIG_NF_CONNTRACK_TIMEOUT RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, NULL); RCU_INIT_POINTER(nf_ct_timeout_put_hook, NULL); + synchronize_rcu(); #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ - rcu_barrier(); } module_init(cttimeout_init); From 3798fd14b970560d7ebbd3e3f388340476525e4c Mon Sep 17 00:00:00 2001 From: Matt Redfearn Date: Tue, 21 Mar 2017 14:52:25 +0000 Subject: [PATCH 221/555] MIPS: IRQ Stack: Unwind IRQ stack onto task stack [ Upstream commit db8466c581cca1a08b505f1319c3ecd246f16fa8 ] When the separate IRQ stack was introduced, stack unwinding only proceeded as far as the top of the IRQ stack, leading to kernel backtraces being less useful, lacking the trace of what was interrupted. Fix this by providing a means for the kernel to unwind the IRQ stack onto the interrupted task stack. The processor state is saved to the kernel task stack on interrupt. The IRQ_STACK_START macro reserves an unsigned long at the top of the IRQ stack where the interrupted task stack pointer can be saved. After the active stack is switched to the IRQ stack, save the interrupted tasks stack pointer to the reserved location. Fix the stack unwinding code to look for the frame being the top of the IRQ stack and if so get the next frame from the saved location. The existing test does not work with the separate stack since the ra is no longer pointed at ret_from_{irq,exception}. The test to stop unwinding the stack 32 bytes from the top of a stack must be modified to allow unwinding to continue up to the location of the saved task stack pointer when on the IRQ stack. The low / high marks of the stack are set depending on whether the sp is on an irq stack or not. Signed-off-by: Matt Redfearn Cc: Paolo Bonzini Cc: Marcin Nowakowski Cc: Masanari Iida Cc: Chris Metcalf Cc: James Hogan Cc: Paul Burton Cc: Ingo Molnar Cc: Jason A. Donenfeld Cc: Andrew Morton Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/15788/ Signed-off-by: Ralf Baechle Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/mips/include/asm/irq.h | 15 +++++++++ arch/mips/kernel/asm-offsets.c | 1 + arch/mips/kernel/genex.S | 8 +++-- arch/mips/kernel/process.c | 56 +++++++++++++++++++++++----------- 4 files changed, 60 insertions(+), 20 deletions(-) diff --git a/arch/mips/include/asm/irq.h b/arch/mips/include/asm/irq.h index 956db6e201d1..ddd1c918103b 100644 --- a/arch/mips/include/asm/irq.h +++ b/arch/mips/include/asm/irq.h @@ -18,9 +18,24 @@ #include #define IRQ_STACK_SIZE THREAD_SIZE +#define IRQ_STACK_START (IRQ_STACK_SIZE - sizeof(unsigned long)) extern void *irq_stack[NR_CPUS]; +/* + * The highest address on the IRQ stack contains a dummy frame put down in + * genex.S (handle_int & except_vec_vi_handler) which is structured as follows: + * + * top ------------ + * | task sp | <- irq_stack[cpu] + IRQ_STACK_START + * ------------ + * | | <- First frame of IRQ context + * ------------ + * + * task sp holds a copy of the task stack pointer where the struct pt_regs + * from exception entry can be found. + */ + static inline bool on_irq_stack(int cpu, unsigned long sp) { unsigned long low = (unsigned long)irq_stack[cpu]; diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c index 4be2763f835d..bfff6ea45d51 100644 --- a/arch/mips/kernel/asm-offsets.c +++ b/arch/mips/kernel/asm-offsets.c @@ -103,6 +103,7 @@ void output_thread_info_defines(void) DEFINE(_THREAD_SIZE, THREAD_SIZE); DEFINE(_THREAD_MASK, THREAD_MASK); DEFINE(_IRQ_STACK_SIZE, IRQ_STACK_SIZE); + DEFINE(_IRQ_STACK_START, IRQ_STACK_START); BLANK(); } diff --git a/arch/mips/kernel/genex.S b/arch/mips/kernel/genex.S index 2ac6c2625c13..ae810da4d499 100644 --- a/arch/mips/kernel/genex.S +++ b/arch/mips/kernel/genex.S @@ -215,9 +215,11 @@ NESTED(handle_int, PT_SIZE, sp) beq t0, t1, 2f /* Switch to IRQ stack */ - li t1, _IRQ_STACK_SIZE + li t1, _IRQ_STACK_START PTR_ADD sp, t0, t1 + /* Save task's sp on IRQ stack so that unwinding can follow it */ + LONG_S s1, 0(sp) 2: jal plat_irq_dispatch @@ -325,9 +327,11 @@ NESTED(except_vec_vi_handler, 0, sp) beq t0, t1, 2f /* Switch to IRQ stack */ - li t1, _IRQ_STACK_SIZE + li t1, _IRQ_STACK_START PTR_ADD sp, t0, t1 + /* Save task's sp on IRQ stack so that unwinding can follow it */ + LONG_S s1, 0(sp) 2: jalr v0 diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index fbbf5fcc695a..1b50958a1373 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -487,31 +487,52 @@ unsigned long notrace unwind_stack_by_address(unsigned long stack_page, unsigned long pc, unsigned long *ra) { + unsigned long low, high, irq_stack_high; struct mips_frame_info info; unsigned long size, ofs; + struct pt_regs *regs; int leaf; - extern void ret_from_irq(void); - extern void ret_from_exception(void); if (!stack_page) return 0; /* - * If we reached the bottom of interrupt context, - * return saved pc in pt_regs. + * IRQ stacks start at IRQ_STACK_START + * task stacks at THREAD_SIZE - 32 */ - if (pc == (unsigned long)ret_from_irq || - pc == (unsigned long)ret_from_exception) { - struct pt_regs *regs; - if (*sp >= stack_page && - *sp + sizeof(*regs) <= stack_page + THREAD_SIZE - 32) { - regs = (struct pt_regs *)*sp; - pc = regs->cp0_epc; - if (!user_mode(regs) && __kernel_text_address(pc)) { - *sp = regs->regs[29]; - *ra = regs->regs[31]; - return pc; - } + low = stack_page; + if (!preemptible() && on_irq_stack(raw_smp_processor_id(), *sp)) { + high = stack_page + IRQ_STACK_START; + irq_stack_high = high; + } else { + high = stack_page + THREAD_SIZE - 32; + irq_stack_high = 0; + } + + /* + * If we reached the top of the interrupt stack, start unwinding + * the interrupted task stack. + */ + if (unlikely(*sp == irq_stack_high)) { + unsigned long task_sp = *(unsigned long *)*sp; + + /* + * Check that the pointer saved in the IRQ stack head points to + * something within the stack of the current task + */ + if (!object_is_on_stack((void *)task_sp)) + return 0; + + /* + * Follow pointer to tasks kernel stack frame where interrupted + * state was saved. + */ + regs = (struct pt_regs *)task_sp; + pc = regs->cp0_epc; + if (!user_mode(regs) && __kernel_text_address(pc)) { + *sp = regs->regs[29]; + *ra = regs->regs[31]; + return pc; } return 0; } @@ -532,8 +553,7 @@ unsigned long notrace unwind_stack_by_address(unsigned long stack_page, if (leaf < 0) return 0; - if (*sp < stack_page || - *sp + info.frame_size > stack_page + THREAD_SIZE - 32) + if (*sp < low || *sp + info.frame_size > high) return 0; if (leaf) From 61b203816b171130659e0168c263d195a6c41a44 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Mon, 20 Mar 2017 10:17:56 +0100 Subject: [PATCH 222/555] iommu/exynos: Block SYSMMU while invalidating FLPD cache [ Upstream commit 7d2aa6b814476a2e2794960f844344519246df72 ] Documentation specifies that SYSMMU should be in blocked state while performing TLB/FLPD cache invalidation, so add needed calls to sysmmu_block/unblock. Fixes: 66a7ed84b345d ("iommu/exynos: Apply workaround of caching fault page table entries") CC: stable@vger.kernel.org # v4.10+ Signed-off-by: Marek Szyprowski Signed-off-by: Joerg Roedel Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/iommu/exynos-iommu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c index 30808e91b775..c7820b3ea80e 100644 --- a/drivers/iommu/exynos-iommu.c +++ b/drivers/iommu/exynos-iommu.c @@ -542,7 +542,10 @@ static void sysmmu_tlb_invalidate_flpdcache(struct sysmmu_drvdata *data, spin_lock_irqsave(&data->lock, flags); if (is_sysmmu_active(data) && data->version >= MAKE_MMU_VER(3, 3)) { clk_enable(data->clk_master); - __sysmmu_tlb_invalidate_entry(data, iova, 1); + if (sysmmu_block(data)) { + __sysmmu_tlb_invalidate_entry(data, iova, 1); + sysmmu_unblock(data); + } clk_disable(data->clk_master); } spin_unlock_irqrestore(&data->lock, flags); From db6767e2fdca12b8df8e3efdf567c434c47fb7d0 Mon Sep 17 00:00:00 2001 From: Thibault Saunier Date: Wed, 1 Feb 2017 18:05:21 -0200 Subject: [PATCH 223/555] exynos-gsc: Do not swap cb/cr for semi planar formats [ Upstream commit d7f3e33df4fbdc9855fb151f4a328ec46447e3ba ] In the case of semi planar formats cb and cr are in the same plane in memory, meaning that will be set to 'cb' whatever the format is, and whatever the (packed) order of those components are. Suggested-by: Nicolas Dufresne Signed-off-by: Thibault Saunier Signed-off-by: Javier Martinez Canillas Acked-by: Sylwester Nawrocki Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/media/platform/exynos-gsc/gsc-core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/media/platform/exynos-gsc/gsc-core.c b/drivers/media/platform/exynos-gsc/gsc-core.c index 787bd16c19e5..bbb5feef8308 100644 --- a/drivers/media/platform/exynos-gsc/gsc-core.c +++ b/drivers/media/platform/exynos-gsc/gsc-core.c @@ -849,9 +849,7 @@ int gsc_prepare_addr(struct gsc_ctx *ctx, struct vb2_buffer *vb, if ((frame->fmt->pixelformat == V4L2_PIX_FMT_VYUY) || (frame->fmt->pixelformat == V4L2_PIX_FMT_YVYU) || - (frame->fmt->pixelformat == V4L2_PIX_FMT_NV61) || (frame->fmt->pixelformat == V4L2_PIX_FMT_YVU420) || - (frame->fmt->pixelformat == V4L2_PIX_FMT_NV21) || (frame->fmt->pixelformat == V4L2_PIX_FMT_YVU420M)) swap(addr->cb, addr->cr); From 67e8be27ff725764263998e6ea179a7a05517466 Mon Sep 17 00:00:00 2001 From: Matt Redfearn Date: Tue, 21 Mar 2017 14:39:19 +0000 Subject: [PATCH 224/555] MIPS: smp-cps: Fix retrieval of VPE mask on big endian CPUs [ Upstream commit fb2155e3c30dc2043b52020e26965067a3e7779c ] The vpe_mask member of struct core_boot_config is of type atomic_t, which is a 32bit type. In cps-vec.S this member was being retrieved by a PTR_L macro, which on 64bit systems is a 64bit load. On little endian systems this is OK, since the double word that is retrieved will have the required less significant word in the correct position. However, on big endian systems the less significant word of the load is retrieved from address+4, and the more significant from address+0. The destination register therefore ends up with the required word in the more significant word e.g. when starting the second VP of a big endian 64bit system, the load PTR_L ta2, COREBOOTCFG_VPEMASK(a0) ends up setting register ta2 to 0x0000000300000000 When this value is written to the CPC it is ignored, since it is invalid to write anything larger than 4 bits. This results in any VP other than VP0 in a core failing to start in 64bit big endian systems. Change the load to a 32bit load word instruction to fix the bug. Fixes: f12401d7219f ("MIPS: smp-cps: Pull boot config retrieval out of mips_cps_boot_vpes") Signed-off-by: Matt Redfearn Cc: Paul Burton Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/15787/ Signed-off-by: Ralf Baechle Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/mips/kernel/cps-vec.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/kernel/cps-vec.S b/arch/mips/kernel/cps-vec.S index 59476a607add..a00e87b0256d 100644 --- a/arch/mips/kernel/cps-vec.S +++ b/arch/mips/kernel/cps-vec.S @@ -361,7 +361,7 @@ LEAF(mips_cps_get_bootcfg) END(mips_cps_get_bootcfg) LEAF(mips_cps_boot_vpes) - PTR_L ta2, COREBOOTCFG_VPEMASK(a0) + lw ta2, COREBOOTCFG_VPEMASK(a0) PTR_L ta3, COREBOOTCFG_VPECONFIG(a0) #if defined(CONFIG_CPU_MIPSR6) From 9b6f9da9e55a41bb6ff1fbd0a3e7ebe3c33fec74 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 9 Mar 2017 13:26:07 +0200 Subject: [PATCH 225/555] nvme-rdma: handle cpu unplug when re-establishing the controller [ Upstream commit c248c64387fac5a6b31b343d9acb78f478e8619c ] If a cpu unplug event has occured, we need to take the minimum of the provided nr_io_queues and the number of online cpus, otherwise we won't be able to connect them as blk-mq mapping won't dispatch to those queues. Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/nvme/host/rdma.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 286fda4ee100..ab4f8db2a8ca 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -337,8 +337,6 @@ static int __nvme_rdma_init_request(struct nvme_rdma_ctrl *ctrl, struct ib_device *ibdev = dev->dev; int ret; - BUG_ON(queue_idx >= ctrl->queue_count); - ret = nvme_rdma_alloc_qe(ibdev, &req->sqe, sizeof(struct nvme_command), DMA_TO_DEVICE); if (ret) @@ -643,8 +641,22 @@ out_free_queues: static int nvme_rdma_init_io_queues(struct nvme_rdma_ctrl *ctrl) { + struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; + unsigned int nr_io_queues; int i, ret; + nr_io_queues = min(opts->nr_io_queues, num_online_cpus()); + ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues); + if (ret) + return ret; + + ctrl->queue_count = nr_io_queues + 1; + if (ctrl->queue_count < 2) + return 0; + + dev_info(ctrl->ctrl.device, + "creating %d I/O queues.\n", nr_io_queues); + for (i = 1; i < ctrl->queue_count; i++) { ret = nvme_rdma_init_queue(ctrl, i, ctrl->ctrl.opts->queue_size); @@ -1795,20 +1807,8 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { static int nvme_rdma_create_io_queues(struct nvme_rdma_ctrl *ctrl) { - struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; int ret; - ret = nvme_set_queue_count(&ctrl->ctrl, &opts->nr_io_queues); - if (ret) - return ret; - - ctrl->queue_count = opts->nr_io_queues + 1; - if (ctrl->queue_count < 2) - return 0; - - dev_info(ctrl->ctrl.device, - "creating %d I/O queues.\n", opts->nr_io_queues); - ret = nvme_rdma_init_io_queues(ctrl); if (ret) return ret; From cd402b889606ed9b51d76594ddd059a8be6356a1 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sun, 19 Mar 2017 22:35:59 +0800 Subject: [PATCH 226/555] netfilter: nfnl_cthelper: fix incorrect helper->expect_class_max [ Upstream commit ae5c682113f9f94cc5e76f92cf041ee624c173ee ] The helper->expect_class_max must be set to the total number of expect_policy minus 1, since we will use the statement "if (class > helper->expect_class_max)" to validate the CTA_EXPECT_CLASS attr in ctnetlink_alloc_expect. So for compatibility, set the helper->expect_class_max to the NFCTH_POLICY_SET_NUM attr's value minus 1. Also: it's invalid when the NFCTH_POLICY_SET_NUM attr's value is zero. 1. this will result "expect_policy = kzalloc(0, GFP_KERNEL);"; 2. we cannot set the helper->expect_class_max to a proper value. So if nla_get_be32(tb[NFCTH_POLICY_SET_NUM]) is zero, report -EINVAL to the userspace. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/netfilter/nfnetlink_cthelper.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index 3b79f34b5095..b1fcfa08f0b4 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -161,6 +161,7 @@ nfnl_cthelper_parse_expect_policy(struct nf_conntrack_helper *helper, int i, ret; struct nf_conntrack_expect_policy *expect_policy; struct nlattr *tb[NFCTH_POLICY_SET_MAX+1]; + unsigned int class_max; ret = nla_parse_nested(tb, NFCTH_POLICY_SET_MAX, attr, nfnl_cthelper_expect_policy_set); @@ -170,19 +171,18 @@ nfnl_cthelper_parse_expect_policy(struct nf_conntrack_helper *helper, if (!tb[NFCTH_POLICY_SET_NUM]) return -EINVAL; - helper->expect_class_max = - ntohl(nla_get_be32(tb[NFCTH_POLICY_SET_NUM])); - - if (helper->expect_class_max != 0 && - helper->expect_class_max > NF_CT_MAX_EXPECT_CLASSES) + class_max = ntohl(nla_get_be32(tb[NFCTH_POLICY_SET_NUM])); + if (class_max == 0) + return -EINVAL; + if (class_max > NF_CT_MAX_EXPECT_CLASSES) return -EOVERFLOW; expect_policy = kzalloc(sizeof(struct nf_conntrack_expect_policy) * - helper->expect_class_max, GFP_KERNEL); + class_max, GFP_KERNEL); if (expect_policy == NULL) return -ENOMEM; - for (i=0; iexpect_class_max; i++) { + for (i = 0; i < class_max; i++) { if (!tb[NFCTH_POLICY_SET+i]) goto err; @@ -191,6 +191,8 @@ nfnl_cthelper_parse_expect_policy(struct nf_conntrack_helper *helper, if (ret < 0) goto err; } + + helper->expect_class_max = class_max - 1; helper->expect_policy = expect_policy; return 0; err: @@ -377,10 +379,10 @@ nfnl_cthelper_dump_policy(struct sk_buff *skb, goto nla_put_failure; if (nla_put_be32(skb, NFCTH_POLICY_SET_NUM, - htonl(helper->expect_class_max))) + htonl(helper->expect_class_max + 1))) goto nla_put_failure; - for (i=0; iexpect_class_max; i++) { + for (i = 0; i < helper->expect_class_max + 1; i++) { nest_parms2 = nla_nest_start(skb, (NFCTH_POLICY_SET+i) | NLA_F_NESTED); if (nest_parms2 == NULL) From 1cf8f9467e8658f1cb15127f4ac80019098b9d22 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Tue, 14 Mar 2017 15:24:51 +0530 Subject: [PATCH 227/555] parisc: perf: Fix potential NULL pointer dereference [ Upstream commit 74e3f6e63da6c8e8246fba1689e040bc926b4a1a ] Fix potential NULL pointer dereference and clean up coding style errors (code indent, trailing whitespaces). Signed-off-by: Arvind Yadav Signed-off-by: Helge Deller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/parisc/kernel/perf.c | 94 ++++++++++++++++++++------------------- 1 file changed, 49 insertions(+), 45 deletions(-) diff --git a/arch/parisc/kernel/perf.c b/arch/parisc/kernel/perf.c index 518f4f5f1f43..d63d42533133 100644 --- a/arch/parisc/kernel/perf.c +++ b/arch/parisc/kernel/perf.c @@ -39,7 +39,7 @@ * the PDC INTRIGUE calls. This is done to eliminate bugs introduced * in various PDC revisions. The code is much more maintainable * and reliable this way vs having to debug on every version of PDC - * on every box. + * on every box. */ #include @@ -195,8 +195,8 @@ static int perf_config(uint32_t *image_ptr); static int perf_release(struct inode *inode, struct file *file); static int perf_open(struct inode *inode, struct file *file); static ssize_t perf_read(struct file *file, char __user *buf, size_t cnt, loff_t *ppos); -static ssize_t perf_write(struct file *file, const char __user *buf, size_t count, - loff_t *ppos); +static ssize_t perf_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos); static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg); static void perf_start_counters(void); static int perf_stop_counters(uint32_t *raddr); @@ -222,7 +222,7 @@ extern void perf_intrigue_disable_perf_counters (void); /* * configure: * - * Configure the cpu with a given data image. First turn off the counters, + * Configure the cpu with a given data image. First turn off the counters, * then download the image, then turn the counters back on. */ static int perf_config(uint32_t *image_ptr) @@ -234,7 +234,7 @@ static int perf_config(uint32_t *image_ptr) error = perf_stop_counters(raddr); if (error != 0) { printk("perf_config: perf_stop_counters = %ld\n", error); - return -EINVAL; + return -EINVAL; } printk("Preparing to write image\n"); @@ -242,7 +242,7 @@ printk("Preparing to write image\n"); error = perf_write_image((uint64_t *)image_ptr); if (error != 0) { printk("perf_config: DOWNLOAD = %ld\n", error); - return -EINVAL; + return -EINVAL; } printk("Preparing to start counters\n"); @@ -254,7 +254,7 @@ printk("Preparing to start counters\n"); } /* - * Open the device and initialize all of its memory. The device is only + * Open the device and initialize all of its memory. The device is only * opened once, but can be "queried" by multiple processes that know its * file descriptor. */ @@ -298,8 +298,8 @@ static ssize_t perf_read(struct file *file, char __user *buf, size_t cnt, loff_t * called on the processor that the download should happen * on. */ -static ssize_t perf_write(struct file *file, const char __user *buf, size_t count, - loff_t *ppos) +static ssize_t perf_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) { int err; size_t image_size; @@ -307,11 +307,11 @@ static ssize_t perf_write(struct file *file, const char __user *buf, size_t coun uint32_t interface_type; uint32_t test; - if (perf_processor_interface == ONYX_INTF) + if (perf_processor_interface == ONYX_INTF) image_size = PCXU_IMAGE_SIZE; - else if (perf_processor_interface == CUDA_INTF) + else if (perf_processor_interface == CUDA_INTF) image_size = PCXW_IMAGE_SIZE; - else + else return -EFAULT; if (!capable(CAP_SYS_ADMIN)) @@ -331,22 +331,22 @@ static ssize_t perf_write(struct file *file, const char __user *buf, size_t coun /* First check the machine type is correct for the requested image */ - if (((perf_processor_interface == CUDA_INTF) && - (interface_type != CUDA_INTF)) || - ((perf_processor_interface == ONYX_INTF) && - (interface_type != ONYX_INTF))) + if (((perf_processor_interface == CUDA_INTF) && + (interface_type != CUDA_INTF)) || + ((perf_processor_interface == ONYX_INTF) && + (interface_type != ONYX_INTF))) return -EINVAL; /* Next check to make sure the requested image is valid */ - if (((interface_type == CUDA_INTF) && + if (((interface_type == CUDA_INTF) && (test >= MAX_CUDA_IMAGES)) || - ((interface_type == ONYX_INTF) && - (test >= MAX_ONYX_IMAGES))) + ((interface_type == ONYX_INTF) && + (test >= MAX_ONYX_IMAGES))) return -EINVAL; /* Copy the image into the processor */ - if (interface_type == CUDA_INTF) + if (interface_type == CUDA_INTF) return perf_config(cuda_images[test]); else return perf_config(onyx_images[test]); @@ -360,7 +360,7 @@ static ssize_t perf_write(struct file *file, const char __user *buf, size_t coun static void perf_patch_images(void) { #if 0 /* FIXME!! */ -/* +/* * NOTE: this routine is VERY specific to the current TLB image. * If the image is changed, this routine might also need to be changed. */ @@ -368,9 +368,9 @@ static void perf_patch_images(void) extern void $i_dtlb_miss_2_0(); extern void PA2_0_iva(); - /* + /* * We can only use the lower 32-bits, the upper 32-bits should be 0 - * anyway given this is in the kernel + * anyway given this is in the kernel */ uint32_t itlb_addr = (uint32_t)&($i_itlb_miss_2_0); uint32_t dtlb_addr = (uint32_t)&($i_dtlb_miss_2_0); @@ -378,21 +378,21 @@ static void perf_patch_images(void) if (perf_processor_interface == ONYX_INTF) { /* clear last 2 bytes */ - onyx_images[TLBMISS][15] &= 0xffffff00; + onyx_images[TLBMISS][15] &= 0xffffff00; /* set 2 bytes */ onyx_images[TLBMISS][15] |= (0x000000ff&((dtlb_addr) >> 24)); onyx_images[TLBMISS][16] = (dtlb_addr << 8)&0xffffff00; onyx_images[TLBMISS][17] = itlb_addr; /* clear last 2 bytes */ - onyx_images[TLBHANDMISS][15] &= 0xffffff00; + onyx_images[TLBHANDMISS][15] &= 0xffffff00; /* set 2 bytes */ onyx_images[TLBHANDMISS][15] |= (0x000000ff&((dtlb_addr) >> 24)); onyx_images[TLBHANDMISS][16] = (dtlb_addr << 8)&0xffffff00; onyx_images[TLBHANDMISS][17] = itlb_addr; /* clear last 2 bytes */ - onyx_images[BIG_CPI][15] &= 0xffffff00; + onyx_images[BIG_CPI][15] &= 0xffffff00; /* set 2 bytes */ onyx_images[BIG_CPI][15] |= (0x000000ff&((dtlb_addr) >> 24)); onyx_images[BIG_CPI][16] = (dtlb_addr << 8)&0xffffff00; @@ -405,24 +405,24 @@ static void perf_patch_images(void) } else if (perf_processor_interface == CUDA_INTF) { /* Cuda interface */ - cuda_images[TLBMISS][16] = + cuda_images[TLBMISS][16] = (cuda_images[TLBMISS][16]&0xffff0000) | ((dtlb_addr >> 8)&0x0000ffff); - cuda_images[TLBMISS][17] = + cuda_images[TLBMISS][17] = ((dtlb_addr << 24)&0xff000000) | ((itlb_addr >> 16)&0x000000ff); cuda_images[TLBMISS][18] = (itlb_addr << 16)&0xffff0000; - cuda_images[TLBHANDMISS][16] = + cuda_images[TLBHANDMISS][16] = (cuda_images[TLBHANDMISS][16]&0xffff0000) | ((dtlb_addr >> 8)&0x0000ffff); - cuda_images[TLBHANDMISS][17] = + cuda_images[TLBHANDMISS][17] = ((dtlb_addr << 24)&0xff000000) | ((itlb_addr >> 16)&0x000000ff); cuda_images[TLBHANDMISS][18] = (itlb_addr << 16)&0xffff0000; - cuda_images[BIG_CPI][16] = + cuda_images[BIG_CPI][16] = (cuda_images[BIG_CPI][16]&0xffff0000) | ((dtlb_addr >> 8)&0x0000ffff); - cuda_images[BIG_CPI][17] = + cuda_images[BIG_CPI][17] = ((dtlb_addr << 24)&0xff000000) | ((itlb_addr >> 16)&0x000000ff); cuda_images[BIG_CPI][18] = (itlb_addr << 16)&0xffff0000; } else { @@ -434,7 +434,7 @@ static void perf_patch_images(void) /* * ioctl routine - * All routines effect the processor that they are executed on. Thus you + * All routines effect the processor that they are executed on. Thus you * must be running on the processor that you wish to change. */ @@ -460,7 +460,7 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) } /* copy out the Counters */ - if (copy_to_user((void __user *)arg, raddr, + if (copy_to_user((void __user *)arg, raddr, sizeof (raddr)) != 0) { error = -EFAULT; break; @@ -488,7 +488,7 @@ static const struct file_operations perf_fops = { .open = perf_open, .release = perf_release }; - + static struct miscdevice perf_dev = { MISC_DYNAMIC_MINOR, PA_PERF_DEV, @@ -596,7 +596,7 @@ static int perf_stop_counters(uint32_t *raddr) /* OR sticky2 (bit 1496) to counter2 bit 32 */ tmp64 |= (userbuf[23] >> 8) & 0x0000000080000000; raddr[2] = (uint32_t)tmp64; - + /* Counter3 is bits 1497 to 1528 */ tmp64 = (userbuf[23] >> 7) & 0x00000000ffffffff; /* OR sticky3 (bit 1529) to counter3 bit 32 */ @@ -618,7 +618,7 @@ static int perf_stop_counters(uint32_t *raddr) userbuf[22] = 0; userbuf[23] = 0; - /* + /* * Write back the zeroed bytes + the image given * the read was destructive. */ @@ -626,13 +626,13 @@ static int perf_stop_counters(uint32_t *raddr) } else { /* - * Read RDR-15 which contains the counters and sticky bits + * Read RDR-15 which contains the counters and sticky bits */ if (!perf_rdr_read_ubuf(15, userbuf)) { return -13; } - /* + /* * Clear out the counters */ perf_rdr_clear(15); @@ -645,7 +645,7 @@ static int perf_stop_counters(uint32_t *raddr) raddr[2] = (uint32_t)((userbuf[1] >> 32) & 0x00000000ffffffffUL); raddr[3] = (uint32_t)(userbuf[1] & 0x00000000ffffffffUL); } - + return 0; } @@ -683,7 +683,7 @@ static int perf_rdr_read_ubuf(uint32_t rdr_num, uint64_t *buffer) i = tentry->num_words; while (i--) { buffer[i] = 0; - } + } /* Check for bits an even number of 64 */ if ((xbits = width & 0x03f) != 0) { @@ -809,18 +809,22 @@ static int perf_write_image(uint64_t *memaddr) } runway = ioremap_nocache(cpu_device->hpa.start, 4096); + if (!runway) { + pr_err("perf_write_image: ioremap failed!\n"); + return -ENOMEM; + } /* Merge intrigue bits into Runway STATUS 0 */ tmp64 = __raw_readq(runway + RUNWAY_STATUS) & 0xffecfffffffffffful; - __raw_writeq(tmp64 | (*memaddr++ & 0x0013000000000000ul), + __raw_writeq(tmp64 | (*memaddr++ & 0x0013000000000000ul), runway + RUNWAY_STATUS); - + /* Write RUNWAY DEBUG registers */ for (i = 0; i < 8; i++) { __raw_writeq(*memaddr++, runway + RUNWAY_DEBUG); } - return 0; + return 0; } /* @@ -844,7 +848,7 @@ printk("perf_rdr_write\n"); perf_rdr_shift_out_U(rdr_num, buffer[i]); } else { perf_rdr_shift_out_W(rdr_num, buffer[i]); - } + } } printk("perf_rdr_write done\n"); } From 49f1b2c154cb8f412cc8d4fde29dc4baa2e79ee3 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Fri, 10 Mar 2017 10:48:13 +0800 Subject: [PATCH 228/555] nfs: make nfs4_cb_sv_ops static [ Upstream commit 05fae7bbc237bc7de0ee9c3dcf85b2572a80e3b5 ] Fixes the following sparse warning: fs/nfs/callback.c:235:21: warning: symbol 'nfs4_cb_sv_ops' was not declared. Should it be static? Signed-off-by: Jason Yan Signed-off-by: Anna Schumaker Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/nfs/callback.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 582bfee40345..af84a92cb142 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -239,12 +239,12 @@ static struct svc_serv_ops nfs41_cb_sv_ops = { .svo_module = THIS_MODULE, }; -struct svc_serv_ops *nfs4_cb_sv_ops[] = { +static struct svc_serv_ops *nfs4_cb_sv_ops[] = { [0] = &nfs40_cb_sv_ops, [1] = &nfs41_cb_sv_ops, }; #else -struct svc_serv_ops *nfs4_cb_sv_ops[] = { +static struct svc_serv_ops *nfs4_cb_sv_ops[] = { [0] = &nfs40_cb_sv_ops, [1] = NULL, }; From 27848be7eb758dec822f1ca1101fdbc8649182ac Mon Sep 17 00:00:00 2001 From: Nathan Fontenot Date: Wed, 15 Mar 2017 23:38:07 -0400 Subject: [PATCH 229/555] ibmvnic: Free tx/rx scrq pointer array when releasing sub-crqs [ Upstream commit 9501df3cd9204f5859f649182431616a31ee88a1 ] The pointer array for the tx/rx sub crqs should be free'ed when releasing the tx/rx sub crqs. Signed-off-by: Nathan Fontenot Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/ibm/ibmvnic.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 9f2184be55dc..b8778e7b1f79 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -1253,6 +1253,7 @@ static void release_sub_crqs(struct ibmvnic_adapter *adapter) release_sub_crq_queue(adapter, adapter->tx_scrq[i]); } + kfree(adapter->tx_scrq); adapter->tx_scrq = NULL; } @@ -1265,6 +1266,7 @@ static void release_sub_crqs(struct ibmvnic_adapter *adapter) release_sub_crq_queue(adapter, adapter->rx_scrq[i]); } + kfree(adapter->rx_scrq); adapter->rx_scrq = NULL; } From ffb6a7637ce0b35c7c75574a007333db988ea806 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 12 Mar 2017 18:12:56 +0100 Subject: [PATCH 230/555] cpufreq: intel_pstate: Update pid_params.sample_rate_ns in pid_param_set() [ Upstream commit 6e7408acd04d06c04981c0c0fb5a2462b16fae4f ] Fix the debugfs interface for PID tuning to actually update pid_params.sample_rate_ns on PID parameters updates, as changing pid_params.sample_rate_ms via debugfs has no effect now. Fixes: a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/cpufreq/intel_pstate.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 80fa656da5ab..a59ae8e24d3d 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -609,6 +609,7 @@ static void intel_pstate_hwp_set_online_cpus(void) static int pid_param_set(void *data, u64 val) { *(u32 *)data = val; + pid_params.sample_rate_ns = pid_params.sample_rate_ms * NSEC_PER_MSEC; intel_pstate_reset_all_pid(); return 0; } From 81080d2d83f6b633a363d06a4fa604da20a2c5e6 Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Fri, 3 Mar 2017 16:02:25 +0800 Subject: [PATCH 231/555] x86/acpi: Restore the order of CPU IDs [ Upstream commit 2b85b3d22920db7473e5fed5719e7955c0ec323e ] The following commits: f7c28833c2 ("x86/acpi: Enable acpi to register all possible cpus at boot time") and 8f54969dc8 ("x86/acpi: Introduce persistent storage for cpuid <-> apicid mapping") ... registered all the possible CPUs at boot time via ACPI tables to make the mapping of cpuid <-> apicid fixed. Both enabled and disabled CPUs could have a logical CPU ID after boot time. But, ACPI tables are unreliable. the number amd order of Local APIC entries which depends on the firmware is often inconsistent with the physical devices. Even if they are consistent, The disabled CPUs which take up some logical CPU IDs will also make the order discontinuous. Revert the part of disabled CPUs registration, keep the allocation logic of logical CPU IDs and also keep some code location changes. Signed-off-by: Dou Liyang Tested-by: Xiaolong Ye Cc: rjw@rjwysocki.net Cc: linux-acpi@vger.kernel.org Cc: guzheng1@huawei.com Cc: izumi.taku@jp.fujitsu.com Cc: lenb@kernel.org Link: http://lkml.kernel.org/r/1488528147-2279-4-git-send-email-douly.fnst@cn.fujitsu.com Signed-off-by: Thomas Gleixner Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/acpi/boot.c | 7 ++++++- arch/x86/kernel/apic/apic.c | 26 +++++++------------------- 2 files changed, 13 insertions(+), 20 deletions(-) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index d3e0d049a0c2..b89bef95f63b 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -176,10 +176,15 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) return -EINVAL; } + if (!enabled) { + ++disabled_cpus; + return -EINVAL; + } + if (boot_cpu_physical_apicid != -1U) ver = boot_cpu_apic_version; - cpu = __generic_processor_info(id, ver, enabled); + cpu = generic_processor_info(id, ver); if (cpu >= 0) early_per_cpu(x86_cpu_to_acpiid, cpu) = acpiid; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index f2234918e494..e2ead34da465 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2070,7 +2070,7 @@ static int allocate_logical_cpuid(int apicid) return nr_logical_cpuids++; } -int __generic_processor_info(int apicid, int version, bool enabled) +int generic_processor_info(int apicid, int version) { int cpu, max = nr_cpu_ids; bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid, @@ -2128,11 +2128,9 @@ int __generic_processor_info(int apicid, int version, bool enabled) if (num_processors >= nr_cpu_ids) { int thiscpu = max + disabled_cpus; - if (enabled) { - pr_warning("APIC: NR_CPUS/possible_cpus limit of %i " - "reached. Processor %d/0x%x ignored.\n", - max, thiscpu, apicid); - } + pr_warning("APIC: NR_CPUS/possible_cpus limit of %i " + "reached. Processor %d/0x%x ignored.\n", + max, thiscpu, apicid); disabled_cpus++; return -EINVAL; @@ -2184,23 +2182,13 @@ int __generic_processor_info(int apicid, int version, bool enabled) apic->x86_32_early_logical_apicid(cpu); #endif set_cpu_possible(cpu, true); - - if (enabled) { - num_processors++; - physid_set(apicid, phys_cpu_present_map); - set_cpu_present(cpu, true); - } else { - disabled_cpus++; - } + physid_set(apicid, phys_cpu_present_map); + set_cpu_present(cpu, true); + num_processors++; return cpu; } -int generic_processor_info(int apicid, int version) -{ - return __generic_processor_info(apicid, version, true); -} - int hard_smp_processor_id(void) { return read_apic_id(); From 2d59530d991833c779aa2f6a385b2b27b494240c Mon Sep 17 00:00:00 2001 From: Oleksandr Tyshchenko Date: Mon, 27 Feb 2017 14:30:25 +0200 Subject: [PATCH 232/555] iommu/io-pgtable-arm: Check for leaf entry before dereferencing it [ Upstream commit ed46e66cc1b3d684042f92dfa2ab15ee917b4cac ] Do a check for already installed leaf entry at the current level before dereferencing it in order to avoid walking the page table down with wrong pointer to the next level. Signed-off-by: Oleksandr Tyshchenko CC: Will Deacon CC: Robin Murphy Signed-off-by: Will Deacon Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/iommu/io-pgtable-arm.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index f5c90e1366ce..7c9d632f1d09 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -335,8 +335,12 @@ static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova, if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_NS) pte |= ARM_LPAE_PTE_NSTABLE; __arm_lpae_set_pte(ptep, pte, cfg); - } else { + } else if (!iopte_leaf(pte, lvl)) { cptep = iopte_deref(pte, data); + } else { + /* We require an unmap first */ + WARN_ON(!selftest_running); + return -EEXIST; } /* Rinse, repeat */ From a495f72f8a53b0abdd1ccf0e29844a1e4b1d9407 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Thu, 9 Mar 2017 16:17:06 -0800 Subject: [PATCH 233/555] mm/cgroup: avoid panic when init with low memory [ Upstream commit bfc7228b9a9647e1c353e50b40297a2929801759 ] The system may panic when initialisation is done when almost all the memory is assigned to the huge pages using the kernel command line parameter hugepage=xxxx. Panic may occur like this: Unable to handle kernel paging request for data at address 0x00000000 Faulting instruction address: 0xc000000000302b88 Oops: Kernel access of bad area, sig: 11 [#1] SMP NR_CPUS=2048 [ 0.082424] NUMA pSeries Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.9.0-15-generic #16-Ubuntu task: c00000021ed01600 task.stack: c00000010d108000 NIP: c000000000302b88 LR: c000000000270e04 CTR: c00000000016cfd0 REGS: c00000010d10b2c0 TRAP: 0300 Not tainted (4.9.0-15-generic) MSR: 8000000002009033 [ 0.082770] CR: 28424422 XER: 00000000 CFAR: c0000000003d28b8 DAR: 0000000000000000 DSISR: 40000000 SOFTE: 1 GPR00: c000000000270e04 c00000010d10b540 c00000000141a300 c00000010fff6300 GPR04: 0000000000000000 00000000026012c0 c00000010d10b630 0000000487ab0000 GPR08: 000000010ee90000 c000000001454fd8 0000000000000000 0000000000000000 GPR12: 0000000000004400 c00000000fb80000 00000000026012c0 00000000026012c0 GPR16: 00000000026012c0 0000000000000000 0000000000000000 0000000000000002 GPR20: 000000000000000c 0000000000000000 0000000000000000 00000000024200c0 GPR24: c0000000016eef48 0000000000000000 c00000010fff7d00 00000000026012c0 GPR28: 0000000000000000 c00000010fff7d00 c00000010fff6300 c00000010d10b6d0 NIP mem_cgroup_soft_limit_reclaim+0xf8/0x4f0 LR do_try_to_free_pages+0x1b4/0x450 Call Trace: do_try_to_free_pages+0x1b4/0x450 try_to_free_pages+0xf8/0x270 __alloc_pages_nodemask+0x7a8/0xff0 new_slab+0x104/0x8e0 ___slab_alloc+0x620/0x700 __slab_alloc+0x34/0x60 kmem_cache_alloc_node_trace+0xdc/0x310 mem_cgroup_init+0x158/0x1c8 do_one_initcall+0x68/0x1d0 kernel_init_freeable+0x278/0x360 kernel_init+0x24/0x170 ret_from_kernel_thread+0x5c/0x74 Instruction dump: eb81ffe0 eba1ffe8 ebc1fff0 ebe1fff8 4e800020 3d230001 e9499a42 3d220004 3929acd8 794a1f24 7d295214 eac90100 2fa90000 419eff74 3b200000 ---[ end trace 342f5208b00d01b6 ]--- This is a chicken and egg issue where the kernel try to get free memory when allocating per node data in mem_cgroup_init(), but in that path mem_cgroup_soft_limit_reclaim() is called which assumes that these data are allocated. As mem_cgroup_soft_limit_reclaim() is best effort, it should return when these data are not yet allocated. This patch also fixes potential null pointer access in mem_cgroup_remove_from_trees() and mem_cgroup_update_tree(). Link: http://lkml.kernel.org/r/1487856999-16581-2-git-send-email-ldufour@linux.vnet.ibm.com Signed-off-by: Laurent Dufour Acked-by: Michal Hocko Acked-by: Johannes Weiner Acked-by: Balbir Singh Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- mm/memcontrol.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 47559cc0cdcc..2a800c4a39bd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -462,6 +462,8 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) struct mem_cgroup_tree_per_node *mctz; mctz = soft_limit_tree_from_page(page); + if (!mctz) + return; /* * Necessary to update all ancestors when hierarchy is used. * because their event counter is not touched. @@ -499,7 +501,8 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) for_each_node(nid) { mz = mem_cgroup_nodeinfo(memcg, nid); mctz = soft_limit_tree_node(nid); - mem_cgroup_remove_exceeded(mz, mctz); + if (mctz) + mem_cgroup_remove_exceeded(mz, mctz); } } @@ -2565,7 +2568,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, * is empty. Do it lockless to prevent lock bouncing. Races * are acceptable as soft limit is best effort anyway. */ - if (RB_EMPTY_ROOT(&mctz->rb_root)) + if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root)) return 0; /* From 5c6712ab4efb6cf60e16719ab6bcaface9cc268c Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Tue, 7 Mar 2017 02:48:36 -0500 Subject: [PATCH 234/555] rds: ib: add error handle [ Upstream commit 3b12f73a5c2977153f28a224392fd4729b50d1dc ] In the function rds_ib_setup_qp, the error handle is missing. When some error occurs, it is possible that memory leak occurs. As such, error handle is added. Cc: Joe Jin Reviewed-by: Junxiao Bi Reviewed-by: Guanglei Li Signed-off-by: Zhu Yanjun Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/rds/ib_cm.c | 47 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 11 deletions(-) diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 5b2ab95afa07..169156cfd4c8 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -405,7 +405,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) ret = PTR_ERR(ic->i_send_cq); ic->i_send_cq = NULL; rdsdebug("ib_create_cq send failed: %d\n", ret); - goto out; + goto rds_ibdev_out; } cq_attr.cqe = ic->i_recv_ring.w_nr; @@ -416,19 +416,19 @@ static int rds_ib_setup_qp(struct rds_connection *conn) ret = PTR_ERR(ic->i_recv_cq); ic->i_recv_cq = NULL; rdsdebug("ib_create_cq recv failed: %d\n", ret); - goto out; + goto send_cq_out; } ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP); if (ret) { rdsdebug("ib_req_notify_cq send failed: %d\n", ret); - goto out; + goto recv_cq_out; } ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); if (ret) { rdsdebug("ib_req_notify_cq recv failed: %d\n", ret); - goto out; + goto recv_cq_out; } /* XXX negotiate max send/recv with remote? */ @@ -453,7 +453,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr); if (ret) { rdsdebug("rdma_create_qp failed: %d\n", ret); - goto out; + goto recv_cq_out; } ic->i_send_hdrs = ib_dma_alloc_coherent(dev, @@ -463,7 +463,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) if (!ic->i_send_hdrs) { ret = -ENOMEM; rdsdebug("ib_dma_alloc_coherent send failed\n"); - goto out; + goto qp_out; } ic->i_recv_hdrs = ib_dma_alloc_coherent(dev, @@ -473,7 +473,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) if (!ic->i_recv_hdrs) { ret = -ENOMEM; rdsdebug("ib_dma_alloc_coherent recv failed\n"); - goto out; + goto send_hdrs_dma_out; } ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), @@ -481,7 +481,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) if (!ic->i_ack) { ret = -ENOMEM; rdsdebug("ib_dma_alloc_coherent ack failed\n"); - goto out; + goto recv_hdrs_dma_out; } ic->i_sends = vzalloc_node(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work), @@ -489,7 +489,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) if (!ic->i_sends) { ret = -ENOMEM; rdsdebug("send allocation failed\n"); - goto out; + goto ack_dma_out; } ic->i_recvs = vzalloc_node(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work), @@ -497,7 +497,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) if (!ic->i_recvs) { ret = -ENOMEM; rdsdebug("recv allocation failed\n"); - goto out; + goto sends_out; } rds_ib_recv_init_ack(ic); @@ -505,8 +505,33 @@ static int rds_ib_setup_qp(struct rds_connection *conn) rdsdebug("conn %p pd %p cq %p %p\n", conn, ic->i_pd, ic->i_send_cq, ic->i_recv_cq); -out: + return ret; + +sends_out: + vfree(ic->i_sends); +ack_dma_out: + ib_dma_free_coherent(dev, sizeof(struct rds_header), + ic->i_ack, ic->i_ack_dma); +recv_hdrs_dma_out: + ib_dma_free_coherent(dev, ic->i_recv_ring.w_nr * + sizeof(struct rds_header), + ic->i_recv_hdrs, ic->i_recv_hdrs_dma); +send_hdrs_dma_out: + ib_dma_free_coherent(dev, ic->i_send_ring.w_nr * + sizeof(struct rds_header), + ic->i_send_hdrs, ic->i_send_hdrs_dma); +qp_out: + rdma_destroy_qp(ic->i_cm_id); +recv_cq_out: + if (!ib_destroy_cq(ic->i_recv_cq)) + ic->i_recv_cq = NULL; +send_cq_out: + if (!ib_destroy_cq(ic->i_send_cq)) + ic->i_send_cq = NULL; +rds_ibdev_out: + rds_ib_remove_conn(rds_ibdev, conn); rds_ib_dev_put(rds_ibdev); + return ret; } From 4131c889c27843199874d7f2ba3442190cce2b41 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 23 Feb 2017 12:26:41 -0800 Subject: [PATCH 235/555] md/raid10: submit bio directly to replacement disk [ Upstream commit 6d399783e9d4e9bd44931501948059d24ad96ff8 ] Commit 57c67df(md/raid10: submit IO from originating thread instead of md thread) submits bio directly for normal disks but not for replacement disks. There is no point we shouldn't do this for replacement disks. Cc: NeilBrown Signed-off-by: Shaohua Li Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/md/raid10.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 4c4aab02e311..b19b551bb34b 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1407,11 +1407,24 @@ retry_write: mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); + + cb = blk_check_plugged(raid10_unplug, mddev, + sizeof(*plug)); + if (cb) + plug = container_of(cb, struct raid10_plug_cb, + cb); + else + plug = NULL; spin_lock_irqsave(&conf->device_lock, flags); - bio_list_add(&conf->pending_bio_list, mbio); - conf->pending_count++; + if (plug) { + bio_list_add(&plug->pending, mbio); + plug->pending_cnt++; + } else { + bio_list_add(&conf->pending_bio_list, mbio); + conf->pending_count++; + } spin_unlock_irqrestore(&conf->device_lock, flags); - if (!mddev_check_plugged(mddev)) + if (!plug) md_wakeup_thread(mddev->thread); } } From 625cb13a89295b298d6e0f323cfa2882fb5c05b6 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 4 Mar 2017 19:53:47 +0100 Subject: [PATCH 236/555] netfilter: nf_tables: set pktinfo->thoff at AH header if found [ Upstream commit 568af6de058cb2b0c5b98d98ffcf37cdc6bc38a7 ] Phil Sutter reports that IPv6 AH header matching is broken. From userspace, nft generates bytecode that expects to find the AH header at NFT_PAYLOAD_TRANSPORT_HEADER both for IPv4 and IPv6. However, pktinfo->thoff is set to the inner header after the AH header in IPv6, while in IPv4 pktinfo->thoff points to the AH header indeed. This behaviour is inconsistent. This patch fixes this problem by updating ipv6_find_hdr() to get the IP6_FH_F_AUTH flag so this function stops at the AH header, so both IPv4 and IPv6 pktinfo->thoff point to the AH header. This is also inconsistent when trying to match encapsulated headers: 1) A packet that looks like IPv4 + AH + TCP dport 22 will *not* match. 2) A packet that looks like IPv6 + AH + TCP dport 22 will match. Reported-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- include/net/netfilter/nf_tables_ipv6.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/net/netfilter/nf_tables_ipv6.h b/include/net/netfilter/nf_tables_ipv6.h index d150b5066201..97983d1c05e4 100644 --- a/include/net/netfilter/nf_tables_ipv6.h +++ b/include/net/netfilter/nf_tables_ipv6.h @@ -9,12 +9,13 @@ nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt, struct sk_buff *skb, const struct nf_hook_state *state) { + unsigned int flags = IP6_FH_F_AUTH; int protohdr, thoff = 0; unsigned short frag_off; nft_set_pktinfo(pkt, skb, state); - protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, NULL); + protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags); if (protohdr < 0) { nft_set_pktinfo_proto_unspec(pkt, skb); return; @@ -32,6 +33,7 @@ __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt, const struct nf_hook_state *state) { #if IS_ENABLED(CONFIG_IPV6) + unsigned int flags = IP6_FH_F_AUTH; struct ipv6hdr *ip6h, _ip6h; unsigned int thoff = 0; unsigned short frag_off; @@ -50,7 +52,7 @@ __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt, if (pkt_len + sizeof(*ip6h) > skb->len) return -1; - protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, NULL); + protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags); if (protohdr < 0) return -1; From 943411be40e089ca72d848b222bbeea3d0d48463 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Tue, 7 Mar 2017 21:06:38 +0100 Subject: [PATCH 237/555] i2c: meson: fix wrong variable usage in meson_i2c_put_data [ Upstream commit 3b0277f198ac928f323c42e180680d2f79aa980d ] Most likely a copy & paste error. Signed-off-by: Heiner Kallweit Acked-by: Jerome Brunet Signed-off-by: Wolfram Sang Fixes: 30021e3707a7 ("i2c: add support for Amlogic Meson I2C controller") Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/i2c/busses/i2c-meson.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-meson.c b/drivers/i2c/busses/i2c-meson.c index 2aa61bbbd307..73b97c71a484 100644 --- a/drivers/i2c/busses/i2c-meson.c +++ b/drivers/i2c/busses/i2c-meson.c @@ -175,7 +175,7 @@ static void meson_i2c_put_data(struct meson_i2c *i2c, char *buf, int len) wdata1 |= *buf++ << ((i - 4) * 8); writel(wdata0, i2c->regs + REG_TOK_WDATA0); - writel(wdata0, i2c->regs + REG_TOK_WDATA1); + writel(wdata1, i2c->regs + REG_TOK_WDATA1); dev_dbg(i2c->dev, "%s: data %08x %08x len %d\n", __func__, wdata0, wdata1, len); From d86f4ea83626d4714c946ffef643a54519987934 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 6 Mar 2017 11:58:20 -0800 Subject: [PATCH 238/555] xfs: remove kmem_zalloc_greedy [ Upstream commit 08b005f1333154ae5b404ca28766e0ffb9f1c150 ] The sole remaining caller of kmem_zalloc_greedy is bulkstat, which uses it to grab 1-4 pages for staging of inobt records. The infinite loop in the greedy allocation function is causing hangs[1] in generic/269, so just get rid of the greedy allocator in favor of kmem_zalloc_large. This makes bulkstat somewhat more likely to ENOMEM if there's really no pages to spare, but eliminates a source of hangs. [1] http://lkml.kernel.org/r/20170301044634.rgidgdqqiiwsmfpj%40XZHOUW.usersys.redhat.com Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Greg Kroah-Hartman --- fs/xfs/kmem.c | 18 ------------------ fs/xfs/kmem.h | 2 -- fs/xfs/xfs_itable.c | 6 ++---- 3 files changed, 2 insertions(+), 24 deletions(-) diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 339c696bbc01..bb2beaef531a 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -24,24 +24,6 @@ #include "kmem.h" #include "xfs_message.h" -/* - * Greedy allocation. May fail and may return vmalloced memory. - */ -void * -kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize) -{ - void *ptr; - size_t kmsize = maxsize; - - while (!(ptr = vzalloc(kmsize))) { - if ((kmsize >>= 1) <= minsize) - kmsize = minsize; - } - if (ptr) - *size = kmsize; - return ptr; -} - void * kmem_alloc(size_t size, xfs_km_flags_t flags) { diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index 689f746224e7..f0fc84fcaac2 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -69,8 +69,6 @@ static inline void kmem_free(const void *ptr) } -extern void *kmem_zalloc_greedy(size_t *, size_t, size_t); - static inline void * kmem_zalloc(size_t size, xfs_km_flags_t flags) { diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index d8a77dbf4e3a..26d67ce3c18d 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -361,7 +361,6 @@ xfs_bulkstat( xfs_agino_t agino; /* inode # in allocation group */ xfs_agnumber_t agno; /* allocation group number */ xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */ - size_t irbsize; /* size of irec buffer in bytes */ xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */ int nirbuf; /* size of irbuf */ int ubcount; /* size of user's buffer */ @@ -388,11 +387,10 @@ xfs_bulkstat( *ubcountp = 0; *done = 0; - irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4); + irbuf = kmem_zalloc_large(PAGE_SIZE * 4, KM_SLEEP); if (!irbuf) return -ENOMEM; - - nirbuf = irbsize / sizeof(*irbuf); + nirbuf = (PAGE_SIZE * 4) / sizeof(*irbuf); /* * Loop over the allocation groups, starting from the last From 49c3226c06574bd57d6c4a9aec6641d28fda5b52 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Mon, 6 Mar 2017 16:54:33 +0000 Subject: [PATCH 239/555] ASoC: wm_adsp: Return an error on write to a disabled volatile control [ Upstream commit 67430a39ca7a6af28aade5acb92d43ee257c1014 ] Volatile controls should only be accessed when the firmware is active, currently however writes to these controls will succeed, but the data will be lost, if the firmware is powered down. Update this behaviour such that an error is returned the same as it is for reads. Signed-off-by: Charles Keepax Signed-off-by: Mark Brown Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- sound/soc/codecs/wm_adsp.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sound/soc/codecs/wm_adsp.c b/sound/soc/codecs/wm_adsp.c index b943dde8dbe5..3bdd81930486 100644 --- a/sound/soc/codecs/wm_adsp.c +++ b/sound/soc/codecs/wm_adsp.c @@ -789,7 +789,10 @@ static int wm_coeff_put(struct snd_kcontrol *kctl, mutex_lock(&ctl->dsp->pwr_lock); - memcpy(ctl->cache, p, ctl->len); + if (ctl->flags & WMFW_CTL_FLAG_VOLATILE) + ret = -EPERM; + else + memcpy(ctl->cache, p, ctl->len); ctl->set = 1; if (ctl->enabled && ctl->dsp->running) @@ -816,6 +819,8 @@ static int wm_coeff_tlv_put(struct snd_kcontrol *kctl, ctl->set = 1; if (ctl->enabled && ctl->dsp->running) ret = wm_coeff_write_control(ctl, ctl->cache, size); + else if (ctl->flags & WMFW_CTL_FLAG_VOLATILE) + ret = -EPERM; } mutex_unlock(&ctl->dsp->pwr_lock); From 16db9205d3f89911b4d0d92a84049b70213a8927 Mon Sep 17 00:00:00 2001 From: Gwendal Grignou Date: Fri, 3 Mar 2017 09:00:09 -0800 Subject: [PATCH 240/555] libata: transport: Remove circular dependency at free time [ Upstream commit d85fc67dd11e9a32966140677d4d6429ca540b25 ] Without this patch, failed probe would not free resources like irq. ata port tdev object currently hold a reference to the ata port object. Therefore the ata port object release function will not get called until the ata_tport_release is called. But that would never happen, releasing the last reference of ata port dev is done by scsi_host_release, which is called by ata_host_release when the ata port object is released. The ata device objects actually do not need to explicitly hold a reference to their real counterpart, given the transport objects are the children of these objects and device_add() is call for each child. We know the parent will not be deleted until we call the child's device_del(). Reported-by: Matthew Whitehead Tested-by: Matthew Whitehead Suggested-by: Tejun Heo Signed-off-by: Gwendal Grignou Signed-off-by: Tejun Heo Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/ata/libata-transport.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/ata/libata-transport.c b/drivers/ata/libata-transport.c index 7ef16c085058..20e2b7ad8925 100644 --- a/drivers/ata/libata-transport.c +++ b/drivers/ata/libata-transport.c @@ -224,7 +224,6 @@ static DECLARE_TRANSPORT_CLASS(ata_port_class, static void ata_tport_release(struct device *dev) { - put_device(dev->parent); } /** @@ -284,7 +283,7 @@ int ata_tport_add(struct device *parent, device_initialize(dev); dev->type = &ata_port_type; - dev->parent = get_device(parent); + dev->parent = parent; dev->release = ata_tport_release; dev_set_name(dev, "ata%d", ap->print_id); transport_setup_device(dev); @@ -348,7 +347,6 @@ static DECLARE_TRANSPORT_CLASS(ata_link_class, static void ata_tlink_release(struct device *dev) { - put_device(dev->parent); } /** @@ -410,7 +408,7 @@ int ata_tlink_add(struct ata_link *link) int error; device_initialize(dev); - dev->parent = get_device(&ap->tdev); + dev->parent = &ap->tdev; dev->release = ata_tlink_release; if (ata_is_host_link(link)) dev_set_name(dev, "link%d", ap->print_id); @@ -589,7 +587,6 @@ static DECLARE_TRANSPORT_CLASS(ata_dev_class, static void ata_tdev_release(struct device *dev) { - put_device(dev->parent); } /** @@ -662,7 +659,7 @@ static int ata_tdev_add(struct ata_device *ata_dev) int error; device_initialize(dev); - dev->parent = get_device(&link->tdev); + dev->parent = &link->tdev; dev->release = ata_tdev_release; if (ata_is_host_link(link)) dev_set_name(dev, "dev%d.%d", ap->print_id,ata_dev->devno); From c126bc6b94ddbc94af3382e31eeb927e7d6bc533 Mon Sep 17 00:00:00 2001 From: Jon Mason Date: Wed, 8 Feb 2017 15:37:12 -0500 Subject: [PATCH 241/555] ARM: dts: BCM5301X: Fix memory start address [ Upstream commit 88d1fa70c21d7b431386cfe70cdc514d98b0c9c4 ] Memory starts at 0x80000000, not 0. 0 "works" due to mirrior of the first 128M of RAM to that address. Anything greater than 128M will quickly find nothing there. Correcting the starting address has everything working again. Signed-off-by: Jon Mason Fixes: 7eb05f6d ("ARM: dts: bcm5301x: Add BCM SVK DT files") Signed-off-by: Florian Fainelli Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/arm/boot/dts/bcm953012k.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/bcm953012k.dts b/arch/arm/boot/dts/bcm953012k.dts index 05a985a20378..6208e85acd9d 100644 --- a/arch/arm/boot/dts/bcm953012k.dts +++ b/arch/arm/boot/dts/bcm953012k.dts @@ -48,7 +48,7 @@ }; memory { - reg = <0x00000000 0x10000000>; + reg = <0x80000000 0x10000000>; }; }; From 86c469bea4ae129433d119b7eb178d40421db045 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sat, 4 Mar 2017 15:42:48 -0500 Subject: [PATCH 242/555] tools/power turbostat: bugfix: GFXMHz column not changing [ Upstream commit 22048c5485503749754b3b5daf9d99ef89fcacdc ] turbostat displays a GFXMHz column, which comes from reading /sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz But GFXMHz was not changing, even when a manual cat /sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz showed a new value. It turns out that a rewind() on the open file is not sufficient, fflush() (or a close/open) is needed to read fresh values. Reported-by: Yaroslav Isakov Signed-off-by: Len Brown Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- tools/power/x86/turbostat/turbostat.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 3e199b508a96..9664b1ff4285 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2003,8 +2003,10 @@ int snapshot_gfx_mhz(void) if (fp == NULL) fp = fopen_or_die("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", "r"); - else + else { rewind(fp); + fflush(fp); + } retval = fscanf(fp, "%d", &gfx_cur_mhz); if (retval != 1) From a4f11d61e305a35d223c13fdbc3ca00fe9be9d13 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 14 Mar 2017 13:18:45 +0100 Subject: [PATCH 243/555] IB/qib: fix false-postive maybe-uninitialized warning commit f6aafac184a3e46e919769dd4faa8bf0dc436534 upstream. aarch64-linux-gcc-7 complains about code it doesn't fully understand: drivers/infiniband/hw/qib/qib_iba7322.c: In function 'qib_7322_txchk_change': include/asm-generic/bitops/non-atomic.h:105:35: error: 'shadow' may be used uninitialized in this function [-Werror=maybe-uninitialized] The code is right, and despite trying hard, I could not come up with a version that I liked better than just adding a fake initialization here to shut up the warning. Fixes: f931551bafe1 ("IB/qib: Add new qib driver for QLogic PCIe InfiniBand adapters") Signed-off-by: Arnd Bergmann Acked-by: Ira Weiny Signed-off-by: Doug Ledford Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/hw/qib/qib_iba7322.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c index ded27172320e..cedb447d0162 100644 --- a/drivers/infiniband/hw/qib/qib_iba7322.c +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -7080,7 +7080,7 @@ static void qib_7322_txchk_change(struct qib_devdata *dd, u32 start, unsigned long flags; while (wait) { - unsigned long shadow; + unsigned long shadow = 0; int cstart, previ = -1; /* From d8ba70c0940782262adb8955198d9ae787afb7d7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 11 May 2017 13:50:16 +0200 Subject: [PATCH 244/555] ARM: remove duplicate 'const' annotations' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 0527873b29b077fc8e656acd63e1866b429fef55 upstream. gcc-7 warns about some declarations that are more 'const' than necessary: arch/arm/mach-at91/pm.c:338:34: error: duplicate 'const' declaration specifier [-Werror=duplicate-decl-specifier] static const struct of_device_id const ramc_ids[] __initconst = { arch/arm/mach-bcm/bcm_kona_smc.c:36:34: error: duplicate 'const' declaration specifier [-Werror=duplicate-decl-specifier] static const struct of_device_id const bcm_kona_smc_ids[] __initconst = { arch/arm/mach-spear/time.c:207:34: error: duplicate 'const' declaration specifier [-Werror=duplicate-decl-specifier] static const struct of_device_id const timer_of_match[] __initconst = { arch/arm/mach-omap2/prm_common.c:714:34: error: duplicate 'const' declaration specifier [-Werror=duplicate-decl-specifier] static const struct of_device_id const omap_prcm_dt_match_table[] __initconst = { arch/arm/mach-omap2/vc.c:562:35: error: duplicate 'const' declaration specifier [-Werror=duplicate-decl-specifier] static const struct i2c_init_data const omap4_i2c_timing_data[] __initconst = { The ones in arch/arm were apparently all introduced accidentally by one commit that correctly marked a lot of variables as __initconst. Fixes: 19c233b79d1a ("ARM: appropriate __init annotation for const data") Acked-by: Alexandre Belloni Acked-by: Tony Lindgren Acked-by: Nicolas Pitre Acked-by: Florian Fainelli Acked-by: Viresh Kumar Acked-by: Krzysztof Hałasa Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- arch/arm/mach-at91/pm.c | 2 +- arch/arm/mach-bcm/bcm_kona_smc.c | 2 +- arch/arm/mach-cns3xxx/core.c | 2 +- arch/arm/mach-omap2/prm_common.c | 2 +- arch/arm/mach-omap2/vc.c | 2 +- arch/arm/mach-spear/time.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c index 31dde8b6f2ea..8ba0e2e5ad97 100644 --- a/arch/arm/mach-at91/pm.c +++ b/arch/arm/mach-at91/pm.c @@ -335,7 +335,7 @@ static void at91sam9_sdram_standby(void) at91_ramc_write(1, AT91_SDRAMC_LPR, saved_lpr1); } -static const struct of_device_id const ramc_ids[] __initconst = { +static const struct of_device_id ramc_ids[] __initconst = { { .compatible = "atmel,at91rm9200-sdramc", .data = at91rm9200_standby }, { .compatible = "atmel,at91sam9260-sdramc", .data = at91sam9_sdram_standby }, { .compatible = "atmel,at91sam9g45-ddramc", .data = at91_ddr_standby }, diff --git a/arch/arm/mach-bcm/bcm_kona_smc.c b/arch/arm/mach-bcm/bcm_kona_smc.c index cf3f8658f0e5..a55a7ecf146a 100644 --- a/arch/arm/mach-bcm/bcm_kona_smc.c +++ b/arch/arm/mach-bcm/bcm_kona_smc.c @@ -33,7 +33,7 @@ struct bcm_kona_smc_data { unsigned result; }; -static const struct of_device_id const bcm_kona_smc_ids[] __initconst = { +static const struct of_device_id bcm_kona_smc_ids[] __initconst = { {.compatible = "brcm,kona-smc"}, {.compatible = "bcm,kona-smc"}, /* deprecated name */ {}, diff --git a/arch/arm/mach-cns3xxx/core.c b/arch/arm/mach-cns3xxx/core.c index 03da3813f1ab..7d5a44a06648 100644 --- a/arch/arm/mach-cns3xxx/core.c +++ b/arch/arm/mach-cns3xxx/core.c @@ -346,7 +346,7 @@ static struct usb_ohci_pdata cns3xxx_usb_ohci_pdata = { .power_off = csn3xxx_usb_power_off, }; -static const struct of_dev_auxdata const cns3xxx_auxdata[] __initconst = { +static const struct of_dev_auxdata cns3xxx_auxdata[] __initconst = { { "intel,usb-ehci", CNS3XXX_USB_BASE, "ehci-platform", &cns3xxx_usb_ehci_pdata }, { "intel,usb-ohci", CNS3XXX_USB_OHCI_BASE, "ohci-platform", &cns3xxx_usb_ohci_pdata }, { "cavium,cns3420-ahci", CNS3XXX_SATA2_BASE, "ahci", NULL }, diff --git a/arch/arm/mach-omap2/prm_common.c b/arch/arm/mach-omap2/prm_common.c index 5b2f5138d938..f1ca9479491b 100644 --- a/arch/arm/mach-omap2/prm_common.c +++ b/arch/arm/mach-omap2/prm_common.c @@ -713,7 +713,7 @@ static struct omap_prcm_init_data scrm_data __initdata = { }; #endif -static const struct of_device_id const omap_prcm_dt_match_table[] __initconst = { +static const struct of_device_id omap_prcm_dt_match_table[] __initconst = { #ifdef CONFIG_SOC_AM33XX { .compatible = "ti,am3-prcm", .data = &am3_prm_data }, #endif diff --git a/arch/arm/mach-omap2/vc.c b/arch/arm/mach-omap2/vc.c index 2028167fff31..d76b1e5eb8ba 100644 --- a/arch/arm/mach-omap2/vc.c +++ b/arch/arm/mach-omap2/vc.c @@ -559,7 +559,7 @@ struct i2c_init_data { u8 hsscll_12; }; -static const struct i2c_init_data const omap4_i2c_timing_data[] __initconst = { +static const struct i2c_init_data omap4_i2c_timing_data[] __initconst = { { .load = 50, .loadbits = 0x3, diff --git a/arch/arm/mach-spear/time.c b/arch/arm/mach-spear/time.c index 9ccffc1d0f28..aaaa6781b9fe 100644 --- a/arch/arm/mach-spear/time.c +++ b/arch/arm/mach-spear/time.c @@ -204,7 +204,7 @@ static void __init spear_clockevent_init(int irq) setup_irq(irq, &spear_timer_irq); } -static const struct of_device_id const timer_of_match[] __initconst = { +static const struct of_device_id timer_of_match[] __initconst = { { .compatible = "st,spear-timer", }, { }, }; From 2f4835ee55058c44f2768cc1a5566e75c8851705 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 11 May 2017 13:44:38 +0200 Subject: [PATCH 245/555] ASoC: rt5514: fix gcc-7 warning commit 03ba791df98d15d07ea74075122af71e35c7611c upstream. gcc-7 warns that there is a duplicate 'const' specifier in some variables that are declared using the SOC_ENUM_SINGLE_DECL macro: sound/soc/codecs/rt5514.c:398:14: error: duplicate 'const' declaration specifier [-Werror=duplicate-decl-specifier] static const SOC_ENUM_SINGLE_DECL( sound/soc/codecs/rt5514.c:405:14: error: duplicate 'const' declaration specifier [-Werror=duplicate-decl-specifier] static const SOC_ENUM_SINGLE_DECL( This removes one to fix the warning. Fixes: 4a6180ea7399 ("ASoC: rt5514: add rt5514 codec driver") Signed-off-by: Arnd Bergmann Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- sound/soc/codecs/rt5514.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/rt5514.c b/sound/soc/codecs/rt5514.c index f24b7cfd3a89..e024800213f5 100644 --- a/sound/soc/codecs/rt5514.c +++ b/sound/soc/codecs/rt5514.c @@ -395,14 +395,14 @@ static const char * const rt5514_dmic_src[] = { "DMIC1", "DMIC2" }; -static const SOC_ENUM_SINGLE_DECL( +static SOC_ENUM_SINGLE_DECL( rt5514_stereo1_dmic_enum, RT5514_DIG_SOURCE_CTRL, RT5514_AD0_DMIC_INPUT_SEL_SFT, rt5514_dmic_src); static const struct snd_kcontrol_new rt5514_sto1_dmic_mux = SOC_DAPM_ENUM("Stereo1 DMIC Source", rt5514_stereo1_dmic_enum); -static const SOC_ENUM_SINGLE_DECL( +static SOC_ENUM_SINGLE_DECL( rt5514_stereo2_dmic_enum, RT5514_DIG_SOURCE_CTRL, RT5514_AD1_DMIC_INPUT_SEL_SFT, rt5514_dmic_src); From 617c7735db3da01b582448f12c93f21ff8904dbe Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Thu, 12 Jan 2017 14:15:03 +0100 Subject: [PATCH 246/555] ASoC: rt5659: drop double const commit eae39b5f4269260d5d8b35133ba0f4c5e2895b71 upstream. Drop the const qualifier as it is being added by SOC_ENUM_DOUBLE_DECL() already which is called by SOC_ENUM_SINGLE_DECL() as well as the double const by calls to SOC_VALUE_ENUM_SINGLE_DECL() via SOC_VALUE_ENUM_DOUBLE_DECL). Fixes: commit d3cb2de2479b ("ASoC: rt5659: add rt5659 codec driver") Signed-off-by: Nicholas Mc Guire Signed-off-by: Mark Brown Cc: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- sound/soc/codecs/rt5659.c | 86 +++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/sound/soc/codecs/rt5659.c b/sound/soc/codecs/rt5659.c index db54550aed60..635818fcda00 100644 --- a/sound/soc/codecs/rt5659.c +++ b/sound/soc/codecs/rt5659.c @@ -1150,28 +1150,28 @@ static const char * const rt5659_data_select[] = { "L/R", "R/L", "L/L", "R/R" }; -static const SOC_ENUM_SINGLE_DECL(rt5659_if1_01_adc_enum, +static SOC_ENUM_SINGLE_DECL(rt5659_if1_01_adc_enum, RT5659_TDM_CTRL_2, RT5659_DS_ADC_SLOT01_SFT, rt5659_data_select); -static const SOC_ENUM_SINGLE_DECL(rt5659_if1_23_adc_enum, +static SOC_ENUM_SINGLE_DECL(rt5659_if1_23_adc_enum, RT5659_TDM_CTRL_2, RT5659_DS_ADC_SLOT23_SFT, rt5659_data_select); -static const SOC_ENUM_SINGLE_DECL(rt5659_if1_45_adc_enum, +static SOC_ENUM_SINGLE_DECL(rt5659_if1_45_adc_enum, RT5659_TDM_CTRL_2, RT5659_DS_ADC_SLOT45_SFT, rt5659_data_select); -static const SOC_ENUM_SINGLE_DECL(rt5659_if1_67_adc_enum, +static SOC_ENUM_SINGLE_DECL(rt5659_if1_67_adc_enum, RT5659_TDM_CTRL_2, RT5659_DS_ADC_SLOT67_SFT, rt5659_data_select); -static const SOC_ENUM_SINGLE_DECL(rt5659_if2_dac_enum, +static SOC_ENUM_SINGLE_DECL(rt5659_if2_dac_enum, RT5659_DIG_INF23_DATA, RT5659_IF2_DAC_SEL_SFT, rt5659_data_select); -static const SOC_ENUM_SINGLE_DECL(rt5659_if2_adc_enum, +static SOC_ENUM_SINGLE_DECL(rt5659_if2_adc_enum, RT5659_DIG_INF23_DATA, RT5659_IF2_ADC_SEL_SFT, rt5659_data_select); -static const SOC_ENUM_SINGLE_DECL(rt5659_if3_dac_enum, +static SOC_ENUM_SINGLE_DECL(rt5659_if3_dac_enum, RT5659_DIG_INF23_DATA, RT5659_IF3_DAC_SEL_SFT, rt5659_data_select); -static const SOC_ENUM_SINGLE_DECL(rt5659_if3_adc_enum, +static SOC_ENUM_SINGLE_DECL(rt5659_if3_adc_enum, RT5659_DIG_INF23_DATA, RT5659_IF3_ADC_SEL_SFT, rt5659_data_select); static const struct snd_kcontrol_new rt5659_if1_01_adc_swap_mux = @@ -1207,31 +1207,31 @@ static unsigned int rt5659_asrc_clk_map_values[] = { 0, 1, 2, 3, 5, 6, }; -static const SOC_VALUE_ENUM_SINGLE_DECL( +static SOC_VALUE_ENUM_SINGLE_DECL( rt5659_da_sto_asrc_enum, RT5659_ASRC_2, RT5659_DA_STO_T_SFT, 0x7, rt5659_asrc_clk_src, rt5659_asrc_clk_map_values); -static const SOC_VALUE_ENUM_SINGLE_DECL( +static SOC_VALUE_ENUM_SINGLE_DECL( rt5659_da_monol_asrc_enum, RT5659_ASRC_2, RT5659_DA_MONO_L_T_SFT, 0x7, rt5659_asrc_clk_src, rt5659_asrc_clk_map_values); -static const SOC_VALUE_ENUM_SINGLE_DECL( +static SOC_VALUE_ENUM_SINGLE_DECL( rt5659_da_monor_asrc_enum, RT5659_ASRC_2, RT5659_DA_MONO_R_T_SFT, 0x7, rt5659_asrc_clk_src, rt5659_asrc_clk_map_values); -static const SOC_VALUE_ENUM_SINGLE_DECL( +static SOC_VALUE_ENUM_SINGLE_DECL( rt5659_ad_sto1_asrc_enum, RT5659_ASRC_2, RT5659_AD_STO1_T_SFT, 0x7, rt5659_asrc_clk_src, rt5659_asrc_clk_map_values); -static const SOC_VALUE_ENUM_SINGLE_DECL( +static SOC_VALUE_ENUM_SINGLE_DECL( rt5659_ad_sto2_asrc_enum, RT5659_ASRC_3, RT5659_AD_STO2_T_SFT, 0x7, rt5659_asrc_clk_src, rt5659_asrc_clk_map_values); -static const SOC_VALUE_ENUM_SINGLE_DECL( +static SOC_VALUE_ENUM_SINGLE_DECL( rt5659_ad_monol_asrc_enum, RT5659_ASRC_3, RT5659_AD_MONO_L_T_SFT, 0x7, rt5659_asrc_clk_src, rt5659_asrc_clk_map_values); -static const SOC_VALUE_ENUM_SINGLE_DECL( +static SOC_VALUE_ENUM_SINGLE_DECL( rt5659_ad_monor_asrc_enum, RT5659_ASRC_3, RT5659_AD_MONO_R_T_SFT, 0x7, rt5659_asrc_clk_src, rt5659_asrc_clk_map_values); @@ -1930,14 +1930,14 @@ static const char * const rt5659_dac2_src[] = { "IF1 DAC2", "IF2 DAC", "IF3 DAC", "Mono ADC MIX" }; -static const SOC_ENUM_SINGLE_DECL( +static SOC_ENUM_SINGLE_DECL( rt5659_dac_l2_enum, RT5659_DAC_CTRL, RT5659_DAC_L2_SEL_SFT, rt5659_dac2_src); static const struct snd_kcontrol_new rt5659_dac_l2_mux = SOC_DAPM_ENUM("DAC L2 Source", rt5659_dac_l2_enum); -static const SOC_ENUM_SINGLE_DECL( +static SOC_ENUM_SINGLE_DECL( rt5659_dac_r2_enum, RT5659_DAC_CTRL, RT5659_DAC_R2_SEL_SFT, rt5659_dac2_src); @@ -1951,7 +1951,7 @@ static const char * const rt5659_sto1_adc1_src[] = { "DAC MIX", "ADC" }; -static const SOC_ENUM_SINGLE_DECL( +static SOC_ENUM_SINGLE_DECL( rt5659_sto1_adc1_enum, RT5659_STO1_ADC_MIXER, RT5659_STO1_ADC1_SRC_SFT, rt5659_sto1_adc1_src); @@ -1964,7 +1964,7 @@ static const char * const rt5659_sto1_adc_src[] = { "ADC1", "ADC2" }; -static const SOC_ENUM_SINGLE_DECL( +static SOC_ENUM_SINGLE_DECL( rt5659_sto1_adc_enum, RT5659_STO1_ADC_MIXER, RT5659_STO1_ADC_SRC_SFT, rt5659_sto1_adc_src); @@ -1977,7 +1977,7 @@ static const char * const rt5659_sto1_adc2_src[] = { "DAC MIX", "DMIC" }; -static const SOC_ENUM_SINGLE_DECL( +static SOC_ENUM_SINGLE_DECL( rt5659_sto1_adc2_enum, RT5659_STO1_ADC_MIXER, RT5659_STO1_ADC2_SRC_SFT, rt5659_sto1_adc2_src); @@ -1990,7 +1990,7 @@ static const char * const rt5659_sto1_dmic_src[] = { "DMIC1", "DMIC2" }; -static const SOC_ENUM_SINGLE_DECL( +static SOC_ENUM_SINGLE_DECL( rt5659_sto1_dmic_enum, RT5659_STO1_ADC_MIXER, RT5659_STO1_DMIC_SRC_SFT, rt5659_sto1_dmic_src); @@ -2004,7 +2004,7 @@ static const char * const rt5659_mono_adc_l2_src[] = { "Mono DAC MIXL", "DMIC" }; -static const SOC_ENUM_SINGLE_DECL( +static SOC_ENUM_SINGLE_DECL( rt5659_mono_adc_l2_enum, RT5659_MONO_ADC_MIXER, RT5659_MONO_ADC_L2_SRC_SFT, rt5659_mono_adc_l2_src); @@ -2018,7 +2018,7 @@ static const char * const rt5659_mono_adc_l1_src[] = { "Mono DAC MIXL", "ADC" }; -static const SOC_ENUM_SINGLE_DECL( +static SOC_ENUM_SINGLE_DECL( rt5659_mono_adc_l1_enum, RT5659_MONO_ADC_MIXER, RT5659_MONO_ADC_L1_SRC_SFT, rt5659_mono_adc_l1_src); @@ -2031,14 +2031,14 @@ static const char * const rt5659_mono_adc_src[] = { "ADC1 L", "ADC1 R", "ADC2 L", "ADC2 R" }; -static const SOC_ENUM_SINGLE_DECL( +static SOC_ENUM_SINGLE_DECL( rt5659_mono_adc_l_enum, RT5659_MONO_ADC_MIXER, RT5659_MONO_ADC_L_SRC_SFT, rt5659_mono_adc_src); static const struct snd_kcontrol_new rt5659_mono_adc_l_mux = SOC_DAPM_ENUM("Mono ADC L Source", rt5659_mono_adc_l_enum); -static const SOC_ENUM_SINGLE_DECL( +static SOC_ENUM_SINGLE_DECL( rt5659_mono_adcr_enum, RT5659_MONO_ADC_MIXER, RT5659_MONO_ADC_R_SRC_SFT, rt5659_mono_adc_src); @@ -2051,7 +2051,7 @@ static const char * const rt5659_mono_dmic_l_src[] = { "DMIC1 L", "DMIC2 L" }; -static const SOC_ENUM_SINGLE_DECL( +static SOC_ENUM_SINGLE_DECL( rt5659_mono_dmic_l_enum, RT5659_MONO_ADC_MIXER, RT5659_MONO_DMIC_L_SRC_SFT, rt5659_mono_dmic_l_src); @@ -2064,7 +2064,7 @@ static const char * const rt5659_mono_adc_r2_src[] = { "Mono DAC MIXR", "DMIC" }; -static const SOC_ENUM_SINGLE_DECL( +static SOC_ENUM_SINGLE_DECL( rt5659_mono_adc_r2_enum, RT5659_MONO_ADC_MIXER, RT5659_MONO_ADC_R2_SRC_SFT, rt5659_mono_adc_r2_src); @@ -2077,7 +2077,7 @@ static const char * const rt5659_mono_adc_r1_src[] = { "Mono DAC MIXR", "ADC" }; -static const SOC_ENUM_SINGLE_DECL( +static SOC_ENUM_SINGLE_DECL( rt5659_mono_adc_r1_enum, RT5659_MONO_ADC_MIXER, RT5659_MONO_ADC_R1_SRC_SFT, rt5659_mono_adc_r1_src); @@ -2090,7 +2090,7 @@ static const char * const rt5659_mono_dmic_r_src[] = { "DMIC1 R", "DMIC2 R" }; -static const SOC_ENUM_SINGLE_DECL( +static SOC_ENUM_SINGLE_DECL( rt5659_mono_dmic_r_enum, RT5659_MONO_ADC_MIXER, RT5659_MONO_DMIC_R_SRC_SFT, rt5659_mono_dmic_r_src); @@ -2104,14 +2104,14 @@ static const char * const rt5659_dac1_src[] = { "IF1 DAC1", "IF2 DAC", "IF3 DAC" }; -static const SOC_ENUM_SINGLE_DECL( +static SOC_ENUM_SINGLE_DECL( rt5659_dac_r1_enum, RT5659_AD_DA_MIXER, RT5659_DAC1_R_SEL_SFT, rt5659_dac1_src); static const struct snd_kcontrol_new rt5659_dac_r1_mux = SOC_DAPM_ENUM("DAC R1 Source", rt5659_dac_r1_enum); -static const SOC_ENUM_SINGLE_DECL( +static SOC_ENUM_SINGLE_DECL( rt5659_dac_l1_enum, RT5659_AD_DA_MIXER, RT5659_DAC1_L_SEL_SFT, rt5659_dac1_src); @@ -2124,14 +2124,14 @@ static const char * const rt5659_dig_dac_mix_src[] = { "Stereo DAC Mixer", "Mono DAC Mixer" }; -static const SOC_ENUM_SINGLE_DECL( +static SOC_ENUM_SINGLE_DECL( rt5659_dig_dac_mixl_enum, RT5659_DIG_MIXER, RT5659_DAC_MIX_L_SFT, rt5659_dig_dac_mix_src); static const struct snd_kcontrol_new rt5659_dig_dac_mixl_mux = SOC_DAPM_ENUM("DAC Digital Mixer L Source", rt5659_dig_dac_mixl_enum); -static const SOC_ENUM_SINGLE_DECL( +static SOC_ENUM_SINGLE_DECL( rt5659_dig_dac_mixr_enum, RT5659_DIG_MIXER, RT5659_DAC_MIX_R_SFT, rt5659_dig_dac_mix_src); @@ -2144,14 +2144,14 @@ static const char * const rt5659_alg_dac1_src[] = { "DAC", "Stereo DAC Mixer" }; -static const SOC_ENUM_SINGLE_DECL( +static SOC_ENUM_SINGLE_DECL( rt5659_alg_dac_l1_enum, RT5659_A_DAC_MUX, RT5659_A_DACL1_SFT, rt5659_alg_dac1_src); static const struct snd_kcontrol_new rt5659_alg_dac_l1_mux = SOC_DAPM_ENUM("Analog DACL1 Source", rt5659_alg_dac_l1_enum); -static const SOC_ENUM_SINGLE_DECL( +static SOC_ENUM_SINGLE_DECL( rt5659_alg_dac_r1_enum, RT5659_A_DAC_MUX, RT5659_A_DACR1_SFT, rt5659_alg_dac1_src); @@ -2164,14 +2164,14 @@ static const char * const rt5659_alg_dac2_src[] = { "Stereo DAC Mixer", "Mono DAC Mixer" }; -static const SOC_ENUM_SINGLE_DECL( +static SOC_ENUM_SINGLE_DECL( rt5659_alg_dac_l2_enum, RT5659_A_DAC_MUX, RT5659_A_DACL2_SFT, rt5659_alg_dac2_src); static const struct snd_kcontrol_new rt5659_alg_dac_l2_mux = SOC_DAPM_ENUM("Analog DAC L2 Source", rt5659_alg_dac_l2_enum); -static const SOC_ENUM_SINGLE_DECL( +static SOC_ENUM_SINGLE_DECL( rt5659_alg_dac_r2_enum, RT5659_A_DAC_MUX, RT5659_A_DACR2_SFT, rt5659_alg_dac2_src); @@ -2184,7 +2184,7 @@ static const char * const rt5659_if2_adc_in_src[] = { "IF_ADC1", "IF_ADC2", "DAC_REF", "IF_ADC3" }; -static const SOC_ENUM_SINGLE_DECL( +static SOC_ENUM_SINGLE_DECL( rt5659_if2_adc_in_enum, RT5659_DIG_INF23_DATA, RT5659_IF2_ADC_IN_SFT, rt5659_if2_adc_in_src); @@ -2197,7 +2197,7 @@ static const char * const rt5659_if3_adc_in_src[] = { "IF_ADC1", "IF_ADC2", "DAC_REF", "Stereo2_ADC_L/R" }; -static const SOC_ENUM_SINGLE_DECL( +static SOC_ENUM_SINGLE_DECL( rt5659_if3_adc_in_enum, RT5659_DIG_INF23_DATA, RT5659_IF3_ADC_IN_SFT, rt5659_if3_adc_in_src); @@ -2210,14 +2210,14 @@ static const char * const rt5659_pdm_src[] = { "Mono DAC", "Stereo DAC" }; -static const SOC_ENUM_SINGLE_DECL( +static SOC_ENUM_SINGLE_DECL( rt5659_pdm_l_enum, RT5659_PDM_OUT_CTRL, RT5659_PDM1_L_SFT, rt5659_pdm_src); static const struct snd_kcontrol_new rt5659_pdm_l_mux = SOC_DAPM_ENUM("PDM L Source", rt5659_pdm_l_enum); -static const SOC_ENUM_SINGLE_DECL( +static SOC_ENUM_SINGLE_DECL( rt5659_pdm_r_enum, RT5659_PDM_OUT_CTRL, RT5659_PDM1_R_SFT, rt5659_pdm_src); @@ -2230,7 +2230,7 @@ static const char * const rt5659_spdif_src[] = { "IF1_DAC1", "IF1_DAC2", "IF2_DAC", "IF3_DAC" }; -static const SOC_ENUM_SINGLE_DECL( +static SOC_ENUM_SINGLE_DECL( rt5659_spdif_enum, RT5659_SPDIF_CTRL, RT5659_SPDIF_SEL_SFT, rt5659_spdif_src); @@ -2250,7 +2250,7 @@ static const char * const rt5659_rx_adc_data_src[] = { "NUL:AD2:DAC:AD1", "NUL:DAC:DAC:AD2", "NUL:DAC:AD2:DAC" }; -static const SOC_ENUM_SINGLE_DECL( +static SOC_ENUM_SINGLE_DECL( rt5659_rx_adc_data_enum, RT5659_TDM_CTRL_2, RT5659_ADCDAT_SRC_SFT, rt5659_rx_adc_data_src); From cf2cd9feb8e61fffc448006d0020a7149430cfcd Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Thu, 12 Jan 2017 11:48:11 +0100 Subject: [PATCH 247/555] ASoC: rt5660: remove double const commit 4281fcc02ed9f902dfa52d3635ac7f04b1a7341f upstream. Drop the const qualifier as it is being added by SOC_ENUM_DOUBLE_DECL() already which is called by SOC_ENUM_SINGLE_DECL() here. Fixes: commit 2b26dd4c1fc5 ("ASoC: rt5660: add rt5660 codec driver") Signed-off-by: Nicholas Mc Guire Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- sound/soc/codecs/rt5660.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/rt5660.c b/sound/soc/codecs/rt5660.c index 9f0933ced804..e396b7680fa1 100644 --- a/sound/soc/codecs/rt5660.c +++ b/sound/soc/codecs/rt5660.c @@ -526,10 +526,10 @@ static const char * const rt5660_data_select[] = { "L/R", "R/L", "L/L", "R/R" }; -static const SOC_ENUM_SINGLE_DECL(rt5660_if1_dac_enum, +static SOC_ENUM_SINGLE_DECL(rt5660_if1_dac_enum, RT5660_DIG_INF1_DATA, RT5660_IF1_DAC_IN_SFT, rt5660_data_select); -static const SOC_ENUM_SINGLE_DECL(rt5660_if1_adc_enum, +static SOC_ENUM_SINGLE_DECL(rt5660_if1_adc_enum, RT5660_DIG_INF1_DATA, RT5660_IF1_ADC_IN_SFT, rt5660_data_select); static const struct snd_kcontrol_new rt5660_if1_dac_swap_mux = From c637027054ae199bc93d72440019e3a445026988 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 23 Mar 2017 16:15:55 +0100 Subject: [PATCH 248/555] ALSA: au88x0: avoid theoretical uninitialized access commit 13f99ebdd602ebdafb909e15ec6ffb1e34690167 upstream. The latest gcc-7.0.1 snapshot points out that we if nr_ch is zero, we never initialize some variables: sound/pci/au88x0/au88x0_core.c: In function 'vortex_adb_allocroute': sound/pci/au88x0/au88x0_core.c:2304:68: error: 'mix[0]' may be used uninitialized in this function [-Werror=maybe-uninitialized] sound/pci/au88x0/au88x0_core.c:2305:58: error: 'src[0]' may be used uninitialized in this function [-Werror=maybe-uninitialized] I assume this can never happen in practice, but adding a check here doesn't hurt either and avoids the warning. The code has been unchanged since the start of git history. Signed-off-by: Arnd Bergmann Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/au88x0/au88x0_core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/pci/au88x0/au88x0_core.c b/sound/pci/au88x0/au88x0_core.c index e1af24f87566..c308a4f70550 100644 --- a/sound/pci/au88x0/au88x0_core.c +++ b/sound/pci/au88x0/au88x0_core.c @@ -2279,6 +2279,9 @@ vortex_adb_allocroute(vortex_t *vortex, int dma, int nr_ch, int dir, } else { int src[2], mix[2]; + if (nr_ch < 1) + return -EINVAL; + /* Get SRC and MIXER hardware resources. */ for (i = 0; i < nr_ch; i++) { if ((mix[i] = From 14b502e491a8ea9257c9cbe4b7b0800e6e9a7473 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 2 Feb 2017 12:51:28 -0200 Subject: [PATCH 249/555] ttpci: address stringop overflow warning commit 69d3973af1acd4c0989ec8218c05f12d303cd7cf upstream. gcc-7.0.1 warns about old code in ttpci: In file included from drivers/media/pci/ttpci/av7110.c:63:0: In function 'irdebi.isra.2', inlined from 'start_debi_dma' at drivers/media/pci/ttpci/av7110.c:376:3, inlined from 'gpioirq' at drivers/media/pci/ttpci/av7110.c:659:3: drivers/media/pci/ttpci/av7110_hw.h:406:3: warning: 'memcpy': specified size between 18446744071562067968 and 18446744073709551615 exceeds maximum object size 9223372036854775807 [-Wstringop-overflow=] memcpy(av7110->debi_virt, (char *) &res, count); In function 'irdebi.isra.2', inlined from 'start_debi_dma' at drivers/media/pci/ttpci/av7110.c:376:3, inlined from 'gpioirq' at drivers/media/pci/ttpci/av7110.c:668:3: drivers/media/pci/ttpci/av7110_hw.h:406:3: warning: 'memcpy': specified size between 18446744071562067968 and 18446744073709551615 exceeds maximum object size 9223372036854775807 [-Wstringop-overflow=] memcpy(av7110->debi_virt, (char *) &res, count); Apparently, 'count' can be negative here, which will then get turned into a giant size argument for memcpy. Changing the sizes to 'unsigned int' instead seems safe as we already check for maximum sizes, and it also simplifies the code a bit. Signed-off-by: Arnd Bergmann Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/pci/ttpci/av7110_hw.c | 8 ++++---- drivers/media/pci/ttpci/av7110_hw.h | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/media/pci/ttpci/av7110_hw.c b/drivers/media/pci/ttpci/av7110_hw.c index 0583d56ef5ef..41ba8489db34 100644 --- a/drivers/media/pci/ttpci/av7110_hw.c +++ b/drivers/media/pci/ttpci/av7110_hw.c @@ -56,11 +56,11 @@ by Nathan Laredo */ int av7110_debiwrite(struct av7110 *av7110, u32 config, - int addr, u32 val, int count) + int addr, u32 val, unsigned int count) { struct saa7146_dev *dev = av7110->dev; - if (count <= 0 || count > 32764) { + if (count > 32764) { printk("%s: invalid count %d\n", __func__, count); return -1; } @@ -78,12 +78,12 @@ int av7110_debiwrite(struct av7110 *av7110, u32 config, return 0; } -u32 av7110_debiread(struct av7110 *av7110, u32 config, int addr, int count) +u32 av7110_debiread(struct av7110 *av7110, u32 config, int addr, unsigned int count) { struct saa7146_dev *dev = av7110->dev; u32 result = 0; - if (count > 32764 || count <= 0) { + if (count > 32764) { printk("%s: invalid count %d\n", __func__, count); return 0; } diff --git a/drivers/media/pci/ttpci/av7110_hw.h b/drivers/media/pci/ttpci/av7110_hw.h index 1634aba5cb84..ccb148059406 100644 --- a/drivers/media/pci/ttpci/av7110_hw.h +++ b/drivers/media/pci/ttpci/av7110_hw.h @@ -377,14 +377,14 @@ extern int av7110_fw_request(struct av7110 *av7110, u16 *request_buf, /* DEBI (saa7146 data extension bus interface) access */ extern int av7110_debiwrite(struct av7110 *av7110, u32 config, - int addr, u32 val, int count); + int addr, u32 val, unsigned int count); extern u32 av7110_debiread(struct av7110 *av7110, u32 config, - int addr, int count); + int addr, unsigned int count); /* DEBI during interrupt */ /* single word writes */ -static inline void iwdebi(struct av7110 *av7110, u32 config, int addr, u32 val, int count) +static inline void iwdebi(struct av7110 *av7110, u32 config, int addr, u32 val, unsigned int count) { av7110_debiwrite(av7110, config, addr, val, count); } @@ -397,7 +397,7 @@ static inline void mwdebi(struct av7110 *av7110, u32 config, int addr, av7110_debiwrite(av7110, config, addr, 0, count); } -static inline u32 irdebi(struct av7110 *av7110, u32 config, int addr, u32 val, int count) +static inline u32 irdebi(struct av7110 *av7110, u32 config, int addr, u32 val, unsigned int count) { u32 res; @@ -408,7 +408,7 @@ static inline u32 irdebi(struct av7110 *av7110, u32 config, int addr, u32 val, i } /* DEBI outside interrupts, only for count <= 4! */ -static inline void wdebi(struct av7110 *av7110, u32 config, int addr, u32 val, int count) +static inline void wdebi(struct av7110 *av7110, u32 config, int addr, u32 val, unsigned int count) { unsigned long flags; @@ -417,7 +417,7 @@ static inline void wdebi(struct av7110 *av7110, u32 config, int addr, u32 val, i spin_unlock_irqrestore(&av7110->debilock, flags); } -static inline u32 rdebi(struct av7110 *av7110, u32 config, int addr, u32 val, int count) +static inline u32 rdebi(struct av7110 *av7110, u32 config, int addr, u32 val, unsigned int count) { unsigned long flags; u32 res; From 75903d40aaec3328e17bc6c89ea96b4fc1c602eb Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Mon, 18 Sep 2017 16:10:35 +0200 Subject: [PATCH 250/555] s390/mm: make pmdp_invalidate() do invalidation only commit 91c575b335766effa6103eba42a82aea560c365f upstream. Commit 227be799c39a ("s390/mm: uninline pmdp_xxx functions from pgtable.h") inadvertently changed the behavior of pmdp_invalidate(), so that it now clears the pmd instead of just marking it as invalid. Fix this by restoring the original behavior. A possible impact of the misbehaving pmdp_invalidate() would be the MADV_DONTNEED races (see commits ced10803 and 58ceeb6b), although we should not have any negative impact on the related dirty/young flags, since those flags are not set by the hardware on s390. Fixes: 227be799c39a ("s390/mm: uninline pmdp_xxx functions from pgtable.h") Signed-off-by: Gerald Schaefer Signed-off-by: Martin Schwidefsky Signed-off-by: Greg Kroah-Hartman --- arch/s390/include/asm/pgtable.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index d33f245af5e9..db74d398a443 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1359,7 +1359,9 @@ static inline pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, static inline void pmdp_invalidate(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { - pmdp_xchg_direct(vma->vm_mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_INVALID)); + pmd_t pmd = __pmd(pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID); + + pmdp_xchg_direct(vma->vm_mm, addr, pmdp, pmd); } #define __HAVE_ARCH_PMDP_SET_WRPROTECT From f37eb7b586f1dd24a86c50278c65322fc6787722 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 8 Oct 2017 10:26:32 +0200 Subject: [PATCH 251/555] Linux 4.9.54 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 98e3be659b21..8370937bbb22 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 9 -SUBLEVEL = 53 +SUBLEVEL = 54 EXTRAVERSION = NAME = Roaring Lionus From 34e23dee72dd19bef13a4efce094c8e72f1e009b Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Fri, 11 Aug 2017 02:11:33 +0900 Subject: [PATCH 252/555] BACKPORT: net: xfrm: support setting an output mark. On systems that use mark-based routing it may be necessary for routing lookups to use marks in order for packets to be routed correctly. An example of such a system is Android, which uses socket marks to route packets via different networks. Currently, routing lookups in tunnel mode always use a mark of zero, making routing incorrect on such systems. This patch adds a new output_mark element to the xfrm state and a corresponding XFRMA_OUTPUT_MARK netlink attribute. The output mark differs from the existing xfrm mark in two ways: 1. The xfrm mark is used to match xfrm policies and states, while the xfrm output mark is used to set the mark (and influence the routing) of the packets emitted by those states. 2. The existing mark is constrained to be a subset of the bits of the originating socket or transformed packet, but the output mark is arbitrary and depends only on the state. The use of a separate mark provides additional flexibility. For example: - A packet subject to two transforms (e.g., transport mode inside tunnel mode) can have two different output marks applied to it, one for the transport mode SA and one for the tunnel mode SA. - On a system where socket marks determine routing, the packets emitted by an IPsec tunnel can be routed based on a mark that is determined by the tunnel, not by the marks of the unencrypted packets. - Support for setting the output marks can be introduced without breaking any existing setups that employ both mark-based routing and xfrm tunnel mode. Simply changing the code to use the xfrm mark for routing output packets could xfrm mark could change behaviour in a way that breaks these setups. If the output mark is unspecified or set to zero, the mark is not set or changed. [backport of upstream 077fbac405bfc6d41419ad6c1725804ad4e9887c] Bug: 63589535 Test: https://android-review.googlesource.com/452776/ passes Tested: make allyesconfig; make -j64 Tested: https://android-review.googlesource.com/452776 Signed-off-by: Lorenzo Colitti Signed-off-by: Steffen Klassert Change-Id: I76120fba036e21780ced31ad390faf491ea81e52 --- include/net/xfrm.h | 7 +++++-- include/uapi/linux/xfrm.h | 2 ++ net/ipv4/xfrm4_policy.c | 14 +++++++++----- net/ipv6/xfrm6_policy.c | 9 ++++++--- net/xfrm/xfrm_output.c | 3 +++ net/xfrm/xfrm_policy.c | 17 +++++++++-------- net/xfrm/xfrm_user.c | 11 +++++++++++ 7 files changed, 45 insertions(+), 18 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 835c30e491c8..9b6e6a46aed9 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -155,6 +155,7 @@ struct xfrm_state { int header_len; int trailer_len; u32 extra_flags; + u32 output_mark; } props; struct xfrm_lifetime_cfg lft; @@ -284,10 +285,12 @@ struct xfrm_policy_afinfo { struct dst_entry *(*dst_lookup)(struct net *net, int tos, int oif, const xfrm_address_t *saddr, - const xfrm_address_t *daddr); + const xfrm_address_t *daddr, + u32 mark); int (*get_saddr)(struct net *net, int oif, xfrm_address_t *saddr, - xfrm_address_t *daddr); + xfrm_address_t *daddr, + u32 mark); void (*decode_session)(struct sk_buff *skb, struct flowi *fl, int reverse); diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 1fc62b239f1b..7d75e5633641 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -303,6 +303,8 @@ enum xfrm_attr_type_t { XFRMA_PROTO, /* __u8 */ XFRMA_ADDRESS_FILTER, /* struct xfrm_address_filter */ XFRMA_PAD, + XFRMA_OFFLOAD_DEV, /* struct xfrm_state_offload */ + XFRMA_OUTPUT_MARK, /* __u32 */ __XFRMA_MAX #define XFRMA_MAX (__XFRMA_MAX - 1) diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 6a7ff6957535..7f9a8dfcfe18 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -22,7 +22,8 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo; static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, int tos, int oif, const xfrm_address_t *saddr, - const xfrm_address_t *daddr) + const xfrm_address_t *daddr, + u32 mark) { struct rtable *rt; @@ -30,6 +31,7 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, fl4->daddr = daddr->a4; fl4->flowi4_tos = tos; fl4->flowi4_oif = l3mdev_master_ifindex_by_index(net, oif); + fl4->flowi4_mark = mark; if (saddr) fl4->saddr = saddr->a4; @@ -44,20 +46,22 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, int oif, const xfrm_address_t *saddr, - const xfrm_address_t *daddr) + const xfrm_address_t *daddr, + u32 mark) { struct flowi4 fl4; - return __xfrm4_dst_lookup(net, &fl4, tos, oif, saddr, daddr); + return __xfrm4_dst_lookup(net, &fl4, tos, oif, saddr, daddr, mark); } static int xfrm4_get_saddr(struct net *net, int oif, - xfrm_address_t *saddr, xfrm_address_t *daddr) + xfrm_address_t *saddr, xfrm_address_t *daddr, + u32 mark) { struct dst_entry *dst; struct flowi4 fl4; - dst = __xfrm4_dst_lookup(net, &fl4, 0, oif, NULL, daddr); + dst = __xfrm4_dst_lookup(net, &fl4, 0, oif, NULL, daddr, mark); if (IS_ERR(dst)) return -EHOSTUNREACH; diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index e0f71c01d728..4003b2806a68 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -29,7 +29,8 @@ static struct xfrm_policy_afinfo xfrm6_policy_afinfo; static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif, const xfrm_address_t *saddr, - const xfrm_address_t *daddr) + const xfrm_address_t *daddr, + u32 mark) { struct flowi6 fl6; struct dst_entry *dst; @@ -38,6 +39,7 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif, memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = l3mdev_master_ifindex_by_index(net, oif); fl6.flowi6_flags = FLOWI_FLAG_SKIP_NH_OIF; + fl6.flowi6_mark = mark; memcpy(&fl6.daddr, daddr, sizeof(fl6.daddr)); if (saddr) memcpy(&fl6.saddr, saddr, sizeof(fl6.saddr)); @@ -54,12 +56,13 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif, } static int xfrm6_get_saddr(struct net *net, int oif, - xfrm_address_t *saddr, xfrm_address_t *daddr) + xfrm_address_t *saddr, xfrm_address_t *daddr, + u32 mark) { struct dst_entry *dst; struct net_device *dev; - dst = xfrm6_dst_lookup(net, 0, oif, NULL, daddr); + dst = xfrm6_dst_lookup(net, 0, oif, NULL, daddr, mark); if (IS_ERR(dst)) return -EHOSTUNREACH; diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 637387bbaaea..d864a6d9e517 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -66,6 +66,9 @@ static int xfrm_output_one(struct sk_buff *skb, int err) goto error_nolock; } + if (x->props.output_mark) + skb->mark = x->props.output_mark; + err = x->outer_mode->output(x, skb); if (err) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEMODEERROR); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 8ce5711ea21b..77fbfbdeffc8 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -125,7 +125,7 @@ static inline struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif, const xfrm_address_t *saddr, const xfrm_address_t *daddr, - int family) + int family, u32 mark) { struct xfrm_policy_afinfo *afinfo; struct dst_entry *dst; @@ -134,7 +134,7 @@ static inline struct dst_entry *__xfrm_dst_lookup(struct net *net, if (unlikely(afinfo == NULL)) return ERR_PTR(-EAFNOSUPPORT); - dst = afinfo->dst_lookup(net, tos, oif, saddr, daddr); + dst = afinfo->dst_lookup(net, tos, oif, saddr, daddr, mark); xfrm_policy_put_afinfo(afinfo); @@ -145,7 +145,7 @@ static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, int tos, int oif, xfrm_address_t *prev_saddr, xfrm_address_t *prev_daddr, - int family) + int family, u32 mark) { struct net *net = xs_net(x); xfrm_address_t *saddr = &x->props.saddr; @@ -161,7 +161,7 @@ static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, daddr = x->coaddr; } - dst = __xfrm_dst_lookup(net, tos, oif, saddr, daddr, family); + dst = __xfrm_dst_lookup(net, tos, oif, saddr, daddr, family, mark); if (!IS_ERR(dst)) { if (prev_saddr != saddr) @@ -1427,14 +1427,14 @@ int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk) static int xfrm_get_saddr(struct net *net, int oif, xfrm_address_t *local, - xfrm_address_t *remote, unsigned short family) + xfrm_address_t *remote, unsigned short family, u32 mark) { int err; struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); if (unlikely(afinfo == NULL)) return -EINVAL; - err = afinfo->get_saddr(net, oif, local, remote); + err = afinfo->get_saddr(net, oif, local, remote, mark); xfrm_policy_put_afinfo(afinfo); return err; } @@ -1465,7 +1465,7 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl, if (xfrm_addr_any(local, tmpl->encap_family)) { error = xfrm_get_saddr(net, fl->flowi_oif, &tmp, remote, - tmpl->encap_family); + tmpl->encap_family, 0); if (error) goto fail; local = &tmp; @@ -1744,7 +1744,8 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) { family = xfrm[i]->props.family; dst = xfrm_dst_lookup(xfrm[i], tos, fl->flowi_oif, - &saddr, &daddr, family); + &saddr, &daddr, family, + xfrm[i]->props.output_mark); err = PTR_ERR(dst); if (IS_ERR(dst)) goto put_states; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index b2bba35e90b6..ed4f571428a5 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -584,6 +584,9 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, xfrm_mark_get(attrs, &x->mark); + if (attrs[XFRMA_OUTPUT_MARK]) + x->props.output_mark = nla_get_u32(attrs[XFRMA_OUTPUT_MARK]); + err = __xfrm_init_state(x, false); if (err) goto error; @@ -871,6 +874,11 @@ static int copy_to_user_state_extra(struct xfrm_state *x, goto out; if (x->security) ret = copy_sec_ctx(x->security, skb); + if (x->props.output_mark) { + ret = nla_put_u32(skb, XFRMA_OUTPUT_MARK, x->props.output_mark); + if (ret) + goto out; + } out: return ret; } @@ -2419,6 +2427,7 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { [XFRMA_SA_EXTRA_FLAGS] = { .type = NLA_U32 }, [XFRMA_PROTO] = { .type = NLA_U8 }, [XFRMA_ADDRESS_FILTER] = { .len = sizeof(struct xfrm_address_filter) }, + [XFRMA_OUTPUT_MARK] = { .len = NLA_U32 }, }; static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = { @@ -2635,6 +2644,8 @@ static inline size_t xfrm_sa_len(struct xfrm_state *x) l += nla_total_size(sizeof(*x->coaddr)); if (x->props.extra_flags) l += nla_total_size(sizeof(x->props.extra_flags)); + if (x->props.output_mark) + l += nla_total_size(sizeof(x->props.output_mark)); /* Must count x->lastused as it may become non-zero behind our back. */ l += nla_total_size_64bit(sizeof(u64)); From 9add7c47ec5557606381ada373bebaff3b57a4d5 Mon Sep 17 00:00:00 2001 From: Ganesh Mahendran Date: Wed, 27 Sep 2017 15:12:25 +0800 Subject: [PATCH 253/555] ANDROID: binder: init desired_prio.sched_policy before use it In function binder_transaction_priority(), we access desired_prio before initialzing it. This patch fix this. Change-Id: I9d14d50f9a128010476a65b52631630899a44633 Signed-off-by: Ganesh Mahendran --- drivers/android/binder.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 256a1d5bcfc6..cde60960fe65 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -1188,7 +1188,7 @@ static void binder_transaction_priority(struct task_struct *task, struct binder_priority node_prio, bool inherit_rt) { - struct binder_priority desired_prio; + struct binder_priority desired_prio = t->priority; if (t->set_priority_called) return; @@ -1200,9 +1200,6 @@ static void binder_transaction_priority(struct task_struct *task, if (!inherit_rt && is_rt_policy(desired_prio.sched_policy)) { desired_prio.prio = NICE_TO_PRIO(0); desired_prio.sched_policy = SCHED_NORMAL; - } else { - desired_prio.prio = t->priority.prio; - desired_prio.sched_policy = t->priority.sched_policy; } if (node_prio.prio < t->priority.prio || From fd5336c0d1e35dfac610a5b81484f017cdc3b250 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 21 Sep 2017 13:23:58 -0400 Subject: [PATCH 254/555] USB: gadgetfs: Fix crash caused by inadequate synchronization commit 520b72fc64debf8a86c3853b8e486aa5982188f0 upstream. The gadgetfs driver (drivers/usb/gadget/legacy/inode.c) was written before the UDC and composite frameworks were adopted; it is a legacy driver. As such, it expects that once bound to a UDC controller, it will not be unbound until it unregisters itself. However, the UDC framework does unbind function drivers while they are still registered. When this happens, it can cause the gadgetfs driver to misbehave or crash. For example, userspace can cause a crash by opening the device file and doing an ioctl call before setting up a configuration (found by Andrey Konovalov using the syzkaller fuzzer). This patch adds checks and synchronization to prevent these bad behaviors. It adds a udc_usage counter that the driver increments at times when it is using a gadget interface without holding the private spinlock. The unbind routine waits for this counter to go to 0 before returning, thereby ensuring that the UDC is no longer in use. The patch also adds a check in the dev_ioctl() routine to make sure the driver is bound to a UDC before dereferencing the gadget pointer, and it makes destroy_ep_files() synchronize with the endpoint I/O routines, to prevent the user from accessing an endpoint data structure after it has been removed. Signed-off-by: Alan Stern Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Acked-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/legacy/inode.c | 41 +++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index f959c42ecace..6ee73af35119 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -27,7 +27,7 @@ #include #include #include - +#include #include #include @@ -116,6 +116,7 @@ enum ep0_state { struct dev_data { spinlock_t lock; atomic_t count; + int udc_usage; enum ep0_state state; /* P: lock */ struct usb_gadgetfs_event event [N_EVENT]; unsigned ev_next; @@ -513,9 +514,9 @@ static void ep_aio_complete(struct usb_ep *ep, struct usb_request *req) INIT_WORK(&priv->work, ep_user_copy_worker); schedule_work(&priv->work); } - spin_unlock(&epdata->dev->lock); usb_ep_free_request(ep, req); + spin_unlock(&epdata->dev->lock); put_ep(epdata); } @@ -939,9 +940,11 @@ ep0_read (struct file *fd, char __user *buf, size_t len, loff_t *ptr) struct usb_request *req = dev->req; if ((retval = setup_req (ep, req, 0)) == 0) { + ++dev->udc_usage; spin_unlock_irq (&dev->lock); retval = usb_ep_queue (ep, req, GFP_KERNEL); spin_lock_irq (&dev->lock); + --dev->udc_usage; } dev->state = STATE_DEV_CONNECTED; @@ -1131,6 +1134,7 @@ ep0_write (struct file *fd, const char __user *buf, size_t len, loff_t *ptr) retval = setup_req (dev->gadget->ep0, dev->req, len); if (retval == 0) { dev->state = STATE_DEV_CONNECTED; + ++dev->udc_usage; spin_unlock_irq (&dev->lock); if (copy_from_user (dev->req->buf, buf, len)) retval = -EFAULT; @@ -1142,6 +1146,7 @@ ep0_write (struct file *fd, const char __user *buf, size_t len, loff_t *ptr) GFP_KERNEL); } spin_lock_irq(&dev->lock); + --dev->udc_usage; if (retval < 0) { clean_req (dev->gadget->ep0, dev->req); } else @@ -1243,9 +1248,21 @@ static long dev_ioctl (struct file *fd, unsigned code, unsigned long value) struct usb_gadget *gadget = dev->gadget; long ret = -ENOTTY; - if (gadget->ops->ioctl) + spin_lock_irq(&dev->lock); + if (dev->state == STATE_DEV_OPENED || + dev->state == STATE_DEV_UNBOUND) { + /* Not bound to a UDC */ + } else if (gadget->ops->ioctl) { + ++dev->udc_usage; + spin_unlock_irq(&dev->lock); + ret = gadget->ops->ioctl (gadget, code, value); + spin_lock_irq(&dev->lock); + --dev->udc_usage; + } + spin_unlock_irq(&dev->lock); + return ret; } @@ -1463,10 +1480,12 @@ delegate: if (value < 0) break; + ++dev->udc_usage; spin_unlock (&dev->lock); value = usb_ep_queue (gadget->ep0, dev->req, GFP_KERNEL); spin_lock (&dev->lock); + --dev->udc_usage; if (value < 0) { clean_req (gadget->ep0, dev->req); break; @@ -1490,8 +1509,12 @@ delegate: req->length = value; req->zero = value < w_length; + ++dev->udc_usage; spin_unlock (&dev->lock); value = usb_ep_queue (gadget->ep0, req, GFP_KERNEL); + spin_lock(&dev->lock); + --dev->udc_usage; + spin_unlock(&dev->lock); if (value < 0) { DBG (dev, "ep_queue --> %d\n", value); req->status = 0; @@ -1518,21 +1541,24 @@ static void destroy_ep_files (struct dev_data *dev) /* break link to FS */ ep = list_first_entry (&dev->epfiles, struct ep_data, epfiles); list_del_init (&ep->epfiles); + spin_unlock_irq (&dev->lock); + dentry = ep->dentry; ep->dentry = NULL; parent = d_inode(dentry->d_parent); /* break link to controller */ + mutex_lock(&ep->lock); if (ep->state == STATE_EP_ENABLED) (void) usb_ep_disable (ep->ep); ep->state = STATE_EP_UNBOUND; usb_ep_free_request (ep->ep, ep->req); ep->ep = NULL; + mutex_unlock(&ep->lock); + wake_up (&ep->wait); put_ep (ep); - spin_unlock_irq (&dev->lock); - /* break link to dcache */ inode_lock(parent); d_delete (dentry); @@ -1603,6 +1629,11 @@ gadgetfs_unbind (struct usb_gadget *gadget) spin_lock_irq (&dev->lock); dev->state = STATE_DEV_UNBOUND; + while (dev->udc_usage > 0) { + spin_unlock_irq(&dev->lock); + usleep_range(1000, 2000); + spin_lock_irq(&dev->lock); + } spin_unlock_irq (&dev->lock); destroy_ep_files (dev); From 7f850036134c491361e5297dde8762ac79934a83 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 21 Sep 2017 16:12:01 -0400 Subject: [PATCH 255/555] USB: gadgetfs: fix copy_to_user while holding spinlock commit 6e76c01e71551cb221c1f3deacb9dcd9a7346784 upstream. The gadgetfs driver as a long-outstanding FIXME, regarding a call of copy_to_user() made while holding a spinlock. This patch fixes the issue by dropping the spinlock and using the dev->udc_usage mechanism introduced by another recent patch to guard against status changes while the lock isn't held. Signed-off-by: Alan Stern Reported-by: Andrey Konovalov Acked-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/legacy/inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index 6ee73af35119..f69dbd4bcd18 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -986,11 +986,14 @@ ep0_read (struct file *fd, char __user *buf, size_t len, loff_t *ptr) retval = -EIO; else { len = min (len, (size_t)dev->req->actual); -// FIXME don't call this with the spinlock held ... + ++dev->udc_usage; + spin_unlock_irq(&dev->lock); if (copy_to_user (buf, dev->req->buf, len)) retval = -EFAULT; else retval = len; + spin_lock_irq(&dev->lock); + --dev->udc_usage; clean_req (dev->gadget->ep0, dev->req); /* NOTE userspace can't yet choose to stall */ } From 744f9e1da2a59881e9e5bdc5e264c47b69f56ed5 Mon Sep 17 00:00:00 2001 From: Nicolas Ferre Date: Thu, 31 Aug 2017 14:51:40 +0200 Subject: [PATCH 256/555] usb: gadget: udc: atmel: set vbus irqflags explicitly commit 6baeda120d90aa637b08f7604de104ab00ce9126 upstream. The driver triggers actions on both edges of the vbus signal. The former PIO controller was triggering IRQs on both falling and rising edges by default. Newer PIO controller don't, so it's better to set it explicitly to IRQF_TRIGGER_FALLING | IRQF_TRIGGER_RISING. Without this patch we may trigger the connection with host but only on some bouncing signal conditions and thus lose connecting events. Acked-by: Ludovic Desroches Signed-off-by: Nicolas Ferre Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/atmel_usba_udc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/usb/gadget/udc/atmel_usba_udc.c b/drivers/usb/gadget/udc/atmel_usba_udc.c index a95b3e75f750..ad8402906f77 100644 --- a/drivers/usb/gadget/udc/atmel_usba_udc.c +++ b/drivers/usb/gadget/udc/atmel_usba_udc.c @@ -28,6 +28,8 @@ #include #include "atmel_usba_udc.h" +#define USBA_VBUS_IRQFLAGS (IRQF_ONESHOT \ + | IRQF_TRIGGER_FALLING | IRQF_TRIGGER_RISING) #ifdef CONFIG_USB_GADGET_DEBUG_FS #include @@ -2172,7 +2174,7 @@ static int usba_udc_probe(struct platform_device *pdev) IRQ_NOAUTOEN); ret = devm_request_threaded_irq(&pdev->dev, gpio_to_irq(udc->vbus_pin), NULL, - usba_vbus_irq_thread, IRQF_ONESHOT, + usba_vbus_irq_thread, USBA_VBUS_IRQFLAGS, "atmel_usba_udc", udc); if (ret) { udc->vbus_pin = -ENODEV; From 25533678e58068487ffaa1b21b392cbfbc07bde5 Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Mon, 25 Sep 2017 17:01:23 +0900 Subject: [PATCH 257/555] usb: gadget: udc: renesas_usb3: fix for no-data control transfer commit 4dcf4bab4a409e81284b8202137e4a85b96b34de upstream. When bRequestType & USB_DIR_IN is false and req.length is 0 in control transfer, since it means non-data, this driver should not set the mode as control write. So, this patch fixes it. Fixes: 746bfe63bba3 ("usb: gadget: renesas_usb3: add support for Renesas USB3.0 peripheral controller") Signed-off-by: Yoshihiro Shimoda Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/renesas_usb3.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/gadget/udc/renesas_usb3.c b/drivers/usb/gadget/udc/renesas_usb3.c index d2cfefadca3c..27499ffde698 100644 --- a/drivers/usb/gadget/udc/renesas_usb3.c +++ b/drivers/usb/gadget/udc/renesas_usb3.c @@ -991,7 +991,8 @@ static void usb3_start_pipe0(struct renesas_usb3_ep *usb3_ep, usb3_set_p0_con_for_ctrl_read_data(usb3); } else { usb3_clear_bit(usb3, P0_MOD_DIR, USB3_P0_MOD); - usb3_set_p0_con_for_ctrl_write_data(usb3); + if (usb3_req->req.length) + usb3_set_p0_con_for_ctrl_write_data(usb3); } usb3_p0_xfer(usb3_ep, usb3_req); From db73b389775a6a43ae5bea5b95ade16d5557c3e2 Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Mon, 25 Sep 2017 17:01:24 +0900 Subject: [PATCH 258/555] usb: gadget: udc: renesas_usb3: fix Pn_RAMMAP.Pn_MPKT value commit 73f2f5745f18b4ccfe9484deac4e84a1378d19fd upstream. According to the datasheet of R-Car Gen3, the Pn_RAMMAP.Pn_MPKT should be set to one of 8, 16, 32, 64, 512 and 1024. Otherwise, when a gadget driver uses an interrupt endpoint, unexpected behavior happens. So, this patch fixes it. Fixes: 746bfe63bba3 ("usb: gadget: renesas_usb3: add support for Renesas USB3.0 peripheral controller") Signed-off-by: Yoshihiro Shimoda Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/renesas_usb3.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/usb/gadget/udc/renesas_usb3.c b/drivers/usb/gadget/udc/renesas_usb3.c index 27499ffde698..6e05279e1e1e 100644 --- a/drivers/usb/gadget/udc/renesas_usb3.c +++ b/drivers/usb/gadget/udc/renesas_usb3.c @@ -1569,7 +1569,16 @@ static u32 usb3_calc_ramarea(int ram_size) static u32 usb3_calc_rammap_val(struct renesas_usb3_ep *usb3_ep, const struct usb_endpoint_descriptor *desc) { - return usb3_ep->rammap_val | PN_RAMMAP_MPKT(usb_endpoint_maxp(desc)); + int i; + const u32 max_packet_array[] = {8, 16, 32, 64, 512}; + u32 mpkt = PN_RAMMAP_MPKT(1024); + + for (i = 0; i < ARRAY_SIZE(max_packet_array); i++) { + if (usb_endpoint_maxp(desc) <= max_packet_array[i]) + mpkt = PN_RAMMAP_MPKT(max_packet_array[i]); + } + + return usb3_ep->rammap_val | mpkt; } static int usb3_enable_pipe_n(struct renesas_usb3_ep *usb3_ep, From d21653d09a0b5f93cbfdcbec5701665a3d3c9feb Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Mon, 25 Sep 2017 17:01:25 +0900 Subject: [PATCH 259/555] usb: gadget: udc: renesas_usb3: Fix return value of usb3_write_pipe() commit 447b8a01b84f048d93d43bfe1fcaa4fcc56595cc upstream. This patch fixes an issue that this driver cannot go status stage in control read when the req.zero is set to 1 and the len in usb3_write_pipe() is set to 0. Otherwise, if we use g_ncm driver, usb enumeration takes long time (5 seconds or more). Fixes: 746bfe63bba3 ("usb: gadget: renesas_usb3: add support for Renesas USB3.0 peripheral controller") Signed-off-by: Yoshihiro Shimoda Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/renesas_usb3.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/udc/renesas_usb3.c b/drivers/usb/gadget/udc/renesas_usb3.c index 6e05279e1e1e..bb89e24c48b4 100644 --- a/drivers/usb/gadget/udc/renesas_usb3.c +++ b/drivers/usb/gadget/udc/renesas_usb3.c @@ -879,7 +879,7 @@ static int usb3_write_pipe(struct renesas_usb3_ep *usb3_ep, usb3_ep->ep.maxpacket); u8 *buf = usb3_req->req.buf + usb3_req->req.actual; u32 tmp = 0; - bool is_last; + bool is_last = !len ? true : false; if (usb3_wait_pipe_status(usb3_ep, PX_STA_BUFSTS) < 0) return -EBUSY; @@ -900,7 +900,8 @@ static int usb3_write_pipe(struct renesas_usb3_ep *usb3_ep, usb3_write(usb3, tmp, fifo_reg); } - is_last = usb3_is_transfer_complete(usb3_ep, usb3_req); + if (!is_last) + is_last = usb3_is_transfer_complete(usb3_ep, usb3_req); /* Send the data */ usb3_set_px_con_send(usb3_ep, len, is_last); From dd52953f6c48e19e8c6755b2b428ceace3573ef4 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 21 Sep 2017 15:59:30 -0400 Subject: [PATCH 260/555] usb-storage: unusual_devs entry to fix write-access regression for Seagate external drives commit 113f6eb6d50cfa5e2a1cdcf1678b12661fa272ab upstream. Kris Lindgren reports that without the NO_WP_DETECT flag, his Seagate external disk drive fails all write accesses. This regresssion dates back approximately to the start of the 4.x kernel releases. Signed-off-by: Alan Stern Reported-by: Kris Lindgren Signed-off-by: Greg Kroah-Hartman --- drivers/usb/storage/unusual_devs.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/usb/storage/unusual_devs.h b/drivers/usb/storage/unusual_devs.h index 9129f6cb8230..2572fd5cd2bb 100644 --- a/drivers/usb/storage/unusual_devs.h +++ b/drivers/usb/storage/unusual_devs.h @@ -1459,6 +1459,13 @@ UNUSUAL_DEV( 0x0bc2, 0x3010, 0x0000, 0x0000, USB_SC_DEVICE, USB_PR_DEVICE, NULL, US_FL_SANE_SENSE ), +/* Reported by Kris Lindgren */ +UNUSUAL_DEV( 0x0bc2, 0x3332, 0x0000, 0x9999, + "Seagate", + "External", + USB_SC_DEVICE, USB_PR_DEVICE, NULL, + US_FL_NO_WP_DETECT ), + UNUSUAL_DEV( 0x0d49, 0x7310, 0x0000, 0x9999, "Maxtor", "USB to SATA", From 760d0f10410aa41d228d4dd580d4892ca902dff0 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 21 Sep 2017 16:02:05 -0400 Subject: [PATCH 261/555] usb-storage: fix bogus hardware error messages for ATA pass-thru devices commit a4fd4a724d6c30ad671046d83be2e9be2f11d275 upstream. Ever since commit a621bac3044e ("scsi_lib: correctly retry failed zero length REQ_TYPE_FS commands"), people have been getting bogus error messages for USB disk drives using ATA pass-thru. For example: [ 1344.880193] sd 6:0:0:0: [sdb] Attached SCSI disk [ 1345.069152] sd 6:0:0:0: [sdb] tag#0 FAILED Result: hostbyte=DID_ERROR driverbyte=DRIVER_SENSE [ 1345.069159] sd 6:0:0:0: [sdb] tag#0 Sense Key : Hardware Error [current] [descriptor] [ 1345.069162] sd 6:0:0:0: [sdb] tag#0 Add. Sense: No additional sense information [ 1345.069168] sd 6:0:0:0: [sdb] tag#0 CDB: ATA command pass through(16) 85 06 20 00 00 00 00 00 00 00 00 00 00 00 e5 00 [ 1345.172252] sd 6:0:0:0: [sdb] tag#0 FAILED Result: hostbyte=DID_ERROR driverbyte=DRIVER_SENSE [ 1345.172258] sd 6:0:0:0: [sdb] tag#0 Sense Key : Hardware Error [current] [descriptor] [ 1345.172261] sd 6:0:0:0: [sdb] tag#0 Add. Sense: No additional sense information [ 1345.172266] sd 6:0:0:0: [sdb] tag#0 CDB: ATA command pass through(12)/Blank a1 06 20 da 00 00 4f c2 00 b0 00 00 These messages can be quite annoying, because programs like udisks2 provoke them every 10 minutes or so. Other programs can also have this effect, such as those in smartmontools. I don't fully understand how that commit induced the SCSI core to log these error messages, but the underlying cause for them is code added to usb-storage by commit f1a0743bc0e7 ("USB: storage: When a device returns no sense data, call it a Hardware Error"). At the time it was necessary to do this, in order to prevent an infinite retry loop with some not-so-great mass storage devices. However, the ATA pass-thru protocol uses SCSI sense data to return command status values, and some devices always report Check Condition status for ATA pass-thru commands to ensure that the host retrieves the sense data, even if the command succeeded. This violates the USB mass-storage protocol (Check Condition status is supposed to mean the command failed), but we can't help that. This patch attempts to mitigate the problem of these bogus error reports by changing usb-storage. The HARDWARE ERROR sense key will be inserted only for commands that aren't ATA pass-thru. Thanks to Ewan Milne for pointing out that this mechanism was present in usb-storage. 8 years after writing it, I had completely forgotten its existence. Signed-off-by: Alan Stern Tested-by: Kris Lindgren Ref: https://bugzilla.redhat.com/show_bug.cgi?id=1351305 CC: Ewan D. Milne Signed-off-by: Greg Kroah-Hartman --- drivers/usb/storage/transport.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/usb/storage/transport.c b/drivers/usb/storage/transport.c index 1a59f335b063..a3ccb899df60 100644 --- a/drivers/usb/storage/transport.c +++ b/drivers/usb/storage/transport.c @@ -834,13 +834,25 @@ Retry_Sense: if (result == USB_STOR_TRANSPORT_GOOD) { srb->result = SAM_STAT_GOOD; srb->sense_buffer[0] = 0x0; + } + + /* + * ATA-passthru commands use sense data to report + * the command completion status, and often devices + * return Check Condition status when nothing is + * wrong. + */ + else if (srb->cmnd[0] == ATA_16 || + srb->cmnd[0] == ATA_12) { + /* leave the data alone */ + } /* * If there was a problem, report an unspecified * hardware error to prevent the higher layers from * entering an infinite retry loop. */ - } else { + else { srb->result = DID_ERROR << 16; if ((sshdr.response_code & 0x72) == 0x72) srb->sense_buffer[1] = HARDWARE_ERROR; From 4661c9b526c3801827c9456a635671fd98c8e7ed Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Wed, 27 Sep 2017 18:47:12 +0900 Subject: [PATCH 262/555] usb: renesas_usbhs: fix the BCLR setting condition for non-DCP pipe commit 6124607acc88fffeaadf3aacfeb3cc1304c87387 upstream. This patch fixes an issue that the driver sets the BCLR bit of {C,Dn}FIFOCTR register to 1 even when it's non-DCP pipe and the FRDY bit of {C,Dn}FIFOCTR register is set to 1. Fixes: e8d548d54968 ("usb: renesas_usbhs: fifo became independent from pipe.") Signed-off-by: Yoshihiro Shimoda Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/renesas_usbhs/fifo.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/usb/renesas_usbhs/fifo.c b/drivers/usb/renesas_usbhs/fifo.c index 857e78337324..af9dea022cdf 100644 --- a/drivers/usb/renesas_usbhs/fifo.c +++ b/drivers/usb/renesas_usbhs/fifo.c @@ -285,11 +285,17 @@ static void usbhsf_fifo_clear(struct usbhs_pipe *pipe, struct usbhs_fifo *fifo) { struct usbhs_priv *priv = usbhs_pipe_to_priv(pipe); + int ret = 0; if (!usbhs_pipe_is_dcp(pipe)) - usbhsf_fifo_barrier(priv, fifo); + ret = usbhsf_fifo_barrier(priv, fifo); - usbhs_write(priv, fifo->ctr, BCLR); + /* + * if non-DCP pipe, this driver should set BCLR when + * usbhsf_fifo_barrier() returns 0. + */ + if (!ret) + usbhs_write(priv, fifo->ctr, BCLR); } static int usbhsf_fifo_rcv_len(struct usbhs_priv *priv, From eb5df140ca299b3492327ba015b1f231399a1b1f Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Wed, 27 Sep 2017 18:47:13 +0900 Subject: [PATCH 263/555] usb: renesas_usbhs: fix usbhsf_fifo_clear() for RX direction commit 0a2ce62b61f2c76d0213edf4e37aaf54a8ddf295 upstream. This patch fixes an issue that the usbhsf_fifo_clear() is possible to cause 10 msec delay if the pipe is RX direction and empty because the FRDY bit will never be set to 1 in such case. Fixes: e8d548d54968 ("usb: renesas_usbhs: fifo became independent from pipe.") Signed-off-by: Yoshihiro Shimoda Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/renesas_usbhs/fifo.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/usb/renesas_usbhs/fifo.c b/drivers/usb/renesas_usbhs/fifo.c index af9dea022cdf..8897195396b2 100644 --- a/drivers/usb/renesas_usbhs/fifo.c +++ b/drivers/usb/renesas_usbhs/fifo.c @@ -287,8 +287,17 @@ static void usbhsf_fifo_clear(struct usbhs_pipe *pipe, struct usbhs_priv *priv = usbhs_pipe_to_priv(pipe); int ret = 0; - if (!usbhs_pipe_is_dcp(pipe)) - ret = usbhsf_fifo_barrier(priv, fifo); + if (!usbhs_pipe_is_dcp(pipe)) { + /* + * This driver checks the pipe condition first to avoid -EBUSY + * from usbhsf_fifo_barrier() with about 10 msec delay in + * the interrupt handler if the pipe is RX direction and empty. + */ + if (usbhs_pipe_is_dir_in(pipe)) + ret = usbhs_pipe_is_accessible(pipe); + if (!ret) + ret = usbhsf_fifo_barrier(priv, fifo); + } /* * if non-DCP pipe, this driver should set BCLR when From 37b6d898388e78d92a13a8ab50c960d507c968d1 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 22 Sep 2017 16:18:53 +0200 Subject: [PATCH 264/555] ALSA: usb-audio: Check out-of-bounds access by corrupted buffer descriptor commit bfc81a8bc18e3c4ba0cbaa7666ff76be2f998991 upstream. When a USB-audio device receives a maliciously adjusted or corrupted buffer descriptor, the USB-audio driver may access an out-of-bounce value at its parser. This was detected by syzkaller, something like: BUG: KASAN: slab-out-of-bounds in usb_audio_probe+0x27b2/0x2ab0 Read of size 1 at addr ffff88006b83a9e8 by task kworker/0:1/24 CPU: 0 PID: 24 Comm: kworker/0:1 Not tainted 4.14.0-rc1-42251-gebb2c2437d80 #224 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Workqueue: usb_hub_wq hub_event Call Trace: __dump_stack lib/dump_stack.c:16 dump_stack+0x292/0x395 lib/dump_stack.c:52 print_address_description+0x78/0x280 mm/kasan/report.c:252 kasan_report_error mm/kasan/report.c:351 kasan_report+0x22f/0x340 mm/kasan/report.c:409 __asan_report_load1_noabort+0x19/0x20 mm/kasan/report.c:427 snd_usb_create_streams sound/usb/card.c:248 usb_audio_probe+0x27b2/0x2ab0 sound/usb/card.c:605 usb_probe_interface+0x35d/0x8e0 drivers/usb/core/driver.c:361 really_probe drivers/base/dd.c:413 driver_probe_device+0x610/0xa00 drivers/base/dd.c:557 __device_attach_driver+0x230/0x290 drivers/base/dd.c:653 bus_for_each_drv+0x161/0x210 drivers/base/bus.c:463 __device_attach+0x26e/0x3d0 drivers/base/dd.c:710 device_initial_probe+0x1f/0x30 drivers/base/dd.c:757 bus_probe_device+0x1eb/0x290 drivers/base/bus.c:523 device_add+0xd0b/0x1660 drivers/base/core.c:1835 usb_set_configuration+0x104e/0x1870 drivers/usb/core/message.c:1932 generic_probe+0x73/0xe0 drivers/usb/core/generic.c:174 usb_probe_device+0xaf/0xe0 drivers/usb/core/driver.c:266 really_probe drivers/base/dd.c:413 driver_probe_device+0x610/0xa00 drivers/base/dd.c:557 __device_attach_driver+0x230/0x290 drivers/base/dd.c:653 bus_for_each_drv+0x161/0x210 drivers/base/bus.c:463 __device_attach+0x26e/0x3d0 drivers/base/dd.c:710 device_initial_probe+0x1f/0x30 drivers/base/dd.c:757 bus_probe_device+0x1eb/0x290 drivers/base/bus.c:523 device_add+0xd0b/0x1660 drivers/base/core.c:1835 usb_new_device+0x7b8/0x1020 drivers/usb/core/hub.c:2457 hub_port_connect drivers/usb/core/hub.c:4903 hub_port_connect_change drivers/usb/core/hub.c:5009 port_event drivers/usb/core/hub.c:5115 hub_event+0x194d/0x3740 drivers/usb/core/hub.c:5195 process_one_work+0xc7f/0x1db0 kernel/workqueue.c:2119 worker_thread+0x221/0x1850 kernel/workqueue.c:2253 kthread+0x3a1/0x470 kernel/kthread.c:231 ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:431 This patch adds the checks of out-of-bounce accesses at appropriate places and bails out when it goes out of the given buffer. Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/usb/card.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/sound/usb/card.c b/sound/usb/card.c index f36cb068dad3..8906199a83e6 100644 --- a/sound/usb/card.c +++ b/sound/usb/card.c @@ -221,6 +221,7 @@ static int snd_usb_create_streams(struct snd_usb_audio *chip, int ctrlif) struct usb_interface_descriptor *altsd; void *control_header; int i, protocol; + int rest_bytes; /* find audiocontrol interface */ host_iface = &usb_ifnum_to_if(dev, ctrlif)->altsetting[0]; @@ -235,6 +236,15 @@ static int snd_usb_create_streams(struct snd_usb_audio *chip, int ctrlif) return -EINVAL; } + rest_bytes = (void *)(host_iface->extra + host_iface->extralen) - + control_header; + + /* just to be sure -- this shouldn't hit at all */ + if (rest_bytes <= 0) { + dev_err(&dev->dev, "invalid control header\n"); + return -EINVAL; + } + switch (protocol) { default: dev_warn(&dev->dev, @@ -245,11 +255,21 @@ static int snd_usb_create_streams(struct snd_usb_audio *chip, int ctrlif) case UAC_VERSION_1: { struct uac1_ac_header_descriptor *h1 = control_header; + if (rest_bytes < sizeof(*h1)) { + dev_err(&dev->dev, "too short v1 buffer descriptor\n"); + return -EINVAL; + } + if (!h1->bInCollection) { dev_info(&dev->dev, "skipping empty audio interface (v1)\n"); return -EINVAL; } + if (rest_bytes < h1->bLength) { + dev_err(&dev->dev, "invalid buffer length (v1)\n"); + return -EINVAL; + } + if (h1->bLength < sizeof(*h1) + h1->bInCollection) { dev_err(&dev->dev, "invalid UAC_HEADER (v1)\n"); return -EINVAL; From 0b104f92ed217a2e47a832928f23dd807eeebcd6 Mon Sep 17 00:00:00 2001 From: Jim Dickerson Date: Mon, 18 Sep 2017 17:39:14 +0300 Subject: [PATCH 265/555] usb: pci-quirks.c: Corrected timeout values used in handshake commit 114ec3a6f9096d211a4aff4277793ba969a62c73 upstream. Servers were emitting failed handoff messages but were not waiting the full 1 second as designated in section 4.22.1 of the eXtensible Host Controller Interface specifications. The handshake was using wrong units so calls were made with milliseconds not microseconds. Comments referenced 5 seconds not 1 second as in specs. The wrong units were also corrected in a second handshake call. Signed-off-by: Jim Dickerson Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/pci-quirks.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/usb/host/pci-quirks.c b/drivers/usb/host/pci-quirks.c index 58b9685eb21f..31347a6ac146 100644 --- a/drivers/usb/host/pci-quirks.c +++ b/drivers/usb/host/pci-quirks.c @@ -1022,7 +1022,7 @@ EXPORT_SYMBOL_GPL(usb_disable_xhci_ports); * * Takes care of the handoff between the Pre-OS (i.e. BIOS) and the OS. * It signals to the BIOS that the OS wants control of the host controller, - * and then waits 5 seconds for the BIOS to hand over control. + * and then waits 1 second for the BIOS to hand over control. * If we timeout, assume the BIOS is broken and take control anyway. */ static void quirk_usb_handoff_xhci(struct pci_dev *pdev) @@ -1069,9 +1069,9 @@ static void quirk_usb_handoff_xhci(struct pci_dev *pdev) if (val & XHCI_HC_BIOS_OWNED) { writel(val | XHCI_HC_OS_OWNED, base + ext_cap_offset); - /* Wait for 5 seconds with 10 microsecond polling interval */ + /* Wait for 1 second with 10 microsecond polling interval */ timeout = handshake(base + ext_cap_offset, XHCI_HC_BIOS_OWNED, - 0, 5000, 10); + 0, 1000000, 10); /* Assume a buggy BIOS and take HC ownership anyway */ if (timeout) { @@ -1100,7 +1100,7 @@ hc_init: * operational or runtime registers. Wait 5 seconds and no more. */ timeout = handshake(op_reg_base + XHCI_STS_OFFSET, XHCI_STS_CNR, 0, - 5000, 10); + 5000000, 10); /* Assume a buggy HC and start HC initialization anyway */ if (timeout) { val = readl(op_reg_base + XHCI_STS_OFFSET); From 12071de6c37d04803531638adf32d270410b7cb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Fri, 22 Sep 2017 22:18:18 +0200 Subject: [PATCH 266/555] USB: cdc-wdm: ignore -EPIPE from GetEncapsulatedResponse MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 8fec9355a968ad240f3a2e9ad55b823cf1cc52ff upstream. The driver will forward errors to userspace after turning most of them into -EIO. But all status codes are not equal. The -EPIPE (stall) in particular can be seen more as a result of normal USB signaling than an actual error. The state is automatically cleared by the USB core without intervention from either driver or userspace. And most devices and firmwares will never trigger a stall as a result of GetEncapsulatedResponse. This is in fact a requirement for CDC WDM devices. Quoting from section 7.1 of the CDC WMC spec revision 1.1: The function shall not return STALL in response to GetEncapsulatedResponse. But this driver is also handling GetEncapsulatedResponse on behalf of the qmi_wwan and cdc_mbim drivers. Unfortunately the relevant specs are not as clear wrt stall. So some QMI and MBIM devices *will* occasionally stall, causing the GetEncapsulatedResponse to return an -EPIPE status. Translating this into -EIO for userspace has proven to be harmful. Treating it as an empty read is safer, making the driver behave as if the device was conforming to the CDC WDM spec. There have been numerous reports of issues related to -EPIPE errors from some newer CDC MBIM devices in particular, like for example the Fibocom L831-EAU. Testing on this device has shown that the issues go away if we simply ignore the -EPIPE status. Similar handling of -EPIPE is already known from e.g. usb_get_string() The -EPIPE log message is still kept to let us track devices with this unexpected behaviour, hoping that it attracts attention from firmware developers. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=100938 Reported-and-tested-by: Christian Ehrig Reported-and-tested-by: Patrick Chilton Reported-and-tested-by: Andreas Böhler Signed-off-by: Bjørn Mork Acked-by: Oliver Neukum Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/cdc-wdm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c index 0b845e550fbd..9f001659807a 100644 --- a/drivers/usb/class/cdc-wdm.c +++ b/drivers/usb/class/cdc-wdm.c @@ -194,8 +194,10 @@ static void wdm_in_callback(struct urb *urb) /* * only set a new error if there is no previous error. * Errors are only cleared during read/open + * Avoid propagating -EPIPE (stall) to userspace since it is + * better handled as an empty read */ - if (desc->rerr == 0) + if (desc->rerr == 0 && status != -EPIPE) desc->rerr = status; if (length + desc->length > desc->wMaxCommand) { From 5effe995310e8c291769661600ec1488a35b66e2 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Tue, 26 Sep 2017 15:15:22 -0400 Subject: [PATCH 267/555] USB: dummy-hcd: fix connection failures (wrong speed) commit fe659bcc9b173bcfdd958ce2aec75e47651e74e1 upstream. The dummy-hcd UDC driver is not careful about the way it handles connection speeds. It ignores the module parameter that is supposed to govern the maximum connection speed and it doesn't set the HCD flags properly for the case where it ends up running at full speed. The result is that in many cases, gadget enumeration over dummy-hcd fails because the bMaxPacketSize byte in the device descriptor is set incorrectly. For example, the default settings call for a high-speed connection, but the maxpacket value for ep0 ends up being set for a Super-Speed connection. This patch fixes the problem by initializing the gadget's max_speed and the HCD flags correctly. Signed-off-by: Alan Stern Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/dummy_hcd.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/usb/gadget/udc/dummy_hcd.c b/drivers/usb/gadget/udc/dummy_hcd.c index 94c8a9f6cbf1..64fc2707c559 100644 --- a/drivers/usb/gadget/udc/dummy_hcd.c +++ b/drivers/usb/gadget/udc/dummy_hcd.c @@ -1030,7 +1030,12 @@ static int dummy_udc_probe(struct platform_device *pdev) memzero_explicit(&dum->gadget, sizeof(struct usb_gadget)); dum->gadget.name = gadget_name; dum->gadget.ops = &dummy_ops; - dum->gadget.max_speed = USB_SPEED_SUPER; + if (mod_data.is_super_speed) + dum->gadget.max_speed = USB_SPEED_SUPER; + else if (mod_data.is_high_speed) + dum->gadget.max_speed = USB_SPEED_HIGH; + else + dum->gadget.max_speed = USB_SPEED_FULL; dum->gadget.dev.parent = &pdev->dev; init_dummy_udc_hw(dum); @@ -2559,8 +2564,6 @@ static struct hc_driver dummy_hcd = { .product_desc = "Dummy host controller", .hcd_priv_size = sizeof(struct dummy_hcd), - .flags = HCD_USB3 | HCD_SHARED, - .reset = dummy_setup, .start = dummy_start, .stop = dummy_stop, @@ -2589,8 +2592,12 @@ static int dummy_hcd_probe(struct platform_device *pdev) dev_info(&pdev->dev, "%s, driver " DRIVER_VERSION "\n", driver_desc); dum = *((void **)dev_get_platdata(&pdev->dev)); - if (!mod_data.is_super_speed) + if (mod_data.is_super_speed) + dummy_hcd.flags = HCD_USB3 | HCD_SHARED; + else if (mod_data.is_high_speed) dummy_hcd.flags = HCD_USB2; + else + dummy_hcd.flags = HCD_USB11; hs_hcd = usb_create_hcd(&dummy_hcd, &pdev->dev, dev_name(&pdev->dev)); if (!hs_hcd) return -ENOMEM; From 795f5501b95cf1898d9fc831fef7d451d7cfea8f Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Tue, 26 Sep 2017 15:15:40 -0400 Subject: [PATCH 268/555] USB: dummy-hcd: fix infinite-loop resubmission bug commit 0173a68bfb0ad1c72a6ee39cc485aa2c97540b98 upstream. The dummy-hcd HCD/UDC emulator tries not to do too much work during each timer interrupt. But it doesn't try very hard; currently all it does is limit the total amount of bulk data transferred. Other transfer types aren't limited, and URBs that transfer no data (because of an error, perhaps) don't count toward the limit, even though on a real USB bus they would consume at least a minimum overhead. This means it's possible to get the driver stuck in an infinite loop, for example, if the host class driver resubmits an URB every time it completes (which is common for interrupt URBs). Each time the URB is resubmitted it gets added to the end of the pending-URBs list, and dummy-hcd doesn't stop until that list is empty. Andrey Konovalov was able to trigger this failure mode using the syzkaller fuzzer. This patch fixes the infinite-loop problem by restricting the URBs handled during each timer interrupt to those that were already on the pending list when the interrupt routine started. Newly added URBs won't be processed until the next timer interrupt. The problem of properly accounting for non-bulk bandwidth (as well as packet and transaction overhead) is not addressed here. Signed-off-by: Alan Stern Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/dummy_hcd.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/usb/gadget/udc/dummy_hcd.c b/drivers/usb/gadget/udc/dummy_hcd.c index 64fc2707c559..da7828a86d99 100644 --- a/drivers/usb/gadget/udc/dummy_hcd.c +++ b/drivers/usb/gadget/udc/dummy_hcd.c @@ -237,6 +237,8 @@ struct dummy_hcd { struct usb_device *udev; struct list_head urbp_list; + struct urbp *next_frame_urbp; + u32 stream_en_ep; u8 num_stream[30 / 2]; @@ -1244,6 +1246,8 @@ static int dummy_urb_enqueue( list_add_tail(&urbp->urbp_list, &dum_hcd->urbp_list); urb->hcpriv = urbp; + if (!dum_hcd->next_frame_urbp) + dum_hcd->next_frame_urbp = urbp; if (usb_pipetype(urb->pipe) == PIPE_CONTROL) urb->error_count = 1; /* mark as a new urb */ @@ -1761,6 +1765,7 @@ static void dummy_timer(unsigned long _dum_hcd) spin_unlock_irqrestore(&dum->lock, flags); return; } + dum_hcd->next_frame_urbp = NULL; for (i = 0; i < DUMMY_ENDPOINTS; i++) { if (!ep_info[i].name) @@ -1777,6 +1782,10 @@ restart: int type; int status = -EINPROGRESS; + /* stop when we reach URBs queued after the timer interrupt */ + if (urbp == dum_hcd->next_frame_urbp) + break; + urb = urbp->urb; if (urb->unlinked) goto return_urb; From e39b17143a5b5aac81f066d455e5d3a9877eb3ae Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Tue, 26 Sep 2017 15:15:49 -0400 Subject: [PATCH 269/555] USB: dummy-hcd: Fix erroneous synchronization change commit 7dbd8f4cabd96db5a50513de9d83a8105a5ffc81 upstream. A recent change to the synchronization in dummy-hcd was incorrect. The issue was that dummy_udc_stop() contained no locking and therefore could race with various gadget driver callbacks, and the fix was to add locking and issue the callbacks with the private spinlock held. UDC drivers aren't supposed to do this. Gadget driver callback routines are allowed to invoke functions in the UDC driver, and these functions will generally try to acquire the private spinlock. This would deadlock the driver. The correct solution is to drop the spinlock before issuing callbacks, and avoid races by emulating the synchronize_irq() call that all real UDC drivers must perform in their ->udc_stop() routines after disabling interrupts. This involves adding a flag to dummy-hcd's private structure to keep track of whether interrupts are supposed to be enabled, and adding a counter to keep track of ongoing callbacks so that dummy_udc_stop() can wait for them all to finish. A real UDC driver won't receive disconnect, reset, suspend, resume, or setup events once it has disabled interrupts. dummy-hcd will receive them but won't try to issue any gadget driver callbacks, which should be just as good. Signed-off-by: Alan Stern Fixes: f16443a034c7 ("USB: gadgetfs, dummy-hcd, net2280: fix locking for callbacks") Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/dummy_hcd.c | 32 ++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/udc/dummy_hcd.c b/drivers/usb/gadget/udc/dummy_hcd.c index da7828a86d99..fb17fb23fa9a 100644 --- a/drivers/usb/gadget/udc/dummy_hcd.c +++ b/drivers/usb/gadget/udc/dummy_hcd.c @@ -255,11 +255,13 @@ struct dummy { */ struct dummy_ep ep[DUMMY_ENDPOINTS]; int address; + int callback_usage; struct usb_gadget gadget; struct usb_gadget_driver *driver; struct dummy_request fifo_req; u8 fifo_buf[FIFO_SIZE]; u16 devstatus; + unsigned ints_enabled:1; unsigned udc_suspended:1; unsigned pullup:1; @@ -442,18 +444,27 @@ static void set_link_state(struct dummy_hcd *dum_hcd) (~dum_hcd->old_status) & dum_hcd->port_status; /* Report reset and disconnect events to the driver */ - if (dum->driver && (disconnect || reset)) { + if (dum->ints_enabled && (disconnect || reset)) { stop_activity(dum); + ++dum->callback_usage; + spin_unlock(&dum->lock); if (reset) usb_gadget_udc_reset(&dum->gadget, dum->driver); else dum->driver->disconnect(&dum->gadget); + spin_lock(&dum->lock); + --dum->callback_usage; } - } else if (dum_hcd->active != dum_hcd->old_active) { + } else if (dum_hcd->active != dum_hcd->old_active && + dum->ints_enabled) { + ++dum->callback_usage; + spin_unlock(&dum->lock); if (dum_hcd->old_active && dum->driver->suspend) dum->driver->suspend(&dum->gadget); else if (!dum_hcd->old_active && dum->driver->resume) dum->driver->resume(&dum->gadget); + spin_lock(&dum->lock); + --dum->callback_usage; } dum_hcd->old_status = dum_hcd->port_status; @@ -967,8 +978,11 @@ static int dummy_udc_start(struct usb_gadget *g, * can't enumerate without help from the driver we're binding. */ + spin_lock_irq(&dum->lock); dum->devstatus = 0; dum->driver = driver; + dum->ints_enabled = 1; + spin_unlock_irq(&dum->lock); return 0; } @@ -979,6 +993,16 @@ static int dummy_udc_stop(struct usb_gadget *g) struct dummy *dum = dum_hcd->dum; spin_lock_irq(&dum->lock); + dum->ints_enabled = 0; + stop_activity(dum); + + /* emulate synchronize_irq(): wait for callbacks to finish */ + while (dum->callback_usage > 0) { + spin_unlock_irq(&dum->lock); + usleep_range(1000, 2000); + spin_lock_irq(&dum->lock); + } + dum->driver = NULL; spin_unlock_irq(&dum->lock); @@ -1524,6 +1548,8 @@ static struct dummy_ep *find_endpoint(struct dummy *dum, u8 address) if (!is_active((dum->gadget.speed == USB_SPEED_SUPER ? dum->ss_hcd : dum->hs_hcd))) return NULL; + if (!dum->ints_enabled) + return NULL; if ((address & ~USB_DIR_IN) == 0) return &dum->ep[0]; for (i = 1; i < DUMMY_ENDPOINTS; i++) { @@ -1865,10 +1891,12 @@ restart: * until setup() returns; no reentrancy issues etc. */ if (value > 0) { + ++dum->callback_usage; spin_unlock(&dum->lock); value = dum->driver->setup(&dum->gadget, &setup); spin_lock(&dum->lock); + --dum->callback_usage; if (value >= 0) { /* no delays (max 64KB data stage) */ From 77a4be89599c587f8b1ac6256ad7dc11a8ce28fa Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 22 Sep 2017 23:43:46 +0300 Subject: [PATCH 270/555] USB: devio: Don't corrupt user memory commit fa1ed74eb1c233be6131ec92df21ab46499a15b6 upstream. The user buffer has "uurb->buffer_length" bytes. If the kernel has more information than that, we should truncate it instead of writing past the end of the user's buffer. I added a WARN_ONCE() to help the user debug the issue. Reported-by: Alan Stern Signed-off-by: Dan Carpenter Acked-by: Alan Stern Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/devio.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index c8075eb3db26..860108c46e9a 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -1577,7 +1577,11 @@ static int proc_do_submiturb(struct usb_dev_state *ps, struct usbdevfs_urb *uurb totlen += isopkt[u].length; } u *= sizeof(struct usb_iso_packet_descriptor); - uurb->buffer_length = totlen; + if (totlen <= uurb->buffer_length) + uurb->buffer_length = totlen; + else + WARN_ONCE(1, "uurb->buffer_length is too short %d vs %d", + totlen, uurb->buffer_length); break; default: From 2b5c7b95ea36521e7cb4f93be546a8be5fd9d667 Mon Sep 17 00:00:00 2001 From: Li Jun Date: Fri, 14 Apr 2017 19:12:07 +0800 Subject: [PATCH 271/555] usb: gadget: mass_storage: set msg_registered after msg registered commit 8e55d30322c6a0ef746c256a1beda9c73ecb27a6 upstream. If there is no UDC available, the msg register will fail and this flag will not be set, but the driver is already added into pending driver list, then the module removal modprobe -r can not remove the driver from the pending list. Signed-off-by: Li Jun Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/legacy/mass_storage.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/legacy/mass_storage.c b/drivers/usb/gadget/legacy/mass_storage.c index 125974f32f50..e99ab57ee3e5 100644 --- a/drivers/usb/gadget/legacy/mass_storage.c +++ b/drivers/usb/gadget/legacy/mass_storage.c @@ -210,7 +210,6 @@ static int msg_bind(struct usb_composite_dev *cdev) usb_composite_overwrite_options(cdev, &coverwrite); dev_info(&cdev->gadget->dev, DRIVER_DESC ", version: " DRIVER_VERSION "\n"); - set_bit(0, &msg_registered); return 0; fail_otg_desc: @@ -257,7 +256,12 @@ MODULE_LICENSE("GPL"); static int __init msg_init(void) { - return usb_composite_probe(&msg_driver); + int ret; + + ret = usb_composite_probe(&msg_driver); + set_bit(0, &msg_registered); + + return ret; } module_init(msg_init); From da785bb64fa661f5081b5c0351b798396b6071a6 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 21 Sep 2017 13:22:00 -0400 Subject: [PATCH 272/555] USB: g_mass_storage: Fix deadlock when driver is unbound commit 1fbbb78f25d1291274f320462bf6908906f538db upstream. As a holdover from the old g_file_storage gadget, the g_mass_storage legacy gadget driver attempts to unregister itself when its main operating thread terminates (if it hasn't been unregistered already). This is not strictly necessary; it was never more than an attempt to have the gadget fail cleanly if something went wrong and the main thread was killed. However, now that the UDC core manages gadget drivers independently of UDC drivers, this scheme doesn't work any more. A simple test: modprobe dummy-hcd modprobe g-mass-storage file=... rmmod dummy-hcd ends up in a deadlock with the following backtrace: sysrq: SysRq : Show Blocked State task PC stack pid father file-storage D 0 1130 2 0x00000000 Call Trace: __schedule+0x53e/0x58c schedule+0x6e/0x77 schedule_preempt_disabled+0xd/0xf __mutex_lock.isra.1+0x129/0x224 ? _raw_spin_unlock_irqrestore+0x12/0x14 __mutex_lock_slowpath+0x12/0x14 mutex_lock+0x28/0x2b usb_gadget_unregister_driver+0x29/0x9b [udc_core] usb_composite_unregister+0x10/0x12 [libcomposite] msg_cleanup+0x1d/0x20 [g_mass_storage] msg_thread_exits+0xd/0xdd7 [g_mass_storage] fsg_main_thread+0x1395/0x13d6 [usb_f_mass_storage] ? __schedule+0x573/0x58c kthread+0xd9/0xdb ? do_set_interface+0x25c/0x25c [usb_f_mass_storage] ? init_completion+0x1e/0x1e ret_from_fork+0x19/0x24 rmmod D 0 1155 683 0x00000000 Call Trace: __schedule+0x53e/0x58c schedule+0x6e/0x77 schedule_timeout+0x26/0xbc ? __schedule+0x573/0x58c do_wait_for_common+0xb3/0x128 ? usleep_range+0x81/0x81 ? wake_up_q+0x3f/0x3f wait_for_common+0x2e/0x45 wait_for_completion+0x17/0x19 fsg_common_put+0x34/0x81 [usb_f_mass_storage] fsg_free_inst+0x13/0x1e [usb_f_mass_storage] usb_put_function_instance+0x1a/0x25 [libcomposite] msg_unbind+0x2a/0x42 [g_mass_storage] __composite_unbind+0x4a/0x6f [libcomposite] composite_unbind+0x12/0x14 [libcomposite] usb_gadget_remove_driver+0x4f/0x77 [udc_core] usb_del_gadget_udc+0x52/0xcc [udc_core] dummy_udc_remove+0x27/0x2c [dummy_hcd] platform_drv_remove+0x1d/0x31 device_release_driver_internal+0xe9/0x16d device_release_driver+0x11/0x13 bus_remove_device+0xd2/0xe2 device_del+0x19f/0x221 ? selinux_capable+0x22/0x27 platform_device_del+0x21/0x63 platform_device_unregister+0x10/0x1a cleanup+0x20/0x817 [dummy_hcd] SyS_delete_module+0x10c/0x197 ? ____fput+0xd/0xf ? task_work_run+0x55/0x62 ? prepare_exit_to_usermode+0x65/0x75 do_fast_syscall_32+0x86/0xc3 entry_SYSENTER_32+0x4e/0x7c What happens is that removing the dummy-hcd driver causes the UDC core to unbind the gadget driver, which it does while holding the udc_lock mutex. The unbind routine in g_mass_storage tells the main thread to exit and waits for it to terminate. But as mentioned above, when the main thread exits it tries to unregister the mass-storage function driver. Via the composite framework this ends up calling usb_gadget_unregister_driver(), which tries to acquire the udc_lock mutex. The result is deadlock. The simplest way to fix the problem is not to be so clever: The main thread doesn't have to unregister the function driver. The side effects won't be so terrible; if the gadget is still attached to a USB host when the main thread is killed, it will appear to the host as though the gadget's firmware has crashed -- a reasonably accurate interpretation, and an all-too-common occurrence for USB mass-storage devices. In fact, the code to unregister the driver when the main thread exits is specific to g-mass-storage; it is not used when f-mass-storage is included as a function in a larger composite device. Therefore the entire mechanism responsible for this (the fsg_operations structure with its ->thread_exits method, the fsg_common_set_ops() routine, and the msg_thread_exits() callback routine) can all be eliminated. Even the msg_registered bitflag can be removed, because now the driver is unregistered in only one place rather than in two places. Signed-off-by: Alan Stern Acked-by: Felipe Balbi Acked-by: Michal Nazarewicz Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_mass_storage.c | 27 +++++--------------- drivers/usb/gadget/function/f_mass_storage.h | 14 ---------- drivers/usb/gadget/legacy/mass_storage.c | 26 +++---------------- 3 files changed, 10 insertions(+), 57 deletions(-) diff --git a/drivers/usb/gadget/function/f_mass_storage.c b/drivers/usb/gadget/function/f_mass_storage.c index ccd93c9e26ab..d2fc237cd87a 100644 --- a/drivers/usb/gadget/function/f_mass_storage.c +++ b/drivers/usb/gadget/function/f_mass_storage.c @@ -306,8 +306,6 @@ struct fsg_common { struct completion thread_notifier; struct task_struct *thread_task; - /* Callback functions. */ - const struct fsg_operations *ops; /* Gadget's private data. */ void *private_data; @@ -2505,6 +2503,7 @@ static void handle_exception(struct fsg_common *common) static int fsg_main_thread(void *common_) { struct fsg_common *common = common_; + int i; /* * Allow the thread to be killed by a signal, but set the signal mask @@ -2566,21 +2565,16 @@ static int fsg_main_thread(void *common_) common->thread_task = NULL; spin_unlock_irq(&common->lock); - if (!common->ops || !common->ops->thread_exits - || common->ops->thread_exits(common) < 0) { - int i; + /* Eject media from all LUNs */ - down_write(&common->filesem); - for (i = 0; i < ARRAY_SIZE(common->luns); --i) { - struct fsg_lun *curlun = common->luns[i]; - if (!curlun || !fsg_lun_is_open(curlun)) - continue; + down_write(&common->filesem); + for (i = 0; i < ARRAY_SIZE(common->luns); i++) { + struct fsg_lun *curlun = common->luns[i]; + if (curlun && fsg_lun_is_open(curlun)) fsg_lun_close(curlun); - curlun->unit_attention_data = SS_MEDIUM_NOT_PRESENT; - } - up_write(&common->filesem); } + up_write(&common->filesem); /* Let fsg_unbind() know the thread has exited */ complete_and_exit(&common->thread_notifier, 0); @@ -2770,13 +2764,6 @@ void fsg_common_remove_luns(struct fsg_common *common) } EXPORT_SYMBOL_GPL(fsg_common_remove_luns); -void fsg_common_set_ops(struct fsg_common *common, - const struct fsg_operations *ops) -{ - common->ops = ops; -} -EXPORT_SYMBOL_GPL(fsg_common_set_ops); - void fsg_common_free_buffers(struct fsg_common *common) { _fsg_common_free_buffers(common->buffhds, common->fsg_num_buffers); diff --git a/drivers/usb/gadget/function/f_mass_storage.h b/drivers/usb/gadget/function/f_mass_storage.h index d3902313b8ac..dc05ca0c4359 100644 --- a/drivers/usb/gadget/function/f_mass_storage.h +++ b/drivers/usb/gadget/function/f_mass_storage.h @@ -60,17 +60,6 @@ struct fsg_module_parameters { struct fsg_common; /* FSF callback functions */ -struct fsg_operations { - /* - * Callback function to call when thread exits. If no - * callback is set or it returns value lower then zero MSF - * will force eject all LUNs it operates on (including those - * marked as non-removable or with prevent_medium_removal flag - * set). - */ - int (*thread_exits)(struct fsg_common *common); -}; - struct fsg_lun_opts { struct config_group group; struct fsg_lun *lun; @@ -142,9 +131,6 @@ void fsg_common_remove_lun(struct fsg_lun *lun); void fsg_common_remove_luns(struct fsg_common *common); -void fsg_common_set_ops(struct fsg_common *common, - const struct fsg_operations *ops); - int fsg_common_create_lun(struct fsg_common *common, struct fsg_lun_config *cfg, unsigned int id, const char *name, const char **name_pfx); diff --git a/drivers/usb/gadget/legacy/mass_storage.c b/drivers/usb/gadget/legacy/mass_storage.c index e99ab57ee3e5..fcba59782f26 100644 --- a/drivers/usb/gadget/legacy/mass_storage.c +++ b/drivers/usb/gadget/legacy/mass_storage.c @@ -107,15 +107,6 @@ static unsigned int fsg_num_buffers = CONFIG_USB_GADGET_STORAGE_NUM_BUFFERS; FSG_MODULE_PARAMETERS(/* no prefix */, mod_data); -static unsigned long msg_registered; -static void msg_cleanup(void); - -static int msg_thread_exits(struct fsg_common *common) -{ - msg_cleanup(); - return 0; -} - static int msg_do_config(struct usb_configuration *c) { struct fsg_opts *opts; @@ -154,9 +145,6 @@ static struct usb_configuration msg_config_driver = { static int msg_bind(struct usb_composite_dev *cdev) { - static const struct fsg_operations ops = { - .thread_exits = msg_thread_exits, - }; struct fsg_opts *opts; struct fsg_config config; int status; @@ -173,8 +161,6 @@ static int msg_bind(struct usb_composite_dev *cdev) if (status) goto fail; - fsg_common_set_ops(opts->common, &ops); - status = fsg_common_set_cdev(opts->common, cdev, config.can_stall); if (status) goto fail_set_cdev; @@ -256,18 +242,12 @@ MODULE_LICENSE("GPL"); static int __init msg_init(void) { - int ret; - - ret = usb_composite_probe(&msg_driver); - set_bit(0, &msg_registered); - - return ret; + return usb_composite_probe(&msg_driver); } module_init(msg_init); -static void msg_cleanup(void) +static void __exit msg_cleanup(void) { - if (test_and_clear_bit(0, &msg_registered)) - usb_composite_unregister(&msg_driver); + usb_composite_unregister(&msg_driver); } module_exit(msg_cleanup); From d77606e93d819ad4b8f57511ff61a629ced49750 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Fri, 22 Sep 2017 11:56:49 -0400 Subject: [PATCH 273/555] USB: uas: fix bug in handling of alternate settings commit 786de92b3cb26012d3d0f00ee37adf14527f35c4 upstream. The uas driver has a subtle bug in the way it handles alternate settings. The uas_find_uas_alt_setting() routine returns an altsetting value (the bAlternateSetting number in the descriptor), but uas_use_uas_driver() then treats that value as an index to the intf->altsetting array, which it isn't. Normally this doesn't cause any problems because the various alternate settings have bAlternateSetting values 0, 1, 2, ..., so the value is equal to the index in the array. But this is not guaranteed, and Andrey Konovalov used the syzkaller fuzzer with KASAN to get a slab-out-of-bounds error by violating this assumption. This patch fixes the bug by making uas_find_uas_alt_setting() return a pointer to the altsetting entry rather than either the value or the index. Pointers are less subject to misinterpretation. Signed-off-by: Alan Stern Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov CC: Oliver Neukum Signed-off-by: Greg Kroah-Hartman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/storage/uas-detect.h | 15 ++++++++------- drivers/usb/storage/uas.c | 10 +++++----- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/drivers/usb/storage/uas-detect.h b/drivers/usb/storage/uas-detect.h index f58caa9e6a27..a155cd02bce2 100644 --- a/drivers/usb/storage/uas-detect.h +++ b/drivers/usb/storage/uas-detect.h @@ -9,7 +9,8 @@ static int uas_is_interface(struct usb_host_interface *intf) intf->desc.bInterfaceProtocol == USB_PR_UAS); } -static int uas_find_uas_alt_setting(struct usb_interface *intf) +static struct usb_host_interface *uas_find_uas_alt_setting( + struct usb_interface *intf) { int i; @@ -17,10 +18,10 @@ static int uas_find_uas_alt_setting(struct usb_interface *intf) struct usb_host_interface *alt = &intf->altsetting[i]; if (uas_is_interface(alt)) - return alt->desc.bAlternateSetting; + return alt; } - return -ENODEV; + return NULL; } static int uas_find_endpoints(struct usb_host_interface *alt, @@ -58,14 +59,14 @@ static int uas_use_uas_driver(struct usb_interface *intf, struct usb_device *udev = interface_to_usbdev(intf); struct usb_hcd *hcd = bus_to_hcd(udev->bus); unsigned long flags = id->driver_info; - int r, alt; - + struct usb_host_interface *alt; + int r; alt = uas_find_uas_alt_setting(intf); - if (alt < 0) + if (!alt) return 0; - r = uas_find_endpoints(&intf->altsetting[alt], eps); + r = uas_find_endpoints(alt, eps); if (r < 0) return 0; diff --git a/drivers/usb/storage/uas.c b/drivers/usb/storage/uas.c index 5ef014ba6ae8..9876af4ab64e 100644 --- a/drivers/usb/storage/uas.c +++ b/drivers/usb/storage/uas.c @@ -873,14 +873,14 @@ MODULE_DEVICE_TABLE(usb, uas_usb_ids); static int uas_switch_interface(struct usb_device *udev, struct usb_interface *intf) { - int alt; + struct usb_host_interface *alt; alt = uas_find_uas_alt_setting(intf); - if (alt < 0) - return alt; + if (!alt) + return -ENODEV; - return usb_set_interface(udev, - intf->altsetting[0].desc.bInterfaceNumber, alt); + return usb_set_interface(udev, alt->desc.bInterfaceNumber, + alt->desc.bAlternateSetting); } static int uas_configure_endpoints(struct uas_dev_info *devinfo) From 767f7a2cf33a135fe3f57010b51c3f6e92d7677d Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 21 Sep 2017 16:58:48 +0200 Subject: [PATCH 274/555] USB: core: harden cdc_parse_cdc_header commit 2e1c42391ff2556387b3cb6308b24f6f65619feb upstream. Andrey Konovalov reported a possible out-of-bounds problem for the cdc_parse_cdc_header function. He writes: It looks like cdc_parse_cdc_header() doesn't validate buflen before accessing buffer[1], buffer[2] and so on. The only check present is while (buflen > 0). So fix this issue up by properly validating the buffer length matches what the descriptor says it is. Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/message.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c index 3a4707746157..4c388451f31f 100644 --- a/drivers/usb/core/message.c +++ b/drivers/usb/core/message.c @@ -2068,6 +2068,10 @@ int cdc_parse_cdc_header(struct usb_cdc_parsed_header *hdr, elength = 1; goto next_desc; } + if ((buflen < elength) || (elength < 3)) { + dev_err(&intf->dev, "invalid descriptor buffer length\n"); + break; + } if (buffer[1] != USB_DT_CS_INTERFACE) { dev_err(&intf->dev, "skipping garbage\n"); goto next_desc; From 43feb29db4c5233fd77eedd57b4f2a53c9db9eff Mon Sep 17 00:00:00 2001 From: Dmitry Fleytman Date: Tue, 5 Sep 2017 11:40:56 +0300 Subject: [PATCH 275/555] usb: Increase quirk delay for USB devices commit b2a542bbb3081dbd64acc8929c140d196664c406 upstream. Commit e0429362ab15 ("usb: Add device quirk for Logitech HD Pro Webcams C920 and C930e") introduced quirk to workaround an issue with some Logitech webcams. The workaround is introducing delay for some USB operations. According to our testing, delay introduced by original commit is not long enough and in rare cases we still see issues described by the aforementioned commit. This patch increases delays introduced by original commit. Having this patch applied we do not see those problems anymore. Signed-off-by: Dmitry Fleytman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/config.c | 2 +- drivers/usb/core/hub.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c index eef716bdc259..5dc08c71c5b2 100644 --- a/drivers/usb/core/config.c +++ b/drivers/usb/core/config.c @@ -847,7 +847,7 @@ int usb_get_configuration(struct usb_device *dev) } if (dev->quirks & USB_QUIRK_DELAY_INIT) - msleep(100); + msleep(200); result = usb_get_descriptor(dev, USB_DT_CONFIG, cfgno, bigbuffer, length); diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 80d4ef31ba8d..8127f112958e 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -4828,7 +4828,7 @@ static void hub_port_connect(struct usb_hub *hub, int port1, u16 portstatus, goto loop; if (udev->quirks & USB_QUIRK_DELAY_INIT) - msleep(1000); + msleep(2000); /* consecutive bus-powered hubs aren't reliable; they can * violate the voltage drop budget. if the new child has From a6d4ce2e8b653ff7facde0d0051663fa4cf57b78 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 19 Sep 2017 15:07:17 +0200 Subject: [PATCH 276/555] USB: fix out-of-bounds in usb_set_configuration commit bd7a3fe770ebd8391d1c7d072ff88e9e76d063eb upstream. Andrey Konovalov reported a possible out-of-bounds problem for a USB interface association descriptor. He writes: It seems there's no proper size check of a USB_DT_INTERFACE_ASSOCIATION descriptor. It's only checked that the size is >= 2 in usb_parse_configuration(), so find_iad() might do out-of-bounds access to intf_assoc->bInterfaceCount. And he's right, we don't check for crazy descriptors of this type very well, so resolve this problem. Yet another issue found by syzkaller... Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/config.c | 14 +++++++++++--- include/uapi/linux/usb/ch9.h | 1 + 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c index 5dc08c71c5b2..11793386b4e9 100644 --- a/drivers/usb/core/config.c +++ b/drivers/usb/core/config.c @@ -638,15 +638,23 @@ static int usb_parse_configuration(struct usb_device *dev, int cfgidx, } else if (header->bDescriptorType == USB_DT_INTERFACE_ASSOCIATION) { + struct usb_interface_assoc_descriptor *d; + + d = (struct usb_interface_assoc_descriptor *)header; + if (d->bLength < USB_DT_INTERFACE_ASSOCIATION_SIZE) { + dev_warn(ddev, + "config %d has an invalid interface association descriptor of length %d, skipping\n", + cfgno, d->bLength); + continue; + } + if (iad_num == USB_MAXIADS) { dev_warn(ddev, "found more Interface " "Association Descriptors " "than allocated for in " "configuration %d\n", cfgno); } else { - config->intf_assoc[iad_num] = - (struct usb_interface_assoc_descriptor - *)header; + config->intf_assoc[iad_num] = d; iad_num++; } diff --git a/include/uapi/linux/usb/ch9.h b/include/uapi/linux/usb/ch9.h index a8acc24765fe..5e64a86989a5 100644 --- a/include/uapi/linux/usb/ch9.h +++ b/include/uapi/linux/usb/ch9.h @@ -759,6 +759,7 @@ struct usb_interface_assoc_descriptor { __u8 iFunction; } __attribute__ ((packed)); +#define USB_DT_INTERFACE_ASSOCIATION_SIZE 8 /*-------------------------------------------------------------------------*/ From 67e752e1d60f9f40b8391dad48f7c842bdba6ec4 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Mon, 18 Sep 2017 17:39:13 +0300 Subject: [PATCH 277/555] xhci: fix finding correct bus_state structure for USB 3.1 hosts commit 5a838a13c9b4e5dd188b7a6eaeb894e9358ead0c upstream. xhci driver keeps a bus_state structure for each hcd (usb2 and usb3) The structure is picked based on hcd speed, but driver only compared for HCD_USB3 speed, returning the wrong bus_state for HCD_USB31 hosts. This caused null pointer dereference errors in bus_resume function. Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h index a0f4a9feb058..5b137ec5891a 100644 --- a/drivers/usb/host/xhci.h +++ b/drivers/usb/host/xhci.h @@ -1509,7 +1509,7 @@ struct xhci_bus_state { static inline unsigned int hcd_index(struct usb_hcd *hcd) { - if (hcd->speed == HCD_USB3) + if (hcd->speed >= HCD_USB3) return 0; else return 1; From f1a04773d773900cd2f670b18c20c86a4218420f Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Mon, 18 Sep 2017 17:39:17 +0300 Subject: [PATCH 278/555] xhci: Fix sleeping with spin_lock_irq() held in ASmedia 1042A workaround commit 4ec1cd3eeeee7ccc35681270da028dbc29ca7bbd upstream. The flow control workaround for ASM1042A xHC hosts sleeps between register polling. The workaround gets called in several places, among them with spin_lock_irq() held when xHC host is resumed or hoplug removed. This was noticed as kernel panics at resume on a Dell XPS15 9550 with TB16 thunderbolt dock. Avoid sleeping with spin_lock_irq() held, use udelay() instead The original workaround was added to 4.9 and 4.12 stable releases, this patch needs to be applied to those as well. Fixes: 9da5a1092b13 ("xhci: Bad Ethernet performance plugged in ASM1042A host") Reported-by: Jose Marino Tested-by: Jose Marino Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/pci-quirks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/host/pci-quirks.c b/drivers/usb/host/pci-quirks.c index 31347a6ac146..ee213c5f4107 100644 --- a/drivers/usb/host/pci-quirks.c +++ b/drivers/usb/host/pci-quirks.c @@ -447,7 +447,7 @@ static int usb_asmedia_wait_write(struct pci_dev *pdev) if ((value & ASMT_CONTROL_WRITE_BIT) == 0) return 0; - usleep_range(40, 60); + udelay(50); } dev_warn(&pdev->dev, "%s: check_write_ready timeout", __func__); From f77615db8ae850ef196843502b1d0aaae8e173d2 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Mon, 18 Sep 2017 17:39:18 +0300 Subject: [PATCH 279/555] xhci: set missing SuperSpeedPlus Link Protocol bit in roothub descriptor commit 7bea22b124d77845c85a62eaa29a85ba6cc2f899 upstream. A SuperSpeedPlus roothub needs to have the Link Protocol (LP) bit set in the bmSublinkSpeedAttr[] entry of a SuperSpeedPlus descriptor. If the xhci controller has an optional Protocol Speed ID (PSI) table then that will be used as a base to create the roothub SuperSpeedPlus descriptor. The PSI table does not however necessary contain the LP bit so we need to set it manually. Check the psi speed and set LP bit if speed is 10Gbps or higher. We're not setting it for 5 to 10Gbps as USB 3.1 specification always mention SuperSpeedPlus for 10Gbps or higher, and some SSIC USB 3.0 speeds can be over 5Gbps, such as SSIC-G3B-L1 at 5830 Mbps Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-hub.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c index 36b7789f8f22..eb4312b12215 100644 --- a/drivers/usb/host/xhci-hub.c +++ b/drivers/usb/host/xhci-hub.c @@ -112,7 +112,7 @@ static int xhci_create_usb3_bos_desc(struct xhci_hcd *xhci, char *buf, /* If PSI table exists, add the custom speed attributes from it */ if (usb3_1 && xhci->usb3_rhub.psi_count) { - u32 ssp_cap_base, bm_attrib, psi; + u32 ssp_cap_base, bm_attrib, psi, psi_mant, psi_exp; int offset; ssp_cap_base = USB_DT_BOS_SIZE + USB_DT_USB_SS_CAP_SIZE; @@ -139,6 +139,15 @@ static int xhci_create_usb3_bos_desc(struct xhci_hcd *xhci, char *buf, for (i = 0; i < xhci->usb3_rhub.psi_count; i++) { psi = xhci->usb3_rhub.psi[i]; psi &= ~USB_SSP_SUBLINK_SPEED_RSVD; + psi_exp = XHCI_EXT_PORT_PSIE(psi); + psi_mant = XHCI_EXT_PORT_PSIM(psi); + + /* Shift to Gbps and set SSP Link BIT(14) if 10Gpbs */ + for (; psi_exp < 3; psi_exp++) + psi_mant /= 1000; + if (psi_mant >= 10) + psi |= BIT(14); + if ((psi & PLT_MASK) == PLT_SYM) { /* Symmetric, create SSA RX and TX from one PSI entry */ put_unaligned_le32(psi, &buf[offset]); From a13481f8cdca4e3f08d950e230f49742d95fc70c Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Mon, 18 Sep 2017 17:39:19 +0300 Subject: [PATCH 280/555] Revert "xhci: Limit USB2 port wake support for AMD Promontory hosts" commit bcd6a7aa13800afc1418e6b29d944d882214939a upstream. This reverts commit dec08194ffeccfa1cf085906b53d301930eae18f. Commit dec08194ffec ("xhci: Limit USB2 port wake support for AMD Promontory hosts") makes all high speed USB ports on ASUS PRIME B350M-A cease to function after enabling runtime PM. All boards with this chipsets will be affected, so revert the commit. The original patch was added to stable 4.9, 4.11 and 4.12 and needs to reverted from there as well Signed-off-by: Kai-Heng Feng Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-hub.c | 3 --- drivers/usb/host/xhci-pci.c | 12 ------------ drivers/usb/host/xhci.h | 2 +- 3 files changed, 1 insertion(+), 16 deletions(-) diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c index eb4312b12215..4a02c5c7df0d 100644 --- a/drivers/usb/host/xhci-hub.c +++ b/drivers/usb/host/xhci-hub.c @@ -1360,9 +1360,6 @@ int xhci_bus_suspend(struct usb_hcd *hcd) t2 |= PORT_WKOC_E | PORT_WKCONN_E; t2 &= ~PORT_WKDISC_E; } - if ((xhci->quirks & XHCI_U2_DISABLE_WAKE) && - (hcd->speed < HCD_USB3)) - t2 &= ~PORT_WAKE_BITS; } else t2 &= ~PORT_WAKE_BITS; diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c index 23833448e602..c87ef38e7416 100644 --- a/drivers/usb/host/xhci-pci.c +++ b/drivers/usb/host/xhci-pci.c @@ -54,11 +54,6 @@ #define PCI_DEVICE_ID_INTEL_APL_XHCI 0x5aa8 #define PCI_DEVICE_ID_INTEL_DNV_XHCI 0x19d0 -#define PCI_DEVICE_ID_AMD_PROMONTORYA_4 0x43b9 -#define PCI_DEVICE_ID_AMD_PROMONTORYA_3 0x43ba -#define PCI_DEVICE_ID_AMD_PROMONTORYA_2 0x43bb -#define PCI_DEVICE_ID_AMD_PROMONTORYA_1 0x43bc - #define PCI_DEVICE_ID_ASMEDIA_1042A_XHCI 0x1142 static const char hcd_name[] = "xhci_hcd"; @@ -142,13 +137,6 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci) if (pdev->vendor == PCI_VENDOR_ID_AMD) xhci->quirks |= XHCI_TRUST_TX_LENGTH; - if ((pdev->vendor == PCI_VENDOR_ID_AMD) && - ((pdev->device == PCI_DEVICE_ID_AMD_PROMONTORYA_4) || - (pdev->device == PCI_DEVICE_ID_AMD_PROMONTORYA_3) || - (pdev->device == PCI_DEVICE_ID_AMD_PROMONTORYA_2) || - (pdev->device == PCI_DEVICE_ID_AMD_PROMONTORYA_1))) - xhci->quirks |= XHCI_U2_DISABLE_WAKE; - if (pdev->vendor == PCI_VENDOR_ID_INTEL) { xhci->quirks |= XHCI_LPM_SUPPORT; xhci->quirks |= XHCI_INTEL_HOST; diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h index 5b137ec5891a..836398ade58d 100644 --- a/drivers/usb/host/xhci.h +++ b/drivers/usb/host/xhci.h @@ -1660,7 +1660,7 @@ struct xhci_hcd { /* For controller with a broken Port Disable implementation */ #define XHCI_BROKEN_PORT_PED (1 << 25) #define XHCI_LIMIT_ENDPOINT_INTERVAL_7 (1 << 26) -#define XHCI_U2_DISABLE_WAKE (1 << 27) +/* Reserved. It was XHCI_U2_DISABLE_WAKE */ #define XHCI_ASMEDIA_MODIFY_FLOWCONTROL (1 << 28) unsigned int num_active_eps; From ab67661467857103cfd44b6c750d3dc9fb355cad Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 23 Sep 2017 08:06:18 +0200 Subject: [PATCH 281/555] iio: adc: twl4030: Fix an error handling path in 'twl4030_madc_probe()' commit 245a396a9b1a67ac5c3228737c261b3e48708a2a upstream. If 'devm_regulator_get()' fails, we should go through the existing error handling path instead of returning directly, as done is all the other error handling paths in this function. Fixes: 7cc97d77ee8a ("iio: adc: twl4030: Fix ADC[3:6] readings") Signed-off-by: Christophe JAILLET Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/adc/twl4030-madc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/iio/adc/twl4030-madc.c b/drivers/iio/adc/twl4030-madc.c index 0c74869a540a..79028c95b673 100644 --- a/drivers/iio/adc/twl4030-madc.c +++ b/drivers/iio/adc/twl4030-madc.c @@ -866,8 +866,10 @@ static int twl4030_madc_probe(struct platform_device *pdev) /* Enable 3v1 bias regulator for MADC[3:6] */ madc->usb3v1 = devm_regulator_get(madc->dev, "vusb3v1"); - if (IS_ERR(madc->usb3v1)) - return -ENODEV; + if (IS_ERR(madc->usb3v1)) { + ret = -ENODEV; + goto err_i2c; + } ret = regulator_enable(madc->usb3v1); if (ret) From a2002c92ffb3c378e4d0066d3fddb045acd939b9 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 23 Sep 2017 08:06:19 +0200 Subject: [PATCH 282/555] iio: adc: twl4030: Disable the vusb3v1 rugulator in the error handling path of 'twl4030_madc_probe()' commit 7f70be6e4025db0551e6863e7eb9cca07122695c upstream. Commit 7cc97d77ee8a has introduced a call to 'regulator_disable()' in the .remove function. So we should also have such a call in the .probe function in case of error after a successful 'regulator_enable()' call. Add a new label for that and use it. Fixes: 7cc97d77ee8a ("iio: adc: twl4030: Fix ADC[3:6] readings") Signed-off-by: Christophe JAILLET Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/adc/twl4030-madc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/iio/adc/twl4030-madc.c b/drivers/iio/adc/twl4030-madc.c index 79028c95b673..7ffc5db4d7ee 100644 --- a/drivers/iio/adc/twl4030-madc.c +++ b/drivers/iio/adc/twl4030-madc.c @@ -878,11 +878,13 @@ static int twl4030_madc_probe(struct platform_device *pdev) ret = iio_device_register(iio_dev); if (ret) { dev_err(&pdev->dev, "could not register iio device\n"); - goto err_i2c; + goto err_usb3v1; } return 0; +err_usb3v1: + regulator_disable(madc->usb3v1); err_i2c: twl4030_madc_set_current_generator(madc, 0, 0); err_current_generator: From 1f266a130329fb1bcc61fbffeb7dfe847c157f4b Mon Sep 17 00:00:00 2001 From: Dragos Bogdan Date: Tue, 5 Sep 2017 15:14:45 +0300 Subject: [PATCH 283/555] iio: ad_sigma_delta: Implement a dedicated reset function commit 7fc10de8d49a748c476532c9d8e8fe19e548dd67 upstream. Since most of the SD ADCs have the option of reseting the serial interface by sending a number of SCLKs with CS = 0 and DIN = 1, a dedicated function that can do this is usefull. Needed for the patch: iio: ad7793: Fix the serial interface reset Signed-off-by: Dragos Bogdan Acked-by: Lars-Peter Clausen Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/adc/ad_sigma_delta.c | 28 ++++++++++++++++++++++++++ include/linux/iio/adc/ad_sigma_delta.h | 3 +++ 2 files changed, 31 insertions(+) diff --git a/drivers/iio/adc/ad_sigma_delta.c b/drivers/iio/adc/ad_sigma_delta.c index d10bd0c97233..22c4c17cd996 100644 --- a/drivers/iio/adc/ad_sigma_delta.c +++ b/drivers/iio/adc/ad_sigma_delta.c @@ -177,6 +177,34 @@ out: } EXPORT_SYMBOL_GPL(ad_sd_read_reg); +/** + * ad_sd_reset() - Reset the serial interface + * + * @sigma_delta: The sigma delta device + * @reset_length: Number of SCLKs with DIN = 1 + * + * Returns 0 on success, an error code otherwise. + **/ +int ad_sd_reset(struct ad_sigma_delta *sigma_delta, + unsigned int reset_length) +{ + uint8_t *buf; + unsigned int size; + int ret; + + size = DIV_ROUND_UP(reset_length, 8); + buf = kcalloc(size, sizeof(*buf), GFP_KERNEL); + if (!buf) + return -ENOMEM; + + memset(buf, 0xff, size); + ret = spi_write(sigma_delta->spi, buf, size); + kfree(buf); + + return ret; +} +EXPORT_SYMBOL_GPL(ad_sd_reset); + static int ad_sd_calibrate(struct ad_sigma_delta *sigma_delta, unsigned int mode, unsigned int channel) { diff --git a/include/linux/iio/adc/ad_sigma_delta.h b/include/linux/iio/adc/ad_sigma_delta.h index e7fdec4db9da..6cc48ac55fd2 100644 --- a/include/linux/iio/adc/ad_sigma_delta.h +++ b/include/linux/iio/adc/ad_sigma_delta.h @@ -111,6 +111,9 @@ int ad_sd_write_reg(struct ad_sigma_delta *sigma_delta, unsigned int reg, int ad_sd_read_reg(struct ad_sigma_delta *sigma_delta, unsigned int reg, unsigned int size, unsigned int *val); +int ad_sd_reset(struct ad_sigma_delta *sigma_delta, + unsigned int reset_length); + int ad_sigma_delta_single_conversion(struct iio_dev *indio_dev, const struct iio_chan_spec *chan, int *val); int ad_sd_calibrate_all(struct ad_sigma_delta *sigma_delta, From 8edd1ce3e56b4a383d60273d008d35d70e367956 Mon Sep 17 00:00:00 2001 From: Stefan Popa Date: Thu, 14 Sep 2017 16:50:28 +0300 Subject: [PATCH 284/555] staging: iio: ad7192: Fix - use the dedicated reset function avoiding dma from stack. commit f790923f146140a261ad211e5baf75d169f16fb2 upstream. Depends on: 691c4b95d1 ("iio: ad_sigma_delta: Implement a dedicated reset function") SPI host drivers can use DMA to transfer data, so the buffer should be properly allocated. Keeping it on the stack could cause an undefined behavior. The dedicated reset function solves this issue. Signed-off-by: Stefan Popa Acked-by: Lars-Peter Clausen Acked-by: Michael Hennerich Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/staging/iio/adc/ad7192.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/staging/iio/adc/ad7192.c b/drivers/staging/iio/adc/ad7192.c index 1cf6b79801a9..eeacb0e55db7 100644 --- a/drivers/staging/iio/adc/ad7192.c +++ b/drivers/staging/iio/adc/ad7192.c @@ -222,11 +222,9 @@ static int ad7192_setup(struct ad7192_state *st, struct iio_dev *indio_dev = spi_get_drvdata(st->sd.spi); unsigned long long scale_uv; int i, ret, id; - u8 ones[6]; /* reset the serial interface */ - memset(&ones, 0xFF, 6); - ret = spi_write(st->sd.spi, &ones, 6); + ret = ad_sd_reset(&st->sd, 48); if (ret < 0) goto out; usleep_range(500, 1000); /* Wait for at least 500us */ From 9af1bd5e705a5a9dbeff42bab077afe3ba881968 Mon Sep 17 00:00:00 2001 From: Matt Fornero Date: Tue, 5 Sep 2017 16:34:10 +0200 Subject: [PATCH 285/555] iio: core: Return error for failed read_reg commit 3d62c78a6eb9a7d67bace9622b66ad51e81c5f9b upstream. If an IIO device returns an error code for a read access via debugfs, it is currently ignored by the IIO core (other than emitting an error message). Instead, return this error code to user space, so upper layers can detect it correctly. Signed-off-by: Matt Fornero Signed-off-by: Lars-Peter Clausen Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/industrialio-core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index fc340ed3dca1..c5bc73135436 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -306,8 +306,10 @@ static ssize_t iio_debugfs_read_reg(struct file *file, char __user *userbuf, ret = indio_dev->info->debugfs_reg_access(indio_dev, indio_dev->cached_reg_addr, 0, &val); - if (ret) + if (ret) { dev_err(indio_dev->dev.parent, "%s: read failed\n", __func__); + return ret; + } len = snprintf(buf, sizeof(buf), "0x%X\n", val); From f0865d60f3a54bd66356026c83cfa9ec564b18e3 Mon Sep 17 00:00:00 2001 From: Colin Parker Date: Mon, 28 Aug 2017 16:21:39 -0700 Subject: [PATCH 286/555] IIO: BME280: Updates to Humidity readings need ctrl_reg write! commit 4b1f0c31f96c45e8521dd84aae50f2aa4aecfb7b upstream. The ctrl_reg register needs to be written after any write to the humidity registers. The value written to the ctrl_reg register does not necessarily need to change, but a write operation must occur. The regmap_update_bits functions will not write to a register if the register value matches the value to be written. This saves unnecessary bus operations. The change in this patch forces a bus write during the chip_config operation by switching to regmap_write_bits. This will fix issues where the Humidity Sensor Oversampling bits are not updated after initialization. Signed-off-by: Colin Parker Acked-by: Andreas Klinger Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/pressure/bmp280-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/pressure/bmp280-core.c b/drivers/iio/pressure/bmp280-core.c index f762eb8b174a..19aa957bd454 100644 --- a/drivers/iio/pressure/bmp280-core.c +++ b/drivers/iio/pressure/bmp280-core.c @@ -558,7 +558,7 @@ static int bmp280_chip_config(struct bmp280_data *data) u8 osrs = BMP280_OSRS_TEMP_X(data->oversampling_temp + 1) | BMP280_OSRS_PRESS_X(data->oversampling_press + 1); - ret = regmap_update_bits(data->regmap, BMP280_REG_CTRL_MEAS, + ret = regmap_write_bits(data->regmap, BMP280_REG_CTRL_MEAS, BMP280_OSRS_TEMP_MASK | BMP280_OSRS_PRESS_MASK | BMP280_MODE_MASK, From 8b97d5b67e9ee4394d52a7b63086a4400d7dbf5a Mon Sep 17 00:00:00 2001 From: Dragos Bogdan Date: Tue, 5 Sep 2017 15:16:13 +0300 Subject: [PATCH 287/555] iio: ad7793: Fix the serial interface reset commit 7ee3b7ebcb74714df6d94c8f500f307e1ee5dda5 upstream. The serial interface can be reset by writing 32 consecutive 1s to the device. 'ret' was initialized correctly but its value was overwritten when ad7793_check_platform_data() was called. Since a dedicated reset function is present now, it should be used instead. Fixes: 2edb769d246e ("iio:ad7793: Add support for the ad7798 and ad7799") Signed-off-by: Dragos Bogdan Acked-by: Lars-Peter Clausen Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/adc/ad7793.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iio/adc/ad7793.c b/drivers/iio/adc/ad7793.c index e6706a09e100..47c3d7f32900 100644 --- a/drivers/iio/adc/ad7793.c +++ b/drivers/iio/adc/ad7793.c @@ -257,7 +257,7 @@ static int ad7793_setup(struct iio_dev *indio_dev, unsigned int vref_mv) { struct ad7793_state *st = iio_priv(indio_dev); - int i, ret = -1; + int i, ret; unsigned long long scale_uv; u32 id; @@ -266,7 +266,7 @@ static int ad7793_setup(struct iio_dev *indio_dev, return ret; /* reset the serial interface */ - ret = spi_write(st->sd.spi, (u8 *)&ret, sizeof(ret)); + ret = ad_sd_reset(&st->sd, 32); if (ret < 0) goto out; usleep_range(500, 2000); /* Wait for at least 500us */ From 1daa7c5aba21b63bb0a6135b0cb4213ef383658e Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Tue, 22 Aug 2017 15:33:00 +0200 Subject: [PATCH 288/555] iio: adc: mcp320x: Fix readout of negative voltages commit e6f4794371ee7cce1339e7ca9542f1e703c5f84a upstream. Commit f686a36b4b79 ("iio: adc: mcp320x: Add support for mcp3301") returns a signed voltage from mcp320x_adc_conversion() but neglects that the caller interprets a negative return value as failure. Only mcp3301 (and the upcoming mcp3550/1/3) is affected as the other chips are incapable of measuring negative voltages. Fix and while at it, add mcp3301 to the list of supported chips at the top of the file. Fixes: f686a36b4b79 ("iio: adc: mcp320x: Add support for mcp3301") Cc: Andrea Galbusera Signed-off-by: Lukas Wunner Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/adc/mcp320x.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/drivers/iio/adc/mcp320x.c b/drivers/iio/adc/mcp320x.c index 634717ae12f3..e90c2c111cb3 100644 --- a/drivers/iio/adc/mcp320x.c +++ b/drivers/iio/adc/mcp320x.c @@ -17,6 +17,8 @@ * MCP3204 * MCP3208 * ------------ + * 13 bit converter + * MCP3301 * * Datasheet can be found here: * http://ww1.microchip.com/downloads/en/DeviceDoc/21293C.pdf mcp3001 @@ -96,7 +98,7 @@ static int mcp320x_channel_to_tx_data(int device_index, } static int mcp320x_adc_conversion(struct mcp320x *adc, u8 channel, - bool differential, int device_index) + bool differential, int device_index, int *val) { int ret; @@ -117,19 +119,25 @@ static int mcp320x_adc_conversion(struct mcp320x *adc, u8 channel, switch (device_index) { case mcp3001: - return (adc->rx_buf[0] << 5 | adc->rx_buf[1] >> 3); + *val = (adc->rx_buf[0] << 5 | adc->rx_buf[1] >> 3); + return 0; case mcp3002: case mcp3004: case mcp3008: - return (adc->rx_buf[0] << 2 | adc->rx_buf[1] >> 6); + *val = (adc->rx_buf[0] << 2 | adc->rx_buf[1] >> 6); + return 0; case mcp3201: - return (adc->rx_buf[0] << 7 | adc->rx_buf[1] >> 1); + *val = (adc->rx_buf[0] << 7 | adc->rx_buf[1] >> 1); + return 0; case mcp3202: case mcp3204: case mcp3208: - return (adc->rx_buf[0] << 4 | adc->rx_buf[1] >> 4); + *val = (adc->rx_buf[0] << 4 | adc->rx_buf[1] >> 4); + return 0; case mcp3301: - return sign_extend32((adc->rx_buf[0] & 0x1f) << 8 | adc->rx_buf[1], 12); + *val = sign_extend32((adc->rx_buf[0] & 0x1f) << 8 + | adc->rx_buf[1], 12); + return 0; default: return -EINVAL; } @@ -150,12 +158,10 @@ static int mcp320x_read_raw(struct iio_dev *indio_dev, switch (mask) { case IIO_CHAN_INFO_RAW: ret = mcp320x_adc_conversion(adc, channel->address, - channel->differential, device_index); - + channel->differential, device_index, val); if (ret < 0) goto out; - *val = ret; ret = IIO_VAL_INT; break; From ec8a7153bbf3742658ca71ab536505254d04cabd Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Tue, 22 Aug 2017 15:33:00 +0200 Subject: [PATCH 289/555] iio: adc: mcp320x: Fix oops on module unload commit 0964e40947a630a2a6f724e968246992f97bcf1c upstream. The driver calls spi_get_drvdata() in its ->remove hook even though it has never called spi_set_drvdata(). Stack trace for posterity: Unable to handle kernel NULL pointer dereference at virtual address 00000220 Internal error: Oops: 5 [#1] SMP ARM [<8072f564>] (mutex_lock) from [<7f1400d0>] (iio_device_unregister+0x24/0x7c [industrialio]) [<7f1400d0>] (iio_device_unregister [industrialio]) from [<7f15e020>] (mcp320x_remove+0x20/0x30 [mcp320x]) [<7f15e020>] (mcp320x_remove [mcp320x]) from [<8055a8cc>] (spi_drv_remove+0x2c/0x44) [<8055a8cc>] (spi_drv_remove) from [<805087bc>] (__device_release_driver+0x98/0x134) [<805087bc>] (__device_release_driver) from [<80509180>] (driver_detach+0xdc/0xe0) [<80509180>] (driver_detach) from [<8050823c>] (bus_remove_driver+0x5c/0xb0) [<8050823c>] (bus_remove_driver) from [<80509ab0>] (driver_unregister+0x38/0x58) [<80509ab0>] (driver_unregister) from [<7f15e69c>] (mcp320x_driver_exit+0x14/0x1c [mcp320x]) [<7f15e69c>] (mcp320x_driver_exit [mcp320x]) from [<801a78d0>] (SyS_delete_module+0x184/0x1d0) [<801a78d0>] (SyS_delete_module) from [<80108100>] (ret_fast_syscall+0x0/0x1c) Fixes: f5ce4a7a9291 ("iio: adc: add driver for MCP3204/08 12-bit ADC") Cc: Oskar Andero Signed-off-by: Lukas Wunner Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/adc/mcp320x.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iio/adc/mcp320x.c b/drivers/iio/adc/mcp320x.c index e90c2c111cb3..071dd23a33d9 100644 --- a/drivers/iio/adc/mcp320x.c +++ b/drivers/iio/adc/mcp320x.c @@ -318,6 +318,7 @@ static int mcp320x_probe(struct spi_device *spi) indio_dev->name = spi_get_device_id(spi)->name; indio_dev->modes = INDIO_DIRECT_MODE; indio_dev->info = &mcp320x_info; + spi_set_drvdata(spi, indio_dev); chip_info = &mcp320x_chip_infos[spi_get_device_id(spi)->driver_data]; indio_dev->channels = chip_info->channels; From 8ff7adb930d4a62f43dfc76220a988a043c510ff Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 14 Sep 2017 14:30:55 +0200 Subject: [PATCH 290/555] uwb: properly check kthread_run return value commit bbf26183b7a6236ba602f4d6a2f7cade35bba043 upstream. uwbd_start() calls kthread_run() and checks that the return value is not NULL. But the return value is not NULL in case kthread_run() fails, it takes the form of ERR_PTR(-EINTR). Use IS_ERR() instead. Also add a check to uwbd_stop(). Signed-off-by: Andrey Konovalov Signed-off-by: Greg Kroah-Hartman --- drivers/uwb/uwbd.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/uwb/uwbd.c b/drivers/uwb/uwbd.c index 01c20a260a8b..39dd4ef53c77 100644 --- a/drivers/uwb/uwbd.c +++ b/drivers/uwb/uwbd.c @@ -302,18 +302,22 @@ static int uwbd(void *param) /** Start the UWB daemon */ void uwbd_start(struct uwb_rc *rc) { - rc->uwbd.task = kthread_run(uwbd, rc, "uwbd"); - if (rc->uwbd.task == NULL) + struct task_struct *task = kthread_run(uwbd, rc, "uwbd"); + if (IS_ERR(task)) { + rc->uwbd.task = NULL; printk(KERN_ERR "UWB: Cannot start management daemon; " "UWB won't work\n"); - else + } else { + rc->uwbd.task = task; rc->uwbd.pid = rc->uwbd.task->pid; + } } /* Stop the UWB daemon and free any unprocessed events */ void uwbd_stop(struct uwb_rc *rc) { - kthread_stop(rc->uwbd.task); + if (rc->uwbd.task) + kthread_stop(rc->uwbd.task); uwbd_flush(rc); } From 8928c5b2d318cd591d499c94ed4dd5c920f5016f Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 14 Sep 2017 16:52:59 +0200 Subject: [PATCH 291/555] uwb: ensure that endpoint is interrupt commit 70e743e4cec3733dc13559f6184b35d358b9ef3f upstream. hwarc_neep_init() assumes that endpoint 0 is interrupt, but there's no check for that, which results in a WARNING in USB core code, when a bad USB descriptor is provided from a device: usb 1-1: BOGUS urb xfer, pipe 1 != type 3 ------------[ cut here ]------------ WARNING: CPU: 0 PID: 3 at drivers/usb/core/urb.c:449 usb_submit_urb+0xf8a/0x11d0 Modules linked in: CPU: 0 PID: 3 Comm: kworker/0:0 Not tainted 4.13.0+ #111 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Workqueue: usb_hub_wq hub_event task: ffff88006bdc1a00 task.stack: ffff88006bde8000 RIP: 0010:usb_submit_urb+0xf8a/0x11d0 drivers/usb/core/urb.c:448 RSP: 0018:ffff88006bdee3c0 EFLAGS: 00010282 RAX: 0000000000000029 RBX: ffff8800672a7200 RCX: 0000000000000000 RDX: 0000000000000029 RSI: ffff88006c815c78 RDI: ffffed000d7bdc6a RBP: ffff88006bdee4c0 R08: fffffbfff0fe00ff R09: fffffbfff0fe00ff R10: 0000000000000018 R11: fffffbfff0fe00fe R12: 1ffff1000d7bdc7f R13: 0000000000000003 R14: 0000000000000001 R15: ffff88006b02cc90 FS: 0000000000000000(0000) GS:ffff88006c800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fe4daddf000 CR3: 000000006add6000 CR4: 00000000000006f0 Call Trace: hwarc_neep_init+0x4ce/0x9c0 drivers/uwb/hwa-rc.c:710 uwb_rc_add+0x2fb/0x730 drivers/uwb/lc-rc.c:361 hwarc_probe+0x34e/0x9b0 drivers/uwb/hwa-rc.c:858 usb_probe_interface+0x351/0x8d0 drivers/usb/core/driver.c:361 really_probe drivers/base/dd.c:385 driver_probe_device+0x610/0xa00 drivers/base/dd.c:529 __device_attach_driver+0x230/0x290 drivers/base/dd.c:625 bus_for_each_drv+0x15e/0x210 drivers/base/bus.c:463 __device_attach+0x269/0x3c0 drivers/base/dd.c:682 device_initial_probe+0x1f/0x30 drivers/base/dd.c:729 bus_probe_device+0x1da/0x280 drivers/base/bus.c:523 device_add+0xcf9/0x1640 drivers/base/core.c:1703 usb_set_configuration+0x1064/0x1890 drivers/usb/core/message.c:1932 generic_probe+0x73/0xe0 drivers/usb/core/generic.c:174 usb_probe_device+0xaf/0xe0 drivers/usb/core/driver.c:266 really_probe drivers/base/dd.c:385 driver_probe_device+0x610/0xa00 drivers/base/dd.c:529 __device_attach_driver+0x230/0x290 drivers/base/dd.c:625 bus_for_each_drv+0x15e/0x210 drivers/base/bus.c:463 __device_attach+0x269/0x3c0 drivers/base/dd.c:682 device_initial_probe+0x1f/0x30 drivers/base/dd.c:729 bus_probe_device+0x1da/0x280 drivers/base/bus.c:523 device_add+0xcf9/0x1640 drivers/base/core.c:1703 usb_new_device+0x7b8/0x1020 drivers/usb/core/hub.c:2457 hub_port_connect drivers/usb/core/hub.c:4890 hub_port_connect_change drivers/usb/core/hub.c:4996 port_event drivers/usb/core/hub.c:5102 hub_event+0x23c8/0x37c0 drivers/usb/core/hub.c:5182 process_one_work+0x9fb/0x1570 kernel/workqueue.c:2097 worker_thread+0x1e4/0x1350 kernel/workqueue.c:2231 kthread+0x324/0x3f0 kernel/kthread.c:231 ret_from_fork+0x25/0x30 arch/x86/entry/entry_64.S:425 Code: 48 8b 85 30 ff ff ff 48 8d b8 98 00 00 00 e8 8e 93 07 ff 45 89 e8 44 89 f1 4c 89 fa 48 89 c6 48 c7 c7 a0 e5 55 86 e8 20 08 8f fd <0f> ff e9 9b f7 ff ff e8 4a 04 d6 fd e9 80 f7 ff ff e8 60 11 a6 ---[ end trace 55d741234124cfc3 ]--- Check that endpoint is interrupt. Found by syzkaller. Signed-off-by: Andrey Konovalov Signed-off-by: Greg Kroah-Hartman --- drivers/uwb/hwa-rc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/uwb/hwa-rc.c b/drivers/uwb/hwa-rc.c index 35a1e777b449..9a53912bdfe9 100644 --- a/drivers/uwb/hwa-rc.c +++ b/drivers/uwb/hwa-rc.c @@ -825,6 +825,8 @@ static int hwarc_probe(struct usb_interface *iface, if (iface->cur_altsetting->desc.bNumEndpoints < 1) return -ENODEV; + if (!usb_endpoint_xfer_int(&iface->cur_altsetting->endpoint[0].desc)) + return -ENODEV; result = -ENOMEM; uwb_rc = uwb_rc_alloc(); From 8a056a1152707567ecfe6a6f26b8414606150936 Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Sun, 3 Sep 2017 19:06:31 +0200 Subject: [PATCH 292/555] staging: vchiq_2835_arm: Fix NULL ptr dereference in free_pagelist commit 974d4d03fc020af4fa4e9e72a86f0fefa37803c5 upstream. This fixes a NULL pointer dereference on RPi 2 with multi_v7_defconfig. The function page_address() could return NULL with enabled CONFIG_HIGHMEM. So fix this by using kmap() instead. Signed-off-by: Stefan Wahren Fixes: 71bad7f08641 ("staging: add bcm2708 vchiq driver") Signed-off-by: Greg Kroah-Hartman --- .../vc04_services/interface/vchiq_arm/vchiq_2835_arm.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c index 1091b9f1dd07..6d459ef8c121 100644 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c @@ -538,18 +538,20 @@ free_pagelist(PAGELIST_T *pagelist, int actual) if (head_bytes > actual) head_bytes = actual; - memcpy((char *)page_address(pages[0]) + + memcpy((char *)kmap(pages[0]) + pagelist->offset, fragments, head_bytes); + kunmap(pages[0]); } if ((actual >= 0) && (head_bytes < actual) && (tail_bytes != 0)) { - memcpy((char *)page_address(pages[num_pages - 1]) + + memcpy((char *)kmap(pages[num_pages - 1]) + ((pagelist->offset + actual) & (PAGE_SIZE - 1) & ~(g_cache_line_size - 1)), fragments + g_cache_line_size, tail_bytes); + kunmap(pages[num_pages - 1]); } down(&g_free_fragments_mutex); From 2b8197073a0f2c5d14c4e3ed5934b8f6e51eeeb7 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 3 Oct 2017 16:14:50 -0700 Subject: [PATCH 293/555] mm, oom_reaper: skip mm structs with mmu notifiers commit 4d4bbd8526a8fbeb2c090ea360211fceff952383 upstream. Andrea has noticed that the oom_reaper doesn't invalidate the range via mmu notifiers (mmu_notifier_invalidate_range_start/end) and that can corrupt the memory of the kvm guest for example. tlb_flush_mmu_tlbonly already invokes mmu notifiers but that is not sufficient as per Andrea: "mmu_notifier_invalidate_range cannot be used in replacement of mmu_notifier_invalidate_range_start/end. For KVM mmu_notifier_invalidate_range is a noop and rightfully so. A MMU notifier implementation has to implement either ->invalidate_range method or the invalidate_range_start/end methods, not both. And if you implement invalidate_range_start/end like KVM is forced to do, calling mmu_notifier_invalidate_range in common code is a noop for KVM. For those MMU notifiers that can get away only implementing ->invalidate_range, the ->invalidate_range is implicitly called by mmu_notifier_invalidate_range_end(). And only those secondary MMUs that share the same pagetable with the primary MMU (like AMD iommuv2) can get away only implementing ->invalidate_range" As the callback is allowed to sleep and the implementation is out of hand of the MM it is safer to simply bail out if there is an mmu notifier registered. In order to not fail too early make the mm_has_notifiers check under the oom_lock and have a little nap before failing to give the current oom victim some more time to exit. [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20170913113427.2291-1-mhocko@kernel.org Fixes: aac453635549 ("mm, oom: introduce oom reaper") Signed-off-by: Michal Hocko Reported-by: Andrea Arcangeli Reviewed-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/mmu_notifier.h | 5 +++++ mm/oom_kill.c | 16 ++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index a1a210d59961..25c0dc31f084 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -419,6 +419,11 @@ extern void mmu_notifier_synchronize(void); #else /* CONFIG_MMU_NOTIFIER */ +static inline int mm_has_notifiers(struct mm_struct *mm) +{ + return 0; +} + static inline void mmu_notifier_release(struct mm_struct *mm) { } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index ec9f11d4f094..d631d251c150 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include "internal.h" @@ -490,6 +491,21 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) goto unlock_oom; } + /* + * If the mm has notifiers then we would need to invalidate them around + * unmap_page_range and that is risky because notifiers can sleep and + * what they do is basically undeterministic. So let's have a short + * sleep to give the oom victim some more time. + * TODO: we really want to get rid of this ugly hack and make sure that + * notifiers cannot block for unbounded amount of time and add + * mmu_notifier_invalidate_range_{start,end} around unmap_page_range + */ + if (mm_has_notifiers(mm)) { + up_read(&mm->mmap_sem); + schedule_timeout_idle(HZ); + goto unlock_oom; + } + /* * increase mm_users only after we know we will reap something so * that the mmput_async is called only when we have reaped something From 1c0891295a5a49f63ccb38f9aeed664ce63eb404 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 3 Oct 2017 16:16:45 -0700 Subject: [PATCH 294/555] lib/ratelimit.c: use deferred printk() version commit 656d61ce9666209c4c4a13c71902d3ee70d1ff6f upstream. printk_ratelimit() invokes ___ratelimit() which may invoke a normal printk() (pr_warn() in this particular case) to warn about suppressed output. Given that printk_ratelimit() may be called from anywhere, that pr_warn() is dangerous - it may end up deadlocking the system. Fix ___ratelimit() by using deferred printk(). Sasha reported the following lockdep error: : Unregister pv shared memory for cpu 8 : select_fallback_rq: 3 callbacks suppressed : process 8583 (trinity-c78) no longer affine to cpu8 : : ====================================================== : WARNING: possible circular locking dependency detected : 4.14.0-rc2-next-20170927+ #252 Not tainted : ------------------------------------------------------ : migration/8/62 is trying to acquire lock: : (&port_lock_key){-.-.}, at: serial8250_console_write() : : but task is already holding lock: : (&rq->lock){-.-.}, at: sched_cpu_dying() : : which lock already depends on the new lock. : : : the existing dependency chain (in reverse order) is: : : -> #3 (&rq->lock){-.-.}: : __lock_acquire() : lock_acquire() : _raw_spin_lock() : task_fork_fair() : sched_fork() : copy_process.part.31() : _do_fork() : kernel_thread() : rest_init() : start_kernel() : x86_64_start_reservations() : x86_64_start_kernel() : verify_cpu() : : -> #2 (&p->pi_lock){-.-.}: : __lock_acquire() : lock_acquire() : _raw_spin_lock_irqsave() : try_to_wake_up() : default_wake_function() : woken_wake_function() : __wake_up_common() : __wake_up_common_lock() : __wake_up() : tty_wakeup() : tty_port_default_wakeup() : tty_port_tty_wakeup() : uart_write_wakeup() : serial8250_tx_chars() : serial8250_handle_irq.part.25() : serial8250_default_handle_irq() : serial8250_interrupt() : __handle_irq_event_percpu() : handle_irq_event_percpu() : handle_irq_event() : handle_level_irq() : handle_irq() : do_IRQ() : ret_from_intr() : native_safe_halt() : default_idle() : arch_cpu_idle() : default_idle_call() : do_idle() : cpu_startup_entry() : rest_init() : start_kernel() : x86_64_start_reservations() : x86_64_start_kernel() : verify_cpu() : : -> #1 (&tty->write_wait){-.-.}: : __lock_acquire() : lock_acquire() : _raw_spin_lock_irqsave() : __wake_up_common_lock() : __wake_up() : tty_wakeup() : tty_port_default_wakeup() : tty_port_tty_wakeup() : uart_write_wakeup() : serial8250_tx_chars() : serial8250_handle_irq.part.25() : serial8250_default_handle_irq() : serial8250_interrupt() : __handle_irq_event_percpu() : handle_irq_event_percpu() : handle_irq_event() : handle_level_irq() : handle_irq() : do_IRQ() : ret_from_intr() : native_safe_halt() : default_idle() : arch_cpu_idle() : default_idle_call() : do_idle() : cpu_startup_entry() : rest_init() : start_kernel() : x86_64_start_reservations() : x86_64_start_kernel() : verify_cpu() : : -> #0 (&port_lock_key){-.-.}: : check_prev_add() : __lock_acquire() : lock_acquire() : _raw_spin_lock_irqsave() : serial8250_console_write() : univ8250_console_write() : console_unlock() : vprintk_emit() : vprintk_default() : vprintk_func() : printk() : ___ratelimit() : __printk_ratelimit() : select_fallback_rq() : sched_cpu_dying() : cpuhp_invoke_callback() : take_cpu_down() : multi_cpu_stop() : cpu_stopper_thread() : smpboot_thread_fn() : kthread() : ret_from_fork() : : other info that might help us debug this: : : Chain exists of: : &port_lock_key --> &p->pi_lock --> &rq->lock : : Possible unsafe locking scenario: : : CPU0 CPU1 : ---- ---- : lock(&rq->lock); : lock(&p->pi_lock); : lock(&rq->lock); : lock(&port_lock_key); : : *** DEADLOCK *** : : 4 locks held by migration/8/62: : #0: (&p->pi_lock){-.-.}, at: sched_cpu_dying() : #1: (&rq->lock){-.-.}, at: sched_cpu_dying() : #2: (printk_ratelimit_state.lock){....}, at: ___ratelimit() : #3: (console_lock){+.+.}, at: vprintk_emit() : : stack backtrace: : CPU: 8 PID: 62 Comm: migration/8 Not tainted 4.14.0-rc2-next-20170927+ #252 : Call Trace: : dump_stack() : print_circular_bug() : check_prev_add() : ? add_lock_to_list.isra.26() : ? check_usage() : ? kvm_clock_read() : ? kvm_sched_clock_read() : ? sched_clock() : ? check_preemption_disabled() : __lock_acquire() : ? __lock_acquire() : ? add_lock_to_list.isra.26() : ? debug_check_no_locks_freed() : ? memcpy() : lock_acquire() : ? serial8250_console_write() : _raw_spin_lock_irqsave() : ? serial8250_console_write() : serial8250_console_write() : ? serial8250_start_tx() : ? lock_acquire() : ? memcpy() : univ8250_console_write() : console_unlock() : ? __down_trylock_console_sem() : vprintk_emit() : vprintk_default() : vprintk_func() : printk() : ? show_regs_print_info() : ? lock_acquire() : ___ratelimit() : __printk_ratelimit() : select_fallback_rq() : sched_cpu_dying() : ? sched_cpu_starting() : ? rcutree_dying_cpu() : ? sched_cpu_starting() : cpuhp_invoke_callback() : ? cpu_disable_common() : take_cpu_down() : ? trace_hardirqs_off_caller() : ? cpuhp_invoke_callback() : multi_cpu_stop() : ? __this_cpu_preempt_check() : ? cpu_stop_queue_work() : cpu_stopper_thread() : ? cpu_stop_create() : smpboot_thread_fn() : ? sort_range() : ? schedule() : ? __kthread_parkme() : kthread() : ? sort_range() : ? kthread_create_on_node() : ret_from_fork() : process 9121 (trinity-c78) no longer affine to cpu8 : smpboot: CPU 8 is now offline Link: http://lkml.kernel.org/r/20170928120405.18273-1-sergey.senozhatsky@gmail.com Fixes: 6b1d174b0c27b ("ratelimit: extend to print suppressed messages on release") Signed-off-by: Sergey Senozhatsky Reported-by: Sasha Levin Reviewed-by: Petr Mladek Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- lib/ratelimit.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/ratelimit.c b/lib/ratelimit.c index 08f8043cac61..d01f47135239 100644 --- a/lib/ratelimit.c +++ b/lib/ratelimit.c @@ -48,7 +48,9 @@ int ___ratelimit(struct ratelimit_state *rs, const char *func) if (time_is_before_jiffies(rs->begin + rs->interval)) { if (rs->missed) { if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) { - pr_warn("%s: %d callbacks suppressed\n", func, rs->missed); + printk_deferred(KERN_WARNING + "%s: %d callbacks suppressed\n", + func, rs->missed); rs->missed = 0; } } From 88c195d638d3f84e560ea4283b364499e9acb95b Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Tue, 19 Sep 2017 09:39:08 -0700 Subject: [PATCH 295/555] lsm: fix smack_inode_removexattr and xattr_getsecurity memleak commit 57e7ba04d422c3d41c8426380303ec9b7533ded9 upstream. security_inode_getsecurity() provides the text string value of a security attribute. It does not provide a "secctx". The code in xattr_getsecurity() that calls security_inode_getsecurity() and then calls security_release_secctx() happened to work because SElinux and Smack treat the attribute and the secctx the same way. It fails for cap_inode_getsecurity(), because that module has no secctx that ever needs releasing. It turns out that Smack is the one that's doing things wrong by not allocating memory when instructed to do so by the "alloc" parameter. The fix is simple enough. Change the security_release_secctx() to kfree() because it isn't a secctx being returned by security_inode_getsecurity(). Change Smack to allocate the string when told to do so. Note: this also fixes memory leaks for LSMs which implement inode_getsecurity but not release_secctx, such as capabilities. Signed-off-by: Casey Schaufler Reported-by: Konstantin Khlebnikov Signed-off-by: James Morris Signed-off-by: Greg Kroah-Hartman --- fs/xattr.c | 2 +- security/smack/smack_lsm.c | 59 +++++++++++++++++--------------------- 2 files changed, 28 insertions(+), 33 deletions(-) diff --git a/fs/xattr.c b/fs/xattr.c index ed8c374570ed..932b9061a3a2 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -249,7 +249,7 @@ xattr_getsecurity(struct inode *inode, const char *name, void *value, } memcpy(value, buffer, len); out: - security_release_secctx(buffer, len); + kfree(buffer); out_noalloc: return len; } diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 1cb060293505..a8a7fbc377f6 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -1486,7 +1486,7 @@ static int smack_inode_removexattr(struct dentry *dentry, const char *name) * @inode: the object * @name: attribute name * @buffer: where to put the result - * @alloc: unused + * @alloc: duplicate memory * * Returns the size of the attribute or an error code */ @@ -1499,43 +1499,38 @@ static int smack_inode_getsecurity(struct inode *inode, struct super_block *sbp; struct inode *ip = (struct inode *)inode; struct smack_known *isp; - int ilen; - int rc = 0; - if (strcmp(name, XATTR_SMACK_SUFFIX) == 0) { + if (strcmp(name, XATTR_SMACK_SUFFIX) == 0) isp = smk_of_inode(inode); - ilen = strlen(isp->smk_known); - *buffer = isp->smk_known; - return ilen; + else { + /* + * The rest of the Smack xattrs are only on sockets. + */ + sbp = ip->i_sb; + if (sbp->s_magic != SOCKFS_MAGIC) + return -EOPNOTSUPP; + + sock = SOCKET_I(ip); + if (sock == NULL || sock->sk == NULL) + return -EOPNOTSUPP; + + ssp = sock->sk->sk_security; + + if (strcmp(name, XATTR_SMACK_IPIN) == 0) + isp = ssp->smk_in; + else if (strcmp(name, XATTR_SMACK_IPOUT) == 0) + isp = ssp->smk_out; + else + return -EOPNOTSUPP; } - /* - * The rest of the Smack xattrs are only on sockets. - */ - sbp = ip->i_sb; - if (sbp->s_magic != SOCKFS_MAGIC) - return -EOPNOTSUPP; - - sock = SOCKET_I(ip); - if (sock == NULL || sock->sk == NULL) - return -EOPNOTSUPP; - - ssp = sock->sk->sk_security; - - if (strcmp(name, XATTR_SMACK_IPIN) == 0) - isp = ssp->smk_in; - else if (strcmp(name, XATTR_SMACK_IPOUT) == 0) - isp = ssp->smk_out; - else - return -EOPNOTSUPP; - - ilen = strlen(isp->smk_known); - if (rc == 0) { - *buffer = isp->smk_known; - rc = ilen; + if (alloc) { + *buffer = kstrdup(isp->smk_known, GFP_KERNEL); + if (*buffer == NULL) + return -ENOMEM; } - return rc; + return strlen(isp->smk_known); } From 984b6c96f1e2bf2ccfc7c8755e936185c68f4054 Mon Sep 17 00:00:00 2001 From: Guneshwor Singh Date: Thu, 14 Sep 2017 17:49:40 +0530 Subject: [PATCH 296/555] ALSA: compress: Remove unused variable commit a931b9ce93841a5b66b709ba5a244276e345e63b upstream. Commit 04c5d5a430fc ("ALSA: compress: Embed struct device") removed the statement that used 'str' but didn't remove the variable itself. So remove it. [Adding stable to Cc since pr_debug() may refer to the uninitialized buffer -- tiwai] Fixes: 04c5d5a430fc ("ALSA: compress: Embed struct device") Signed-off-by: Guneshwor Singh Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/compress_offload.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sound/core/compress_offload.c b/sound/core/compress_offload.c index fec1dfdb14ad..4490a699030b 100644 --- a/sound/core/compress_offload.c +++ b/sound/core/compress_offload.c @@ -948,14 +948,13 @@ static const struct file_operations snd_compr_file_ops = { static int snd_compress_dev_register(struct snd_device *device) { int ret = -EINVAL; - char str[16]; struct snd_compr *compr; if (snd_BUG_ON(!device || !device->device_data)) return -EBADFD; compr = device->device_data; - pr_debug("reg %s for device %s, direction %d\n", str, compr->name, + pr_debug("reg device %s, direction %d\n", compr->name, compr->direction); /* register compressed device */ ret = snd_register_device(SNDRV_DEVICE_TYPE_COMPRESS, From 40e219327fd42981dafe817bd1ae88679bea20e7 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Tue, 26 Sep 2017 09:11:49 +0900 Subject: [PATCH 297/555] Revert "ALSA: echoaudio: purge contradictions between dimension matrix members and total number of members" commit 51db452df07bb4c5754b73789253ba21681d9dc2 upstream. This reverts commit 275353bb684e to fix a regression which can abort 'alsactl' program in alsa-utils due to assertion in alsa-lib. alsactl: control.c:2513: snd_ctl_elem_value_get_integer: Assertion `idx < sizeof(obj->value.integer.value) / sizeof(obj->value.integer.value[0])' failed. alsactl: control.c:2976: snd_ctl_elem_value_get_integer: Assertion `idx < ARRAY_SIZE(obj->value.integer.value)' failed. This commit is a band-aid. In a point of usage of ALSA control interface, the drivers still bring an issue that they prevent userspace applications to have a consistent way to parse each levels of the dimension information via ALSA control interface. Let me investigate this issue. Current implementation of the drivers have three control element sets with dimension information: * 'Monitor Mixer Volume' (type: integer) * 'VMixer Volume' (type: integer) * 'VU-meters' (type: boolean) Although the number of elements named as 'Monitor Mixer Volume' differs depending on drivers in this group, it can be calculated by macros defined by each driver (= (BX_NUM - BX_ANALOG_IN) * BX_ANALOG_IN). Each of the elements has one member for value and has dimension information with 2 levels (= BX_ANALOG_IN * (BX_NUM - BX_ANALOG_IN)). For these elements, userspace applications are expected to handle the dimension information so that all of the elements construct a matrix where the number of rows and columns are represented by the dimension information. The same way is applied to elements named as 'VMixer Volume'. The number of these elements can also be calculated by macros defined by each drivers (= PX_ANALOG_IN * BX_ANALOG_IN). Each of the element has one member for value and has dimension information with 2 levels (= BX_ANALOG_IN * PX_ANALOG_IN). All of the elements construct a matrix with the dimension information. An element named as 'VU-meters' gets a different way in a point of dimension information. The element includes 96 members for value. The element has dimension information with 3 levels (= 3 or 2 * 16 * 2). For this element, userspace applications are expected to handle the dimension information so that all of the members for value construct a matrix where the number of rows and columns are represented by the dimension information. This is different from the way for the former. As a summary, the drivers were not designed to produce a consistent way to parse the dimension information. This makes it hard for general userspace applications such as amixer to parse the information by a consistent way, and actually no userspace applications except for 'echomixer' utilize the dimension information. Additionally, no drivers excluding this group use the information. The reverted commit was written based on the latter way. A commit 860c1994a70a ('ALSA: control: add dimension validator for userspace elements') is written based on the latter way, too. The patch should be reconsider too in the same time to re-define a consistent way to parse the dimension information. Reported-by: Mark Hills Reported-by: S. Christian Collins Fixes: 275353bb684e ('ALSA: echoaudio: purge contradictions between dimension matrix members and total number of members') Signed-off-by: Takashi Sakamoto Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/echoaudio/echoaudio.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/pci/echoaudio/echoaudio.c b/sound/pci/echoaudio/echoaudio.c index 937071760bc4..286f5e3686a3 100644 --- a/sound/pci/echoaudio/echoaudio.c +++ b/sound/pci/echoaudio/echoaudio.c @@ -1272,11 +1272,11 @@ static int snd_echo_mixer_info(struct snd_kcontrol *kcontrol, chip = snd_kcontrol_chip(kcontrol); uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER; + uinfo->count = 1; uinfo->value.integer.min = ECHOGAIN_MINOUT; uinfo->value.integer.max = ECHOGAIN_MAXOUT; uinfo->dimen.d[0] = num_busses_out(chip); uinfo->dimen.d[1] = num_busses_in(chip); - uinfo->count = uinfo->dimen.d[0] * uinfo->dimen.d[1]; return 0; } @@ -1344,11 +1344,11 @@ static int snd_echo_vmixer_info(struct snd_kcontrol *kcontrol, chip = snd_kcontrol_chip(kcontrol); uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER; + uinfo->count = 1; uinfo->value.integer.min = ECHOGAIN_MINOUT; uinfo->value.integer.max = ECHOGAIN_MAXOUT; uinfo->dimen.d[0] = num_busses_out(chip); uinfo->dimen.d[1] = num_pipes_out(chip); - uinfo->count = uinfo->dimen.d[0] * uinfo->dimen.d[1]; return 0; } @@ -1728,6 +1728,7 @@ static int snd_echo_vumeters_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER; + uinfo->count = 96; uinfo->value.integer.min = ECHOGAIN_MINOUT; uinfo->value.integer.max = 0; #ifdef ECHOCARD_HAS_VMIXER @@ -1737,7 +1738,6 @@ static int snd_echo_vumeters_info(struct snd_kcontrol *kcontrol, #endif uinfo->dimen.d[1] = 16; /* 16 channels */ uinfo->dimen.d[2] = 2; /* 0=level, 1=peak */ - uinfo->count = uinfo->dimen.d[0] * uinfo->dimen.d[1] * uinfo->dimen.d[2]; return 0; } From 065af12fd1393cdbfebf735d7a7bcce03e84d807 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 2 Oct 2017 14:06:43 +0200 Subject: [PATCH 298/555] ALSA: usx2y: Suppress kernel warning at page allocation failures commit 7682e399485fe19622b6fd82510b1f4551e48a25 upstream. The usx2y driver allocates the stream read/write buffers in continuous pages depending on the stream setup, and this may spew the kernel warning messages with a stack trace like: WARNING: CPU: 1 PID: 1846 at mm/page_alloc.c:3883 __alloc_pages_slowpath+0x1ef2/0x2d70 Modules linked in: CPU: 1 PID: 1846 Comm: kworker/1:2 Not tainted .... It may confuse user as if it were any serious error, although this is no fatal error and the driver handles the error case gracefully. Since the driver has already some sanity check of the given size (128 and 256 pages), it can't pass any crazy value. So it's merely page fragmentation. This patch adds __GFP_NOWARN to each caller for suppressing such kernel warnings. The original issue was spotted by syzkaller. Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/usb/usx2y/usb_stream.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sound/usb/usx2y/usb_stream.c b/sound/usb/usx2y/usb_stream.c index bf618e1500ac..e7b934f4d837 100644 --- a/sound/usb/usx2y/usb_stream.c +++ b/sound/usb/usx2y/usb_stream.c @@ -191,7 +191,8 @@ struct usb_stream *usb_stream_new(struct usb_stream_kernel *sk, } pg = get_order(read_size); - sk->s = (void *) __get_free_pages(GFP_KERNEL|__GFP_COMP|__GFP_ZERO, pg); + sk->s = (void *) __get_free_pages(GFP_KERNEL|__GFP_COMP|__GFP_ZERO| + __GFP_NOWARN, pg); if (!sk->s) { snd_printk(KERN_WARNING "couldn't __get_free_pages()\n"); goto out; @@ -211,7 +212,8 @@ struct usb_stream *usb_stream_new(struct usb_stream_kernel *sk, pg = get_order(write_size); sk->write_page = - (void *)__get_free_pages(GFP_KERNEL|__GFP_COMP|__GFP_ZERO, pg); + (void *)__get_free_pages(GFP_KERNEL|__GFP_COMP|__GFP_ZERO| + __GFP_NOWARN, pg); if (!sk->write_page) { snd_printk(KERN_WARNING "couldn't __get_free_pages()\n"); usb_stream_free(sk); From f860ca549de4670e95516d6e7cbbc733a69c63db Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Tue, 12 Sep 2017 08:50:53 +0200 Subject: [PATCH 299/555] mlxsw: spectrum: Prevent mirred-related crash on removal [ Upstream commit 6399ebcccffa12e65bc15eda039d37673264ebce ] When removing the offloading of mirred actions under matchall classifiers, mlxsw would find the destination port associated with the offloaded action and utilize it for undoing the configuration. Depending on the order by which ports are removed, it's possible that the destination port would get removed before the source port. In such a scenario, when actions would be flushed for the source port mlxsw would perform an illegal dereference as the destination port is no longer listed. Since the only item necessary for undoing the configuration on the destination side is the port-id and that in turn is already maintained by mlxsw on the source-port, simply stop trying to access the destination port and use the port-id directly instead. Fixes: 763b4b70af ("mlxsw: spectrum: Add support in matchall mirror TC offloading") Signed-off-by: Yuval Mintz Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- .../net/ethernet/mellanox/mlxsw/spectrum.c | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 1806b1fc6e4c..d50350c7adc4 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -249,15 +249,14 @@ static void mlxsw_sp_span_entry_destroy(struct mlxsw_sp *mlxsw_sp, } static struct mlxsw_sp_span_entry * -mlxsw_sp_span_entry_find(struct mlxsw_sp_port *port) +mlxsw_sp_span_entry_find(struct mlxsw_sp *mlxsw_sp, u8 local_port) { - struct mlxsw_sp *mlxsw_sp = port->mlxsw_sp; int i; for (i = 0; i < mlxsw_sp->span.entries_count; i++) { struct mlxsw_sp_span_entry *curr = &mlxsw_sp->span.entries[i]; - if (curr->used && curr->local_port == port->local_port) + if (curr->used && curr->local_port == local_port) return curr; } return NULL; @@ -268,7 +267,8 @@ static struct mlxsw_sp_span_entry { struct mlxsw_sp_span_entry *span_entry; - span_entry = mlxsw_sp_span_entry_find(port); + span_entry = mlxsw_sp_span_entry_find(port->mlxsw_sp, + port->local_port); if (span_entry) { /* Already exists, just take a reference */ span_entry->ref_count++; @@ -453,12 +453,13 @@ err_port_bind: } static void mlxsw_sp_span_mirror_remove(struct mlxsw_sp_port *from, - struct mlxsw_sp_port *to, + u8 destination_port, enum mlxsw_sp_span_type type) { struct mlxsw_sp_span_entry *span_entry; - span_entry = mlxsw_sp_span_entry_find(to); + span_entry = mlxsw_sp_span_entry_find(from->mlxsw_sp, + destination_port); if (!span_entry) { netdev_err(from->dev, "no span entry found\n"); return; @@ -1255,10 +1256,8 @@ static int mlxsw_sp_port_add_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port, static void mlxsw_sp_port_del_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port, struct tc_cls_matchall_offload *cls) { - struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; struct mlxsw_sp_port_mall_tc_entry *mall_tc_entry; enum mlxsw_sp_span_type span_type; - struct mlxsw_sp_port *to_port; mall_tc_entry = mlxsw_sp_port_mirror_entry_find(mlxsw_sp_port, cls->cookie); @@ -1269,11 +1268,12 @@ static void mlxsw_sp_port_del_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port, switch (mall_tc_entry->type) { case MLXSW_SP_PORT_MALL_MIRROR: - to_port = mlxsw_sp->ports[mall_tc_entry->mirror.to_local_port]; span_type = mall_tc_entry->mirror.ingress ? MLXSW_SP_SPAN_INGRESS : MLXSW_SP_SPAN_EGRESS; - mlxsw_sp_span_mirror_remove(mlxsw_sp_port, to_port, span_type); + mlxsw_sp_span_mirror_remove(mlxsw_sp_port, + mall_tc_entry->mirror.to_local_port, + span_type); break; default: WARN_ON(1); From f86d3b1a28a7271774fe4e8a9a028f6b209e1fd3 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 13 Sep 2017 17:32:37 +0200 Subject: [PATCH 300/555] net: sched: fix use-after-free in tcf_action_destroy and tcf_del_walker [ Upstream commit 255cd50f207ae8ec7b22663246c833407744e634 ] Recent commit d7fb60b9cafb ("net_sched: get rid of tcfa_rcu") removed freeing in call_rcu, which changed already existing hard-to-hit race condition into 100% hit: [ 598.599825] BUG: unable to handle kernel NULL pointer dereference at 0000000000000030 [ 598.607782] IP: tcf_action_destroy+0xc0/0x140 Or: [ 40.858924] BUG: unable to handle kernel NULL pointer dereference at 0000000000000030 [ 40.862840] IP: tcf_generic_walker+0x534/0x820 Fix this by storing the ops and use them directly for module_put call. Fixes: a85a970af265 ("net_sched: move tc_action into tcf_common") Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sched/act_api.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/sched/act_api.c b/net/sched/act_api.c index c651cfce9be6..f3117324146a 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -141,7 +141,7 @@ static int tcf_del_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, hlist_for_each_entry_safe(p, n, head, tcfa_head) { ret = __tcf_hash_release(p, false, true); if (ret == ACT_P_DELETED) { - module_put(p->ops->owner); + module_put(ops->owner); n_i++; } else if (ret < 0) goto nla_put_failure; @@ -450,13 +450,15 @@ EXPORT_SYMBOL(tcf_action_exec); int tcf_action_destroy(struct list_head *actions, int bind) { + const struct tc_action_ops *ops; struct tc_action *a, *tmp; int ret = 0; list_for_each_entry_safe(a, tmp, actions, list) { + ops = a->ops; ret = __tcf_hash_release(a, bind, true); if (ret == ACT_P_DELETED) - module_put(a->ops->owner); + module_put(ops->owner); else if (ret < 0) return ret; } From b70bb9bb7277aa396eafab6a313ee829c7957587 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 14 Sep 2017 02:00:54 +0300 Subject: [PATCH 301/555] sctp: potential read out of bounds in sctp_ulpevent_type_enabled() [ Upstream commit fa5f7b51fc3080c2b195fa87c7eca7c05e56f673 ] This code causes a static checker warning because Smatch doesn't trust anything that comes from skb->data. I've reviewed this code and I do think skb->data can be controlled by the user here. The sctp_event_subscribe struct has 13 __u8 fields and we want to see if ours is non-zero. sn_type can be any value in the 0-USHRT_MAX range. We're subtracting SCTP_SN_TYPE_BASE which is 1 << 15 so we could read either before the start of the struct or after the end. This is a very old bug and it's surprising that it would go undetected for so long but my theory is that it just doesn't have a big impact so it would be hard to notice. Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/sctp/ulpevent.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h index 2c098cd7e7e2..231df4fc8423 100644 --- a/include/net/sctp/ulpevent.h +++ b/include/net/sctp/ulpevent.h @@ -141,8 +141,12 @@ __u16 sctp_ulpevent_get_notification_type(const struct sctp_ulpevent *event); static inline int sctp_ulpevent_type_enabled(__u16 sn_type, struct sctp_event_subscribe *mask) { + int offset = sn_type - SCTP_SN_TYPE_BASE; char *amask = (char *) mask; - return amask[sn_type - SCTP_SN_TYPE_BASE]; + + if (offset >= sizeof(struct sctp_event_subscribe)) + return 0; + return amask[offset]; } /* Given an event subscription, is this event enabled? */ From 186a9c5e7038d7e6b39d2f6c96f71056b1bacd8d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 13 Sep 2017 20:30:39 -0700 Subject: [PATCH 302/555] tcp: update skb->skb_mstamp more carefully [ Upstream commit 8c72c65b426b47b3c166a8fef0d8927fe5e8a28d ] liujian reported a problem in TCP_USER_TIMEOUT processing with a patch in tcp_probe_timer() : https://www.spinics.net/lists/netdev/msg454496.html After investigations, the root cause of the problem is that we update skb->skb_mstamp of skbs in write queue, even if the attempt to send a clone or copy of it failed. One reason being a routing problem. This patch prevents this, solving liujian issue. It also removes a potential RTT miscalculation, since __tcp_retransmit_skb() is not OR-ing TCP_SKB_CB(skb)->sacked with TCPCB_EVER_RETRANS if a failure happens, but skb->skb_mstamp has been changed. A future ACK would then lead to a very small RTT sample and min_rtt would then be lowered to this too small value. Tested: # cat user_timeout.pkt --local_ip=192.168.102.64 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 +0 bind(3, ..., ...) = 0 +0 listen(3, 1) = 0 +0 `ifconfig tun0 192.168.102.64/16; ip ro add 192.0.2.1 dev tun0` +0 < S 0:0(0) win 0 +0 > S. 0:0(0) ack 1 +.1 < . 1:1(0) ack 1 win 65530 +0 accept(3, ..., ...) = 4 +0 setsockopt(4, SOL_TCP, TCP_USER_TIMEOUT, [3000], 4) = 0 +0 write(4, ..., 24) = 24 +0 > P. 1:25(24) ack 1 win 29200 +.1 < . 1:1(0) ack 25 win 65530 //change the ipaddress +1 `ifconfig tun0 192.168.0.10/16` +1 write(4, ..., 24) = 24 +1 write(4, ..., 24) = 24 +1 write(4, ..., 24) = 24 +1 write(4, ..., 24) = 24 +0 `ifconfig tun0 192.168.102.64/16` +0 < . 1:2(1) ack 25 win 65530 +0 `ifconfig tun0 192.168.0.10/16` +3 write(4, ..., 24) = -1 # ./packetdrill user_timeout.pkt Signed-off-by: Eric Dumazet Reported-by: liujian Acked-by: Neal Cardwell Acked-by: Yuchung Cheng Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_output.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5d836b037442..bb2f00b5f200 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -914,6 +914,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, struct tcp_skb_cb *tcb; struct tcp_out_options opts; unsigned int tcp_options_size, tcp_header_size; + struct sk_buff *oskb = NULL; struct tcp_md5sig_key *md5; struct tcphdr *th; int err; @@ -922,11 +923,11 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, tp = tcp_sk(sk); if (clone_it) { - skb_mstamp_get(&skb->skb_mstamp); TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq - tp->snd_una; tcp_rate_skb_sent(sk, skb); + oskb = skb; if (unlikely(skb_cloned(skb))) skb = pskb_copy(skb, gfp_mask); else @@ -934,6 +935,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, if (unlikely(!skb)) return -ENOBUFS; } + skb_mstamp_get(&skb->skb_mstamp); inet = inet_sk(sk); tcb = TCP_SKB_CB(skb); @@ -1035,12 +1037,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); - if (likely(err <= 0)) - return err; + if (unlikely(err > 0)) { + tcp_enter_cwr(sk); + err = net_xmit_eval(err); + } + if (!err && oskb) + skb_mstamp_get(&oskb->skb_mstamp); - tcp_enter_cwr(sk); - - return net_xmit_eval(err); + return err; } /* This routine just queues the buffer for sending. @@ -2709,10 +2713,11 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) skb_headroom(skb) >= 0xFFFF)) { struct sk_buff *nskb; - skb_mstamp_get(&skb->skb_mstamp); nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : -ENOBUFS; + if (!err) + skb_mstamp_get(&skb->skb_mstamp); } else { err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } From e159492b3c3edeffb9cbfb449efb5e7adfb719f7 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Fri, 15 Sep 2017 14:37:38 +0100 Subject: [PATCH 303/555] bpf/verifier: reject BPF_ALU64|BPF_END [ Upstream commit e67b8a685c7c984e834e3181ef4619cd7025a136 ] Neither ___bpf_prog_run nor the JITs accept it. Also adds a new test case. Fixes: 17a5267067f3 ("bpf: verifier (add verifier core)") Signed-off-by: Edward Cree Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/verifier.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 779c871c5dcd..372454aa7f37 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1720,7 +1720,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } } else { if (insn->src_reg != BPF_REG_0 || insn->off != 0 || - (insn->imm != 16 && insn->imm != 32 && insn->imm != 64)) { + (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) || + BPF_CLASS(insn->code) == BPF_ALU64) { verbose("BPF_END uses reserved fields\n"); return -EINVAL; } From 85908ccae5c2510aff7b9ef2e3e859ffa8f8824c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 Sep 2017 16:47:42 -0700 Subject: [PATCH 304/555] tcp: fix data delivery rate [ Upstream commit fc22579917eb7e13433448a342f1cb1592920940 ] Now skb->mstamp_skb is updated later, we also need to call tcp_rate_skb_sent() after the update is done. Fixes: 8c72c65b426b ("tcp: update skb->skb_mstamp more carefully") Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_output.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index bb2f00b5f200..b44abddf942c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -925,8 +925,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, if (clone_it) { TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq - tp->snd_una; - tcp_rate_skb_sent(sk, skb); - oskb = skb; if (unlikely(skb_cloned(skb))) skb = pskb_copy(skb, gfp_mask); @@ -1041,9 +1039,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, tcp_enter_cwr(sk); err = net_xmit_eval(err); } - if (!err && oskb) + if (!err && oskb) { skb_mstamp_get(&oskb->skb_mstamp); - + tcp_rate_skb_sent(sk, oskb); + } return err; } From f0a5af78b530ab5616d6fcde79d28ad57ecaea3e Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Wed, 13 Sep 2017 19:30:51 -0600 Subject: [PATCH 305/555] udpv6: Fix the checksum computation when HW checksum does not apply [ Upstream commit 63ecc3d9436f8012e49dc846d6cb0a85a3433517 ] While trying an ESP transport mode encryption for UDPv6 packets of datagram size 1436 with MTU 1500, checksum error was observed in the secondary fragment. This error occurs due to the UDP payload checksum being missed out when computing the full checksum for these packets in udp6_hwcsum_outgoing(). Fixes: d39d938c8228 ("ipv6: Introduce udpv6_send_skb()") Signed-off-by: Subash Abhinov Kasiviswanathan Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/udp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 2497f62fa4c2..4db5f541bca6 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -915,6 +915,7 @@ static void udp6_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb, */ offset = skb_transport_offset(skb); skb->csum = skb_checksum(skb, offset, skb->len - offset, 0); + csum = skb->csum; skb->ip_summed = CHECKSUM_NONE; From e814bae39ad562608361b851ae79cc7edcc35bf7 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 15 Sep 2017 12:00:07 +0800 Subject: [PATCH 306/555] ip6_gre: skb_push ipv6hdr before packing the header in ip6gre_header [ Upstream commit 76cc0d3282d4b933fa144fa41fbc5318e0fdca24 ] Now in ip6gre_header before packing the ipv6 header, it skb_push t->hlen which only includes encap_hlen + tun_hlen. It means greh and inner header would be over written by ipv6 stuff and ipv6h might have no chance to set up. Jianlin found this issue when using remote any on ip6_gre, the packets he captured on gre dev are truncated: 22:50:26.210866 Out ethertype IPv6 (0x86dd), length 120: truncated-ip6 -\ 8128 bytes missing!(flowlabel 0x92f40, hlim 0, next-header Options (0) \ payload length: 8192) ::1:2000:0 > ::1:0:86dd: HBH [trunc] ip-proto-128 \ 8184 It should also skb_push ipv6hdr so that ipv6h points to the right position to set ipv6 stuff up. This patch is to skb_push hlen + sizeof(*ipv6h) and also fix some indents in ip6gre_header. Fixes: c12b395a4664 ("gre: Support GRE over IPv6") Reported-by: Jianlin Shi Signed-off-by: Xin Long Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/ip6_gre.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index f78afe43bdff..b19a0966a5fe 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -936,24 +936,25 @@ done: } static int ip6gre_header(struct sk_buff *skb, struct net_device *dev, - unsigned short type, - const void *daddr, const void *saddr, unsigned int len) + unsigned short type, const void *daddr, + const void *saddr, unsigned int len) { struct ip6_tnl *t = netdev_priv(dev); - struct ipv6hdr *ipv6h = (struct ipv6hdr *)skb_push(skb, t->hlen); - __be16 *p = (__be16 *)(ipv6h+1); + struct ipv6hdr *ipv6h; + __be16 *p; - ip6_flow_hdr(ipv6h, 0, - ip6_make_flowlabel(dev_net(dev), skb, - t->fl.u.ip6.flowlabel, true, - &t->fl.u.ip6)); + ipv6h = (struct ipv6hdr *)skb_push(skb, t->hlen + sizeof(*ipv6h)); + ip6_flow_hdr(ipv6h, 0, ip6_make_flowlabel(dev_net(dev), skb, + t->fl.u.ip6.flowlabel, + true, &t->fl.u.ip6)); ipv6h->hop_limit = t->parms.hop_limit; ipv6h->nexthdr = NEXTHDR_GRE; ipv6h->saddr = t->parms.laddr; ipv6h->daddr = t->parms.raddr; - p[0] = t->parms.o_flags; - p[1] = htons(type); + p = (__be16 *)(ipv6h + 1); + p[0] = t->parms.o_flags; + p[1] = htons(type); /* * Set the source hardware address. From fc2fe7a06d6d3b6d9f3a0e1880b98fab0fa88341 Mon Sep 17 00:00:00 2001 From: Fahad Kunnathadi Date: Fri, 15 Sep 2017 12:01:58 +0530 Subject: [PATCH 307/555] net: phy: Fix mask value write on gmii2rgmii converter speed register [ Upstream commit f2654a4781318dc7ab8d6cde66f1fa39eab980a9 ] To clear Speed Selection in MDIO control register(0x10), ie, clear bits 6 and 13 to zero while keeping other bits same. Before AND operation,The Mask value has to be perform with bitwise NOT operation (ie, ~ operator) This patch clears current speed selection before writing the new speed settings to gmii2rgmii converter Fixes: f411a6160bd4 ("net: phy: Add gmiitorgmii converter support") Signed-off-by: Fahad Kunnathadi Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/phy/xilinx_gmii2rgmii.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/xilinx_gmii2rgmii.c b/drivers/net/phy/xilinx_gmii2rgmii.c index d15dd3938ba8..2e5150b0b8d5 100644 --- a/drivers/net/phy/xilinx_gmii2rgmii.c +++ b/drivers/net/phy/xilinx_gmii2rgmii.c @@ -44,7 +44,7 @@ static int xgmiitorgmii_read_status(struct phy_device *phydev) priv->phy_drv->read_status(phydev); val = mdiobus_read(phydev->mdio.bus, priv->addr, XILINX_GMII2RGMII_REG); - val &= XILINX_GMII2RGMII_SPEED_MASK; + val &= ~XILINX_GMII2RGMII_SPEED_MASK; if (phydev->speed == SPEED_1000) val |= BMCR_SPEED1000; From 13c8bd7a21ed3612beb0f2ee02c84b49e883ea32 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 15 Sep 2017 15:58:33 +0800 Subject: [PATCH 308/555] ip6_tunnel: do not allow loading ip6_tunnel if ipv6 is disabled in cmdline [ Upstream commit 8c22dab03ad072e45060c299c70d02a4f6fc4aab ] If ipv6 has been disabled from cmdline since kernel started, it makes no sense to allow users to create any ip6 tunnel. Otherwise, it could some potential problem. Jianlin found a kernel crash caused by this in ip6_gre when he set ipv6.disable=1 in grub: [ 209.588865] Unable to handle kernel paging request for data at address 0x00000080 [ 209.588872] Faulting instruction address: 0xc000000000a3aa6c [ 209.588879] Oops: Kernel access of bad area, sig: 11 [#1] [ 209.589062] NIP [c000000000a3aa6c] fib_rules_lookup+0x4c/0x260 [ 209.589071] LR [c000000000b9ad90] fib6_rule_lookup+0x50/0xb0 [ 209.589076] Call Trace: [ 209.589097] fib6_rule_lookup+0x50/0xb0 [ 209.589106] rt6_lookup+0xc4/0x110 [ 209.589116] ip6gre_tnl_link_config+0x214/0x2f0 [ip6_gre] [ 209.589125] ip6gre_newlink+0x138/0x3a0 [ip6_gre] [ 209.589134] rtnl_newlink+0x798/0xb80 [ 209.589142] rtnetlink_rcv_msg+0xec/0x390 [ 209.589151] netlink_rcv_skb+0x138/0x150 [ 209.589159] rtnetlink_rcv+0x48/0x70 [ 209.589169] netlink_unicast+0x538/0x640 [ 209.589175] netlink_sendmsg+0x40c/0x480 [ 209.589184] ___sys_sendmsg+0x384/0x4e0 [ 209.589194] SyS_sendmsg+0xd4/0x140 [ 209.589201] SyS_socketcall+0x3e0/0x4f0 [ 209.589209] system_call+0x38/0xe0 This patch is to return -EOPNOTSUPP in ip6_tunnel_init if ipv6 has been disabled from cmdline. Reported-by: Jianlin Shi Signed-off-by: Xin Long Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/ip6_tunnel.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 1fc9daa7b1d6..a39f28ffbd63 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -2231,6 +2231,9 @@ static int __init ip6_tunnel_init(void) { int err; + if (!ipv6_mod_enabled()) + return -EOPNOTSUPP; + err = register_pernet_device(&ip6_tnl_net_ops); if (err < 0) goto out_pernet; From b13bc543b1e64edd5786dda8be614303f55b68f1 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Sat, 16 Sep 2017 14:02:21 +0200 Subject: [PATCH 309/555] net/sched: cls_matchall: fix crash when used with classful qdisc [ Upstream commit 3ff4cbec87da48b0ec1f7b6196607b034de0c680 ] this script, edited from Linux Advanced Routing and Traffic Control guide tc q a dev en0 root handle 1: htb default a tc c a dev en0 parent 1: classid 1:1 htb rate 6mbit burst 15k tc c a dev en0 parent 1:1 classid 1:a htb rate 5mbit ceil 6mbit burst 15k tc c a dev en0 parent 1:1 classid 1:b htb rate 1mbit ceil 6mbit burst 15k tc f a dev en0 parent 1:0 prio 1 $clsname $clsargs classid 1:b ping $address -c1 tc -s c s dev en0 classifies traffic to 1:b or 1:a, depending on whether the packet matches or not the pattern $clsargs of filter $clsname. However, when $clsname is 'matchall', a systematic crash can be observed in htb_classify(). HTB and classful qdiscs don't assign initial value to struct tcf_result, but then they expect it to contain valid values after filters have been run. Thus, current 'matchall' ignores the TCA_MATCHALL_CLASSID attribute, configured by user, and makes HTB (and classful qdiscs) dereference random pointers. By assigning head->res to *res in mall_classify(), before the actions are invoked, we fix this crash and enable TCA_MATCHALL_CLASSID functionality, that had no effect on 'matchall' classifier since its first introduction. BugLink: https://bugzilla.redhat.com/show_bug.cgi?id=1460213 Reported-by: Jiri Benc Fixes: b87f7936a932 ("net/sched: introduce Match-all classifier") Signed-off-by: Davide Caratti Acked-by: Yotam Gigi Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sched/cls_matchall.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index b12bc2abea93..e75fb65037d7 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -32,6 +32,7 @@ static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp, if (tc_skip_sw(head->flags)) return -1; + *res = head->res; return tcf_exts_exec(skb, &head->exts, res); } From b463521db854a0b73a14b34d782c51d6f7c87a77 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Sep 2017 10:05:57 -0700 Subject: [PATCH 310/555] tcp: fastopen: fix on syn-data transmit failure [ Upstream commit b5b7db8d680464b1d631fd016f5e093419f0bfd9 ] Our recent change exposed a bug in TCP Fastopen Client that syzkaller found right away [1] When we prepare skb with SYN+DATA, we attempt to transmit it, and we update socket state as if the transmit was a success. In socket RTX queue we have two skbs, one with the SYN alone, and a second one containing the DATA. When (malicious) ACK comes in, we now complain that second one had no skb_mstamp. The proper fix is to make sure that if the transmit failed, we do not pretend we sent the DATA skb, and make it our send_head. When 3WHS completes, we can now send the DATA right away, without having to wait for a timeout. [1] WARNING: CPU: 0 PID: 100189 at net/ipv4/tcp_input.c:3117 tcp_clean_rtx_queue+0x2057/0x2ab0 net/ipv4/tcp_input.c:3117() WARN_ON_ONCE(last_ackt == 0); Modules linked in: CPU: 0 PID: 100189 Comm: syz-executor1 Not tainted Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 0000000000000000 ffff8800b35cb1d8 ffffffff81cad00d 0000000000000000 ffffffff828a4347 ffff88009f86c080 ffffffff8316eb20 0000000000000d7f ffff8800b35cb220 ffffffff812c33c2 ffff8800baad2440 00000009d46575c0 Call Trace: [] __dump_stack [] dump_stack+0xc1/0x124 [] warn_slowpath_common+0xe2/0x150 [] warn_slowpath_null+0x2e/0x40 [] tcp_clean_rtx_queue+0x2057/0x2ab0 n [] tcp_ack+0x151d/0x3930 [] tcp_rcv_state_process+0x1c69/0x4fd0 [] tcp_v4_do_rcv+0x54f/0x7c0 [] sk_backlog_rcv [] __release_sock+0x12b/0x3a0 [] release_sock+0x5e/0x1c0 [] inet_wait_for_connect [] __inet_stream_connect+0x545/0xc50 [] tcp_sendmsg_fastopen [] tcp_sendmsg+0x2298/0x35a0 [] inet_sendmsg+0xe5/0x520 [] sock_sendmsg_nosec [] sock_sendmsg+0xcf/0x110 Fixes: 8c72c65b426b ("tcp: update skb->skb_mstamp more carefully") Fixes: 783237e8daf1 ("net-tcp: Fast Open client - sending SYN-data") Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Cc: Neal Cardwell Cc: Yuchung Cheng Acked-by: Yuchung Cheng Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_output.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b44abddf942c..85920707c4d3 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3329,6 +3329,10 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) goto done; } + /* data was not sent, this is our new send_head */ + sk->sk_send_head = syn_data; + tp->packets_out -= tcp_skb_pcount(syn_data); + fallback: /* Send a regular SYN with Fast Open cookie request option */ if (fo->cookie.len > 0) @@ -3378,6 +3382,11 @@ int tcp_connect(struct sock *sk) */ tp->snd_nxt = tp->write_seq; tp->pushed_seq = tp->write_seq; + buff = tcp_send_head(sk); + if (unlikely(buff)) { + tp->snd_nxt = TCP_SKB_CB(buff)->seq; + tp->pushed_seq = TCP_SKB_CB(buff)->seq; + } TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. */ From 6eac2cd24bd97cb9eada99a01ee6022c573447de Mon Sep 17 00:00:00 2001 From: Christian Lamparter Date: Tue, 19 Sep 2017 19:35:18 +0200 Subject: [PATCH 311/555] net: emac: Fix napi poll list corruption [ Upstream commit f55956065ec94e3e9371463d693a1029c4cc3007 ] This patch is pretty much a carbon copy of commit 3079c652141f ("caif: Fix napi poll list corruption") with "caif" replaced by "emac". The commit d75b1ade567f ("net: less interrupt masking in NAPI") breaks emac. It is now required that if the entire budget is consumed when poll returns, the napi poll_list must remain empty. However, like some other drivers emac tries to do a last-ditch check and if there is more work it will call napi_reschedule and then immediately process some of this new work. Should the entire budget be consumed while processing such new work then we will violate the new caller contract. This patch fixes this by not touching any work when we reschedule in emac. Signed-off-by: Christian Lamparter Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/ibm/emac/mal.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ibm/emac/mal.c b/drivers/net/ethernet/ibm/emac/mal.c index aaf6fec566b5..3660a3d51731 100644 --- a/drivers/net/ethernet/ibm/emac/mal.c +++ b/drivers/net/ethernet/ibm/emac/mal.c @@ -402,7 +402,7 @@ static int mal_poll(struct napi_struct *napi, int budget) unsigned long flags; MAL_DBG2(mal, "poll(%d)" NL, budget); - again: + /* Process TX skbs */ list_for_each(l, &mal->poll_list) { struct mal_commac *mc = @@ -451,7 +451,6 @@ static int mal_poll(struct napi_struct *napi, int budget) spin_lock_irqsave(&mal->lock, flags); mal_disable_eob_irq(mal); spin_unlock_irqrestore(&mal->lock, flags); - goto again; } mc->ops->poll_tx(mc->dev); } From 6f7cdd4aa0a45f21edf6cb31236cd9d10c0d7992 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 14 Sep 2017 17:14:41 -0400 Subject: [PATCH 312/555] packet: hold bind lock when rebinding to fanout hook [ Upstream commit 008ba2a13f2d04c947adc536d19debb8fe66f110 ] Packet socket bind operations must hold the po->bind_lock. This keeps po->running consistent with whether the socket is actually on a ptype list to receive packets. fanout_add unbinds a socket and its packet_rcv/tpacket_rcv call, then binds the fanout object to receive through packet_rcv_fanout. Make it hold the po->bind_lock when testing po->running and rebinding. Else, it can race with other rebind operations, such as that in packet_set_ring from packet_rcv to tpacket_rcv. Concurrent updates can result in a socket being added to a fanout group twice, causing use-after-free KASAN bug reports, among others. Reported independently by both trinity and syzkaller. Verified that the syzkaller reproducer passes after this patch. Fixes: dc99f600698d ("packet: Add fanout support.") Reported-by: nixioaming Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/packet/af_packet.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 9c92c6cb6a4f..29d5fa508667 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1648,10 +1648,6 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) mutex_lock(&fanout_mutex); - err = -EINVAL; - if (!po->running) - goto out; - err = -EALREADY; if (po->fanout) goto out; @@ -1700,7 +1696,10 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) list_add(&match->list, &fanout_list); } err = -EINVAL; - if (match->type == type && + + spin_lock(&po->bind_lock); + if (po->running && + match->type == type && match->prot_hook.type == po->prot_hook.type && match->prot_hook.dev == po->prot_hook.dev) { err = -ENOSPC; @@ -1712,6 +1711,13 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) err = 0; } } + spin_unlock(&po->bind_lock); + + if (err && !refcount_read(&match->sk_ref)) { + list_del(&match->list); + kfree(match); + } + out: if (err && rollover) { kfree(rollover); From 0dee549f79121393e13efc0c2a05a98da00f3eda Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 18 Sep 2017 16:38:36 -0700 Subject: [PATCH 313/555] bpf: one perf event close won't free bpf program attached by another perf event [ Upstream commit ec9dd352d591f0c90402ec67a317c1ed4fb2e638 ] This patch fixes a bug exhibited by the following scenario: 1. fd1 = perf_event_open with attr.config = ID1 2. attach bpf program prog1 to fd1 3. fd2 = perf_event_open with attr.config = ID1 4. user program closes fd2 and prog1 is detached from the tracepoint. 5. user program with fd1 does not work properly as tracepoint no output any more. The issue happens at step 4. Multiple perf_event_open can be called successfully, but only one bpf prog pointer in the tp_event. In the current logic, any fd release for the same tp_event will free the tp_event->prog. The fix is to free tp_event->prog only when the closing fd corresponds to the one which registered the program. Signed-off-by: Yonghong Song Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/linux/trace_events.h | 1 + kernel/events/core.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index be007610ceb0..ba57266d9e80 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -273,6 +273,7 @@ struct trace_event_call { int perf_refcount; struct hlist_head __percpu *perf_events; struct bpf_prog *prog; + struct perf_event *bpf_prog_owner; int (*perf_perm)(struct trace_event_call *, struct perf_event *); diff --git a/kernel/events/core.c b/kernel/events/core.c index c774773ac3a4..36ff2d93f222 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7871,6 +7871,7 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) } } event->tp_event->prog = prog; + event->tp_event->bpf_prog_owner = event; return 0; } @@ -7885,7 +7886,7 @@ static void perf_event_free_bpf_prog(struct perf_event *event) return; prog = event->tp_event->prog; - if (prog) { + if (prog && event->tp_event->bpf_prog_owner == event) { event->tp_event->prog = NULL; bpf_prog_put(prog); } From 93eef2172d2352d6cff07008e689cc56d841f35a Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Tue, 19 Sep 2017 21:49:55 -0400 Subject: [PATCH 314/555] isdn/i4l: fetch the ppp_write buffer in one shot [ Upstream commit 02388bf87f72e1d47174cd8f81c34443920eb5a0 ] In isdn_ppp_write(), the header (i.e., protobuf) of the buffer is fetched twice from userspace. The first fetch is used to peek at the protocol of the message and reset the huptimer if necessary; while the second fetch copies in the whole buffer. However, given that buf resides in userspace memory, a user process can race to change its memory content across fetches. By doing so, we can either avoid resetting the huptimer for any type of packets (by first setting proto to PPP_LCP and later change to the actual type) or force resetting the huptimer for LCP packets. This patch changes this double-fetch behavior into two single fetches decided by condition (lp->isdn_device < 0 || lp->isdn_channel <0). A more detailed discussion can be found at https://marc.info/?l=linux-kernel&m=150586376926123&w=2 Signed-off-by: Meng Xu Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/isdn/i4l/isdn_ppp.c | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/drivers/isdn/i4l/isdn_ppp.c b/drivers/isdn/i4l/isdn_ppp.c index bf3fbd00a091..64b586458d3d 100644 --- a/drivers/isdn/i4l/isdn_ppp.c +++ b/drivers/isdn/i4l/isdn_ppp.c @@ -828,7 +828,6 @@ isdn_ppp_write(int min, struct file *file, const char __user *buf, int count) isdn_net_local *lp; struct ippp_struct *is; int proto; - unsigned char protobuf[4]; is = file->private_data; @@ -842,24 +841,28 @@ isdn_ppp_write(int min, struct file *file, const char __user *buf, int count) if (!lp) printk(KERN_DEBUG "isdn_ppp_write: lp == NULL\n"); else { - /* - * Don't reset huptimer for - * LCP packets. (Echo requests). - */ - if (copy_from_user(protobuf, buf, 4)) - return -EFAULT; - proto = PPP_PROTOCOL(protobuf); - if (proto != PPP_LCP) - lp->huptimer = 0; + if (lp->isdn_device < 0 || lp->isdn_channel < 0) { + unsigned char protobuf[4]; + /* + * Don't reset huptimer for + * LCP packets. (Echo requests). + */ + if (copy_from_user(protobuf, buf, 4)) + return -EFAULT; + + proto = PPP_PROTOCOL(protobuf); + if (proto != PPP_LCP) + lp->huptimer = 0; - if (lp->isdn_device < 0 || lp->isdn_channel < 0) return 0; + } if ((dev->drv[lp->isdn_device]->flags & DRV_FLAG_RUNNING) && lp->dialstate == 0 && (lp->flags & ISDN_NET_CONNECTED)) { unsigned short hl; struct sk_buff *skb; + unsigned char *cpy_buf; /* * we need to reserve enough space in front of * sk_buff. old call to dev_alloc_skb only reserved @@ -872,11 +875,21 @@ isdn_ppp_write(int min, struct file *file, const char __user *buf, int count) return count; } skb_reserve(skb, hl); - if (copy_from_user(skb_put(skb, count), buf, count)) + cpy_buf = skb_put(skb, count); + if (copy_from_user(cpy_buf, buf, count)) { kfree_skb(skb); return -EFAULT; } + + /* + * Don't reset huptimer for + * LCP packets. (Echo requests). + */ + proto = PPP_PROTOCOL(cpy_buf); + if (proto != PPP_LCP) + lp->huptimer = 0; + if (is->debug & 0x40) { printk(KERN_DEBUG "ppp xmit: len %d\n", (int) skb->len); isdn_ppp_frame_log("xmit", skb->data, skb->len, 32, is->unit, lp->ppp_slot); From 5600c7586ad9281dd2bf78cf46f8ad4353c75c9e Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 20 Sep 2017 15:45:36 +0300 Subject: [PATCH 315/555] net_sched: always reset qdisc backlog in qdisc_reset() [ Upstream commit c8e1812960eeae42e2183154927028511c4bc566 ] SKB stored in qdisc->gso_skb also counted into backlog. Some qdiscs don't reset backlog to zero in ->reset(), for example sfq just dequeue and free all queued skb. Signed-off-by: Konstantin Khlebnikov Fixes: 2ccccf5fb43f ("net_sched: update hierarchical backlog too") Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sched/sch_generic.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 6cfb6e9038c2..9016c8baf2aa 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -681,6 +681,7 @@ void qdisc_reset(struct Qdisc *qdisc) qdisc->gso_skb = NULL; } qdisc->q.qlen = 0; + qdisc->qstats.backlog = 0; } EXPORT_SYMBOL(qdisc_reset); From 852bdea5e379df029d19d7f9ae8b6d337bf0fc9d Mon Sep 17 00:00:00 2001 From: Timur Tabi Date: Fri, 22 Sep 2017 15:32:44 -0500 Subject: [PATCH 316/555] net: qcom/emac: specify the correct size when mapping a DMA buffer [ Upstream commit a93ad944f4ff9a797abff17c73fc4b1e4a1d9141 ] When mapping the RX DMA buffers, the driver was accidentally specifying zero for the buffer length. Under normal circumstances, SWIOTLB does not need to allocate a bounce buffer, so the address is just mapped without checking the size field. This is why the error was not detected earlier. Fixes: b9b17debc69d ("net: emac: emac gigabit ethernet controller driver") Cc: stable@vger.kernel.org Signed-off-by: Timur Tabi Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/qualcomm/emac/emac-mac.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qualcomm/emac/emac-mac.c b/drivers/net/ethernet/qualcomm/emac/emac-mac.c index 0b4deb31e742..f683bfbd9986 100644 --- a/drivers/net/ethernet/qualcomm/emac/emac-mac.c +++ b/drivers/net/ethernet/qualcomm/emac/emac-mac.c @@ -932,7 +932,8 @@ static void emac_mac_rx_descs_refill(struct emac_adapter *adpt, curr_rxbuf->dma_addr = dma_map_single(adpt->netdev->dev.parent, skb->data, - curr_rxbuf->length, DMA_FROM_DEVICE); + adpt->rxbuf_size, DMA_FROM_DEVICE); + ret = dma_mapping_error(adpt->netdev->dev.parent, curr_rxbuf->dma_addr); if (ret) { From 6689f8358681b375130867088ed86e7a3fccbdc1 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Tue, 26 Sep 2017 15:14:29 +0300 Subject: [PATCH 317/555] vti: fix use after free in vti_tunnel_xmit/vti6_tnl_xmit [ Upstream commit 36f6ee22d2d66046e369757ec6bbe1c482957ba6 ] When running LTP IPsec tests, KASan might report: BUG: KASAN: use-after-free in vti_tunnel_xmit+0xeee/0xff0 [ip_vti] Read of size 4 at addr ffff880dc6ad1980 by task swapper/0/0 ... Call Trace: dump_stack+0x63/0x89 print_address_description+0x7c/0x290 kasan_report+0x28d/0x370 ? vti_tunnel_xmit+0xeee/0xff0 [ip_vti] __asan_report_load4_noabort+0x19/0x20 vti_tunnel_xmit+0xeee/0xff0 [ip_vti] ? vti_init_net+0x190/0x190 [ip_vti] ? save_stack_trace+0x1b/0x20 ? save_stack+0x46/0xd0 dev_hard_start_xmit+0x147/0x510 ? icmp_echo.part.24+0x1f0/0x210 __dev_queue_xmit+0x1394/0x1c60 ... Freed by task 0: save_stack_trace+0x1b/0x20 save_stack+0x46/0xd0 kasan_slab_free+0x70/0xc0 kmem_cache_free+0x81/0x1e0 kfree_skbmem+0xb1/0xe0 kfree_skb+0x75/0x170 kfree_skb_list+0x3e/0x60 __dev_queue_xmit+0x1298/0x1c60 dev_queue_xmit+0x10/0x20 neigh_resolve_output+0x3a8/0x740 ip_finish_output2+0x5c0/0xe70 ip_finish_output+0x4ba/0x680 ip_output+0x1c1/0x3a0 xfrm_output_resume+0xc65/0x13d0 xfrm_output+0x1e4/0x380 xfrm4_output_finish+0x5c/0x70 Can be fixed if we get skb->len before dst_output(). Fixes: b9959fd3b0fa ("vti: switch to new ip tunnel code") Fixes: 22e1b23dafa8 ("vti6: Support inter address family tunneling.") Signed-off-by: Alexey Kodanev Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/ip_vti.c | 3 ++- net/ipv6/ip6_vti.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 5d7944f394d9..b120b9b11402 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -168,6 +168,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, struct ip_tunnel_parm *parms = &tunnel->parms; struct dst_entry *dst = skb_dst(skb); struct net_device *tdev; /* Device to other host */ + int pkt_len = skb->len; int err; int mtu; @@ -229,7 +230,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, err = dst_output(tunnel->net, skb->sk, skb); if (net_xmit_eval(err) == 0) - err = skb->len; + err = pkt_len; iptunnel_xmit_stats(dev, err); return NETDEV_TX_OK; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 66c2b4b41793..816f79d1a8a3 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -445,6 +445,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) struct dst_entry *dst = skb_dst(skb); struct net_device *tdev; struct xfrm_state *x; + int pkt_len = skb->len; int err = -1; int mtu; @@ -498,7 +499,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); u64_stats_update_begin(&tstats->syncp); - tstats->tx_bytes += skb->len; + tstats->tx_bytes += pkt_len; tstats->tx_packets++; u64_stats_update_end(&tstats->syncp); } else { From e5941137f784ddfbbb14da152a4f8a930cb228f8 Mon Sep 17 00:00:00 2001 From: Ridge Kennedy Date: Wed, 22 Feb 2017 14:59:49 +1300 Subject: [PATCH 318/555] l2tp: Avoid schedule while atomic in exit_net [ Upstream commit 12d656af4e3d2781b9b9f52538593e1717e7c979 ] While destroying a network namespace that contains a L2TP tunnel a "BUG: scheduling while atomic" can be observed. Enabling lockdep shows that this is happening because l2tp_exit_net() is calling l2tp_tunnel_closeall() (via l2tp_tunnel_delete()) from within an RCU critical section. l2tp_exit_net() takes rcu_read_lock_bh() << list_for_each_entry_rcu() >> l2tp_tunnel_delete() l2tp_tunnel_closeall() __l2tp_session_unhash() synchronize_rcu() << Illegal inside RCU critical section >> BUG: sleeping function called from invalid context in_atomic(): 1, irqs_disabled(): 0, pid: 86, name: kworker/u16:2 INFO: lockdep is turned off. CPU: 2 PID: 86 Comm: kworker/u16:2 Tainted: G W O 4.4.6-at1 #2 Hardware name: Xen HVM domU, BIOS 4.6.1-xs125300 05/09/2016 Workqueue: netns cleanup_net 0000000000000000 ffff880202417b90 ffffffff812b0013 ffff880202410ac0 ffffffff81870de8 ffff880202417bb8 ffffffff8107aee8 ffffffff81870de8 0000000000000c51 0000000000000000 ffff880202417be0 ffffffff8107b024 Call Trace: [] dump_stack+0x85/0xc2 [] ___might_sleep+0x148/0x240 [] __might_sleep+0x44/0x80 [] synchronize_sched+0x2d/0xe0 [] ? trace_hardirqs_on+0xd/0x10 [] ? __local_bh_enable_ip+0x6b/0xc0 [] ? _raw_spin_unlock_bh+0x30/0x40 [] __l2tp_session_unhash+0x172/0x220 [] ? __l2tp_session_unhash+0x87/0x220 [] l2tp_tunnel_closeall+0x9b/0x140 [] l2tp_tunnel_delete+0x14/0x60 [] l2tp_exit_net+0x110/0x270 [] ? l2tp_exit_net+0x9c/0x270 [] ops_exit_list.isra.6+0x33/0x60 [] cleanup_net+0x1b6/0x280 ... This bug can easily be reproduced with a few steps: $ sudo unshare -n bash # Create a shell in a new namespace # ip link set lo up # ip addr add 127.0.0.1 dev lo # ip l2tp add tunnel remote 127.0.0.1 local 127.0.0.1 tunnel_id 1 \ peer_tunnel_id 1 udp_sport 50000 udp_dport 50000 # ip l2tp add session name foo tunnel_id 1 session_id 1 \ peer_session_id 1 # ip link set foo up # exit # Exit the shell, in turn exiting the namespace $ dmesg ... [942121.089216] BUG: scheduling while atomic: kworker/u16:3/13872/0x00000200 ... To fix this, move the call to l2tp_tunnel_closeall() out of the RCU critical section, and instead call it from l2tp_tunnel_del_work(), which is running from the l2tp_wq workqueue. Fixes: 2b551c6e7d5b ("l2tp: close sessions before initiating tunnel delete") Signed-off-by: Ridge Kennedy Acked-by: Guillaume Nault Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/l2tp/l2tp_core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 3bce65183c95..3415e20f0065 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1415,6 +1415,9 @@ static void l2tp_tunnel_del_work(struct work_struct *work) struct sock *sk = NULL; tunnel = container_of(work, struct l2tp_tunnel, del_work); + + l2tp_tunnel_closeall(tunnel); + sk = l2tp_tunnel_sock_lookup(tunnel); if (!sk) goto out; @@ -1737,7 +1740,6 @@ EXPORT_SYMBOL_GPL(l2tp_tunnel_create); int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel) { l2tp_tunnel_inc_refcount(tunnel); - l2tp_tunnel_closeall(tunnel); if (false == queue_work(l2tp_wq, &tunnel->del_work)) { l2tp_tunnel_dec_refcount(tunnel); return 1; From b4a9b12d9a2c9bfae8eba546b9e29aea3bd69f9e Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 26 Sep 2017 16:16:43 +0200 Subject: [PATCH 319/555] l2tp: fix race condition in l2tp_tunnel_delete [ Upstream commit 62b982eeb4589b2e6d7c01a90590e3a4c2b2ca19 ] If we try to delete the same tunnel twice, the first delete operation does a lookup (l2tp_tunnel_get), finds the tunnel, calls l2tp_tunnel_delete, which queues it for deletion by l2tp_tunnel_del_work. The second delete operation also finds the tunnel and calls l2tp_tunnel_delete. If the workqueue has already fired and started running l2tp_tunnel_del_work, then l2tp_tunnel_delete will queue the same tunnel a second time, and try to free the socket again. Add a dead flag to prevent firing the workqueue twice. Then we can remove the check of queue_work's result that was meant to prevent that race but doesn't. Reproducer: ip l2tp add tunnel tunnel_id 3000 peer_tunnel_id 4000 local 192.168.0.2 remote 192.168.0.1 encap udp udp_sport 5000 udp_dport 6000 ip l2tp add session name l2tp1 tunnel_id 3000 session_id 1000 peer_session_id 2000 ip link set l2tp1 up ip l2tp del tunnel tunnel_id 3000 ip l2tp del tunnel tunnel_id 3000 Fixes: f8ccac0e4493 ("l2tp: put tunnel socket release on a workqueue") Reported-by: Jianlin Shi Signed-off-by: Sabrina Dubroca Acked-by: Guillaume Nault Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/l2tp/l2tp_core.c | 10 ++++------ net/l2tp/l2tp_core.h | 5 ++++- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 3415e20f0065..b06acd0f400d 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1737,14 +1737,12 @@ EXPORT_SYMBOL_GPL(l2tp_tunnel_create); /* This function is used by the netlink TUNNEL_DELETE command. */ -int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel) +void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel) { - l2tp_tunnel_inc_refcount(tunnel); - if (false == queue_work(l2tp_wq, &tunnel->del_work)) { - l2tp_tunnel_dec_refcount(tunnel); - return 1; + if (!test_and_set_bit(0, &tunnel->dead)) { + l2tp_tunnel_inc_refcount(tunnel); + queue_work(l2tp_wq, &tunnel->del_work); } - return 0; } EXPORT_SYMBOL_GPL(l2tp_tunnel_delete); diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 0095012509ac..42419f1c24cf 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -169,6 +169,9 @@ struct l2tp_tunnel_cfg { struct l2tp_tunnel { int magic; /* Should be L2TP_TUNNEL_MAGIC */ + + unsigned long dead; + struct rcu_head rcu; rwlock_t hlist_lock; /* protect session_hlist */ struct hlist_head session_hlist[L2TP_HASH_SIZE]; @@ -257,7 +260,7 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp); void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel); -int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel); +void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel); struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, From b8990d2e77c6652c9318adc55f698ee4de5e3ee1 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 28 Sep 2017 11:32:37 +0200 Subject: [PATCH 320/555] tun: bail out from tun_get_user() if the skb is empty [ Upstream commit 2580c4c17aee3ad58e9751012bad278dd074ccae ] KMSAN (https://github.com/google/kmsan) reported accessing uninitialized skb->data[0] in the case the skb is empty (i.e. skb->len is 0): ================================================ BUG: KMSAN: use of uninitialized memory in tun_get_user+0x19ba/0x3770 CPU: 0 PID: 3051 Comm: probe Not tainted 4.13.0+ #3140 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: ... __msan_warning_32+0x66/0xb0 mm/kmsan/kmsan_instr.c:477 tun_get_user+0x19ba/0x3770 drivers/net/tun.c:1301 tun_chr_write_iter+0x19f/0x300 drivers/net/tun.c:1365 call_write_iter ./include/linux/fs.h:1743 new_sync_write fs/read_write.c:457 __vfs_write+0x6c3/0x7f0 fs/read_write.c:470 vfs_write+0x3e4/0x770 fs/read_write.c:518 SYSC_write+0x12f/0x2b0 fs/read_write.c:565 SyS_write+0x55/0x80 fs/read_write.c:557 do_syscall_64+0x242/0x330 arch/x86/entry/common.c:284 entry_SYSCALL64_slow_path+0x25/0x25 arch/x86/entry/entry_64.S:245 ... origin: ... kmsan_poison_shadow+0x6e/0xc0 mm/kmsan/kmsan.c:211 slab_alloc_node mm/slub.c:2732 __kmalloc_node_track_caller+0x351/0x370 mm/slub.c:4351 __kmalloc_reserve net/core/skbuff.c:138 __alloc_skb+0x26a/0x810 net/core/skbuff.c:231 alloc_skb ./include/linux/skbuff.h:903 alloc_skb_with_frags+0x1d7/0xc80 net/core/skbuff.c:4756 sock_alloc_send_pskb+0xabf/0xfe0 net/core/sock.c:2037 tun_alloc_skb drivers/net/tun.c:1144 tun_get_user+0x9a8/0x3770 drivers/net/tun.c:1274 tun_chr_write_iter+0x19f/0x300 drivers/net/tun.c:1365 call_write_iter ./include/linux/fs.h:1743 new_sync_write fs/read_write.c:457 __vfs_write+0x6c3/0x7f0 fs/read_write.c:470 vfs_write+0x3e4/0x770 fs/read_write.c:518 SYSC_write+0x12f/0x2b0 fs/read_write.c:565 SyS_write+0x55/0x80 fs/read_write.c:557 do_syscall_64+0x242/0x330 arch/x86/entry/common.c:284 return_from_SYSCALL_64+0x0/0x6a arch/x86/entry/entry_64.S:245 ================================================ Make sure tun_get_user() doesn't touch skb->data[0] unless there is actual data. C reproducer below: ========================== // autogenerated by syzkaller (http://github.com/google/syzkaller) #define _GNU_SOURCE #include #include #include #include #include #include int main() { int sock = socket(PF_INET, SOCK_STREAM, IPPROTO_IP); int tun_fd = open("/dev/net/tun", O_RDWR); struct ifreq req; memset(&req, 0, sizeof(struct ifreq)); strcpy((char*)&req.ifr_name, "gre0"); req.ifr_flags = IFF_UP | IFF_MULTICAST; ioctl(tun_fd, TUNSETIFF, &req); ioctl(sock, SIOCSIFFLAGS, "gre0"); write(tun_fd, "hi", 0); return 0; } ========================== Signed-off-by: Alexander Potapenko Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/tun.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index a931b73393c8..ba7f9e054c4a 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1279,11 +1279,13 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, switch (tun->flags & TUN_TYPE_MASK) { case IFF_TUN: if (tun->flags & IFF_NO_PI) { - switch (skb->data[0] & 0xf0) { - case 0x40: + u8 ip_version = skb->len ? (skb->data[0] >> 4) : 0; + + switch (ip_version) { + case 4: pi.proto = htons(ETH_P_IP); break; - case 0x60: + case 6: pi.proto = htons(ETH_P_IPV6); break; default: From 6eab1f829417973122f235af002499d786a22023 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 25 Sep 2017 15:55:53 -0700 Subject: [PATCH 321/555] net: dsa: Fix network device registration order [ Upstream commit e804441cfe0b60f6c430901946a69c01eac09df1 ] We cannot be registering the network device first, then setting its carrier off and finally connecting it to a PHY, doing that leaves a window during which the carrier is at best inconsistent, and at worse the device is not usable without a down/up sequence since the network device is visible to user space with possibly no PHY device attached. Re-order steps so that they make logical sense. This fixes some devices where the port was not usable after e.g: an unbind then bind of the driver. Fixes: 0071f56e46da ("dsa: Register netdev before phy") Fixes: 91da11f870f0 ("net: Distributed Switch Architecture protocol support") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/dsa/slave.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 079d76bc204c..5000e6f20f4a 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1269,26 +1269,32 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, p->old_duplex = -1; ds->ports[port].netdev = slave_dev; - ret = register_netdev(slave_dev); - if (ret) { - netdev_err(master, "error %d registering interface %s\n", - ret, slave_dev->name); - ds->ports[port].netdev = NULL; - free_netdev(slave_dev); - return ret; - } netif_carrier_off(slave_dev); ret = dsa_slave_phy_setup(p, slave_dev); if (ret) { netdev_err(master, "error %d setting up slave phy\n", ret); - unregister_netdev(slave_dev); - free_netdev(slave_dev); - return ret; + goto out_free; + } + + ret = register_netdev(slave_dev); + if (ret) { + netdev_err(master, "error %d registering interface %s\n", + ret, slave_dev->name); + goto out_phy; } return 0; + +out_phy: + phy_disconnect(p->phy); + if (of_phy_is_fixed_link(ds->ports[port].dn)) + of_phy_deregister_fixed_link(ds->ports[port].dn); +out_free: + free_netdev(slave_dev); + ds->ports[port].netdev = NULL; + return ret; } void dsa_slave_destroy(struct net_device *slave_dev) From 0f22167d3321a028c0b6edc2d5b2ab0e37a2ac53 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 26 Sep 2017 12:19:37 -0400 Subject: [PATCH 322/555] packet: in packet_do_bind, test fanout with bind_lock held [ Upstream commit 4971613c1639d8e5f102c4e797c3bf8f83a5a69e ] Once a socket has po->fanout set, it remains a member of the group until it is destroyed. The prot_hook must be constant and identical across sockets in the group. If fanout_add races with packet_do_bind between the test of po->fanout and taking the lock, the bind call may make type or dev inconsistent with that of the fanout group. Hold po->bind_lock when testing po->fanout to avoid this race. I had to introduce artificial delay (local_bh_enable) to actually observe the race. Fixes: dc99f600698d ("packet: Add fanout support.") Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/packet/af_packet.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 29d5fa508667..0856c125b06d 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3069,13 +3069,15 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, int ret = 0; bool unlisted = false; - if (po->fanout) - return -EINVAL; - lock_sock(sk); spin_lock(&po->bind_lock); rcu_read_lock(); + if (po->fanout) { + ret = -EINVAL; + goto out_unlock; + } + if (name) { dev = dev_get_by_name_rcu(sock_net(sk), name); if (!dev) { From 24ee394a82d2cd92c92b617713a4d263131a902b Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 26 Sep 2017 12:20:17 -0400 Subject: [PATCH 323/555] packet: only test po->has_vnet_hdr once in packet_snd [ Upstream commit da7c9561015e93d10fe6aab73e9288e0d09d65a6 ] Packet socket option po->has_vnet_hdr can be updated concurrently with other operations if no ring is attached. Do not test the option twice in packet_snd, as the value may change in between calls. A race on setsockopt disable may cause a packet > mtu to be sent without having GSO options set. Fixes: bfd5f4a3d605 ("packet: Add GSO/csum offload support.") Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/packet/af_packet.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 0856c125b06d..b17f9097c6fe 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1713,7 +1713,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) } spin_unlock(&po->bind_lock); - if (err && !refcount_read(&match->sk_ref)) { + if (err && !atomic_read(&match->sk_ref)) { list_del(&match->list); kfree(match); } @@ -2838,6 +2838,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) struct virtio_net_hdr vnet_hdr = { 0 }; int offset = 0; struct packet_sock *po = pkt_sk(sk); + bool has_vnet_hdr = false; int hlen, tlen, linear; int extra_len = 0; @@ -2881,6 +2882,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) err = packet_snd_vnet_parse(msg, &len, &vnet_hdr); if (err) goto out_unlock; + has_vnet_hdr = true; } if (unlikely(sock_flag(sk, SOCK_NOFCS))) { @@ -2941,7 +2943,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) packet_pick_tx_queue(dev, skb); - if (po->has_vnet_hdr) { + if (has_vnet_hdr) { err = packet_snd_vnet_gso(skb, &vnet_hdr); if (err) goto out_free; From cf2eaf16ab284e3c5b057dff4c68516cfeae62ba Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Tue, 26 Sep 2017 17:38:50 -0700 Subject: [PATCH 324/555] net: Set sk_prot_creator when cloning sockets to the right proto [ Upstream commit 9d538fa60bad4f7b23193c89e843797a1cf71ef3 ] sk->sk_prot and sk->sk_prot_creator can differ when the app uses IPV6_ADDRFORM (transforming an IPv6-socket to an IPv4-one). Which is why sk_prot_creator is there to make sure that sk_prot_free() does the kmem_cache_free() on the right kmem_cache slab. Now, if such a socket gets transformed back to a listening socket (using connect() with AF_UNSPEC) we will allocate an IPv4 tcp_sock through sk_clone_lock() when a new connection comes in. But sk_prot_creator will still point to the IPv6 kmem_cache (as everything got copied in sk_clone_lock()). When freeing, we will thus put this memory back into the IPv6 kmem_cache although it was allocated in the IPv4 cache. I have seen memory corruption happening because of this. With slub-debugging and MEMCG_KMEM enabled this gives the warning "cache_from_obj: Wrong slab cache. TCPv6 but object is from TCP" A C-program to trigger this: void main(void) { int fd = socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP); int new_fd, newest_fd, client_fd; struct sockaddr_in6 bind_addr; struct sockaddr_in bind_addr4, client_addr1, client_addr2; struct sockaddr unsp; int val; memset(&bind_addr, 0, sizeof(bind_addr)); bind_addr.sin6_family = AF_INET6; bind_addr.sin6_port = ntohs(42424); memset(&client_addr1, 0, sizeof(client_addr1)); client_addr1.sin_family = AF_INET; client_addr1.sin_port = ntohs(42424); client_addr1.sin_addr.s_addr = inet_addr("127.0.0.1"); memset(&client_addr2, 0, sizeof(client_addr2)); client_addr2.sin_family = AF_INET; client_addr2.sin_port = ntohs(42421); client_addr2.sin_addr.s_addr = inet_addr("127.0.0.1"); memset(&unsp, 0, sizeof(unsp)); unsp.sa_family = AF_UNSPEC; bind(fd, (struct sockaddr *)&bind_addr, sizeof(bind_addr)); listen(fd, 5); client_fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); connect(client_fd, (struct sockaddr *)&client_addr1, sizeof(client_addr1)); new_fd = accept(fd, NULL, NULL); close(fd); val = AF_INET; setsockopt(new_fd, SOL_IPV6, IPV6_ADDRFORM, &val, sizeof(val)); connect(new_fd, &unsp, sizeof(unsp)); memset(&bind_addr4, 0, sizeof(bind_addr4)); bind_addr4.sin_family = AF_INET; bind_addr4.sin_port = ntohs(42421); bind(new_fd, (struct sockaddr *)&bind_addr4, sizeof(bind_addr4)); listen(new_fd, 5); client_fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); connect(client_fd, (struct sockaddr *)&client_addr2, sizeof(client_addr2)); newest_fd = accept(new_fd, NULL, NULL); close(new_fd); close(client_fd); close(new_fd); } As far as I can see, this bug has been there since the beginning of the git-days. Signed-off-by: Christoph Paasch Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/sock.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/sock.c b/net/core/sock.c index 1989b3dd6d17..231c38d91855 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1493,6 +1493,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) sock_copy(newsk, sk); + newsk->sk_prot_creator = sk->sk_prot; + /* SANITY */ if (likely(newsk->sk_net_refcnt)) get_net(sock_net(newsk)); From b4a119251f6b29fd06153a3e241fe2b85e9fb159 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Thu, 28 Sep 2017 00:41:44 +0200 Subject: [PATCH 325/555] netlink: do not proceed if dump's start() errs [ Upstream commit fef0035c0f31322d417d1954bba5ab959bf91183 ] Drivers that use the start method for netlink dumping rely on dumpit not being called if start fails. For example, ila_xlat.c allocates memory and assigns it to cb->args[0] in its start() function. It might fail to do that and return -ENOMEM instead. However, even when returning an error, dumpit will be called, which, in the example above, quickly dereferences the memory in cb->args[0], which will OOPS the kernel. This is but one example of how this goes wrong. Since start() has always been a function with an int return type, it therefore makes sense to use it properly, rather than ignoring it. This patch thus returns early and does not call dumpit() when start() fails. Signed-off-by: Jason A. Donenfeld Cc: Johannes Berg Reviewed-by: Johannes Berg Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/netlink/af_netlink.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 246f29d365c0..2a5775f8a6ca 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2211,10 +2211,13 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, mutex_unlock(nlk->cb_mutex); + ret = 0; if (cb->start) - cb->start(cb); + ret = cb->start(cb); + + if (!ret) + ret = netlink_dump(sk); - ret = netlink_dump(sk); sock_put(sk); if (ret) From ab4da56f61bed798a833b8fd9cf64d88b1ba27a3 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 28 Sep 2017 13:23:50 +0800 Subject: [PATCH 326/555] ip6_gre: ip6gre_tap device should keep dst [ Upstream commit 2d40557cc702ed8e5edd9bd422233f86652d932e ] The patch 'ip_gre: ipgre_tap device should keep dst' fixed a issue that ipgre_tap mtu couldn't be updated in tx path. The same fix is needed for ip6gre_tap as well. Signed-off-by: Xin Long Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/ip6_gre.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index b19a0966a5fe..41c10486cf7e 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1298,6 +1298,7 @@ static void ip6gre_tap_setup(struct net_device *dev) dev->features |= NETIF_F_NETNS_LOCAL; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netif_keep_dst(dev); } static bool ip6gre_netlink_encap_parms(struct nlattr *data[], From 09788d46b756a71313378b56d1a927a5ee64f7ce Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 28 Sep 2017 13:24:07 +0800 Subject: [PATCH 327/555] ip6_tunnel: update mtu properly for ARPHRD_ETHER tunnel device in tx path [ Upstream commit d41bb33ba33b8f8debe54ed36be6925eb496e354 ] Now when updating mtu in tx path, it doesn't consider ARPHRD_ETHER tunnel device, like ip6gre_tap tunnel, for which it should also subtract ether header to get the correct mtu. Signed-off-by: Xin Long Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/ip6_tunnel.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index a39f28ffbd63..12b2fd512f32 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1042,6 +1042,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, struct dst_entry *dst = NULL, *ndst = NULL; struct net_device *tdev; int mtu; + unsigned int eth_hlen = t->dev->type == ARPHRD_ETHER ? ETH_HLEN : 0; unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen; unsigned int max_headroom = psh_hlen; bool use_cache = false; @@ -1120,7 +1121,7 @@ route_lookup: t->parms.name); goto tx_err_dst_release; } - mtu = dst_mtu(dst) - psh_hlen - t->tun_hlen; + mtu = dst_mtu(dst) - eth_hlen - psh_hlen - t->tun_hlen; if (encap_limit >= 0) { max_headroom += 8; mtu -= 8; @@ -1129,7 +1130,7 @@ route_lookup: mtu = IPV6_MIN_MTU; if (skb_dst(skb) && !t->parms.collect_md) skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); - if (skb->len - t->tun_hlen > mtu && !skb_is_gso(skb)) { + if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) { *pmtu = mtu; err = -EMSGSIZE; goto tx_err_dst_release; From 58b1b8407a3121b7dc8b8612f7802647833b5069 Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Fri, 29 Sep 2017 10:02:54 +0200 Subject: [PATCH 328/555] tipc: use only positive error codes in messages [ Upstream commit aad06212d36cf34859428a0a279e5c14ee5c9e26 ] In commit e3a77561e7d32 ("tipc: split up function tipc_msg_eval()"), we have updated the function tipc_msg_lookup_dest() to set the error codes to negative values at destination lookup failures. Thus when the function sets the error code to -TIPC_ERR_NO_NAME, its inserted into the 4 bit error field of the message header as 0xf instead of TIPC_ERR_NO_NAME (1). The value 0xf is an unknown error code. In this commit, we set only positive error code. Fixes: e3a77561e7d32 ("tipc: split up function tipc_msg_eval()") Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/tipc/msg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 56ea0adcd285..912f1fb97c06 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -547,7 +547,7 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err) return false; if (msg_errcode(msg)) return false; - *err = -TIPC_ERR_NO_NAME; + *err = TIPC_ERR_NO_NAME; if (skb_linearize(skb)) return false; msg = buf_msg(skb); From 95206ea376b9ed43ff7cac7f944f654b4314f754 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 3 Oct 2017 13:20:48 +0300 Subject: [PATCH 329/555] net: rtnetlink: fix info leak in RTM_GETSTATS call [ Upstream commit ce024f42c2e28b6bce4ecc1e891b42f57f753892 ] When RTM_GETSTATS was added the fields of its header struct were not all initialized when returning the result thus leaking 4 bytes of information to user-space per rtnl_fill_statsinfo call, so initialize them now. Thanks to Alexander Potapenko for the detailed report and bisection. Reported-by: Alexander Potapenko Fixes: 10c9ead9f3c6 ("rtnetlink: add new RTM_GETSTATS message to dump link stats") Signed-off-by: Nikolay Aleksandrov Acked-by: Roopa Prabhu Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/rtnetlink.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 4d2629781e8b..c2339b865164 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3758,6 +3758,9 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, return -EMSGSIZE; ifsm = nlmsg_data(nlh); + ifsm->family = PF_UNSPEC; + ifsm->pad1 = 0; + ifsm->pad2 = 0; ifsm->ifindex = dev->ifindex; ifsm->filter_mask = filter_mask; From 02f7e4101092b88e57c73171174976c8a72a3eba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 2 Oct 2017 12:20:51 -0700 Subject: [PATCH 330/555] socket, bpf: fix possible use after free [ Upstream commit eefca20eb20c66b06cf5ed09b49b1a7caaa27b7b ] Starting from linux-4.4, 3WHS no longer takes the listener lock. Since this time, we might hit a use-after-free in sk_filter_charge(), if the filter we got in the memcpy() of the listener content just happened to be replaced by a thread changing listener BPF filter. To fix this, we need to make sure the filter refcount is not already zero before incrementing it again. Fixes: e994b2f0fb92 ("tcp: do not lock listener to process SYN packets") Signed-off-by: Eric Dumazet Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/filter.c | 15 +++++++++++++-- net/core/sock.c | 5 ++++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 4eb4ce0aeef4..bfeedbbde214 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -937,20 +937,31 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) /* try to charge the socket memory if there is space available * return true on success */ -bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) +static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp) { u32 filter_size = bpf_prog_size(fp->prog->len); /* same check as in sock_kmalloc() */ if (filter_size <= sysctl_optmem_max && atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) { - atomic_inc(&fp->refcnt); atomic_add(filter_size, &sk->sk_omem_alloc); return true; } return false; } +bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) +{ + if (!atomic_inc_not_zero(&fp->refcnt)) + return false; + + if (!__sk_filter_charge(sk, fp)) { + sk_filter_release(fp); + return false; + } + return true; +} + static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) { struct sock_filter *old_prog; diff --git a/net/core/sock.c b/net/core/sock.c index 231c38d91855..2a77cc50f021 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1528,13 +1528,16 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) sock_reset_flag(newsk, SOCK_DONE); skb_queue_head_init(&newsk->sk_error_queue); - filter = rcu_dereference_protected(newsk->sk_filter, 1); + rcu_read_lock(); + filter = rcu_dereference(sk->sk_filter); if (filter != NULL) /* though it's an empty new sock, the charging may fail * if sysctl_optmem_max was changed between creation of * original socket and cloning */ is_charged = sk_filter_charge(newsk, filter); + RCU_INIT_POINTER(newsk->sk_filter, filter); + rcu_read_unlock(); if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { /* We need to make sure that we don't uncharge the new From afebf5ef60da6d15e75398e41ea2817c7a2bb283 Mon Sep 17 00:00:00 2001 From: Cyril Bur Date: Thu, 17 Aug 2017 20:42:26 +1000 Subject: [PATCH 331/555] powerpc/64s: Use emergency stack for kernel TM Bad Thing program checks commit 265e60a170d0a0ecfc2d20490134ed2c48dd45ab upstream. When using transactional memory (TM), the CPU can be in one of six states as far as TM is concerned, encoded in the Machine State Register (MSR). Certain state transitions are illegal and if attempted trigger a "TM Bad Thing" type program check exception. If we ever hit one of these exceptions it's treated as a bug, ie. we oops, and kill the process and/or panic, depending on configuration. One case where we can trigger a TM Bad Thing, is when returning to userspace after a system call or interrupt, using RFID. When this happens the CPU first restores the user register state, in particular r1 (the stack pointer) and then attempts to update the MSR. However the MSR update is not allowed and so we take the program check with the user register state, but the kernel MSR. This tricks the exception entry code into thinking we have a bad kernel stack pointer, because the MSR says we're coming from the kernel, but r1 is pointing to userspace. To avoid this we instead always switch to the emergency stack if we take a TM Bad Thing from the kernel. That way none of the user register values are used, other than for printing in the oops message. This is the fix for CVE-2017-1000255. Fixes: 5d176f751ee3 ("powerpc: tm: Enable transactional memory (TM) lazily for userspace") Signed-off-by: Cyril Bur [mpe: Rewrite change log & comments, tweak asm slightly] Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kernel/exceptions-64s.S | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 2e2fc1e37715..fd68e19b9ef7 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -764,7 +764,29 @@ EXC_REAL(program_check, 0x700, 0x800) EXC_VIRT(program_check, 0x4700, 0x4800, 0x700) TRAMP_KVM(PACA_EXGEN, 0x700) EXC_COMMON_BEGIN(program_check_common) - EXCEPTION_PROLOG_COMMON(0x700, PACA_EXGEN) + /* + * It's possible to receive a TM Bad Thing type program check with + * userspace register values (in particular r1), but with SRR1 reporting + * that we came from the kernel. Normally that would confuse the bad + * stack logic, and we would report a bad kernel stack pointer. Instead + * we switch to the emergency stack if we're taking a TM Bad Thing from + * the kernel. + */ + li r10,MSR_PR /* Build a mask of MSR_PR .. */ + oris r10,r10,0x200000@h /* .. and SRR1_PROGTM */ + and r10,r10,r12 /* Mask SRR1 with that. */ + srdi r10,r10,8 /* Shift it so we can compare */ + cmpldi r10,(0x200000 >> 8) /* .. with an immediate. */ + bne 1f /* If != go to normal path. */ + + /* SRR1 had PR=0 and SRR1_PROGTM=1, so use the emergency stack */ + andi. r10,r12,MSR_PR; /* Set CR0 correctly for label */ + /* 3 in EXCEPTION_PROLOG_COMMON */ + mr r10,r1 /* Save r1 */ + ld r1,PACAEMERGSP(r13) /* Use emergency stack */ + subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ + b 3f /* Jump into the macro !! */ +1: EXCEPTION_PROLOG_COMMON(0x700, PACA_EXGEN) bl save_nvgprs RECONCILE_IRQ_STATE(r10, r11) addi r3,r1,STACK_FRAME_OVERHEAD From 6a988259b1cbbde82d4e2d844b376a81e78c7a32 Mon Sep 17 00:00:00 2001 From: Gustavo Romero Date: Tue, 22 Aug 2017 17:20:09 -0400 Subject: [PATCH 332/555] powerpc/tm: Fix illegal TM state in signal handler commit 044215d145a7a8a60ffa8fdc859d110a795fa6ea upstream. Currently it's possible that on returning from the signal handler through the restore_tm_sigcontexts() code path (e.g. from a signal caught due to a `trap` instruction executed in the middle of an HTM block, or a deliberately constructed sigframe) an illegal TM state (like TS=10 TM=0, i.e. "T0") is set in SRR1 and when `rfid` sets implicitly the MSR register from SRR1 register on return to userspace it causes a TM Bad Thing exception. That illegal state can be set (a) by a malicious user that disables the TM bit by tweaking the bits in uc_mcontext before returning from the signal handler or (b) by a sufficient number of context switches occurring such that the load_tm counter overflows and TM is disabled whilst in the signal handler. This commit fixes the illegal TM state by ensuring that TM bit is always enabled before we return from restore_tm_sigcontexts(). A small comment correction is made as well. Fixes: 5d176f751ee3 ("powerpc: tm: Enable transactional memory (TM) lazily for userspace") Signed-off-by: Gustavo Romero Signed-off-by: Breno Leitao Signed-off-by: Cyril Bur Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kernel/signal_64.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index 96698fdf93b4..04e92257fd69 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -452,9 +452,20 @@ static long restore_tm_sigcontexts(struct task_struct *tsk, if (MSR_TM_RESV(msr)) return -EINVAL; - /* pull in MSR TM from user context */ + /* pull in MSR TS bits from user context */ regs->msr = (regs->msr & ~MSR_TS_MASK) | (msr & MSR_TS_MASK); + /* + * Ensure that TM is enabled in regs->msr before we leave the signal + * handler. It could be the case that (a) user disabled the TM bit + * through the manipulation of the MSR bits in uc_mcontext or (b) the + * TM bit was disabled because a sufficient number of context switches + * happened whilst in the signal handler and load_tm overflowed, + * disabling the TM bit. In either case we can end up with an illegal + * TM state leading to a TM Bad Thing when we return to userspace. + */ + regs->msr |= MSR_TM; + /* pull in MSR LE from user context */ regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE); From fc3c67226acd0df7ae248749e13248477b483bf0 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 26 Sep 2017 12:41:52 +0100 Subject: [PATCH 333/555] percpu: make this_cpu_generic_read() atomic w.r.t. interrupts commit e88d62cd4b2f0b1ae55e9008e79c2794b1fc914d upstream. As raw_cpu_generic_read() is a plain read from a raw_cpu_ptr() address, it's possible (albeit unlikely) that the compiler will split the access across multiple instructions. In this_cpu_generic_read() we disable preemption but not interrupts before calling raw_cpu_generic_read(). Thus, an interrupt could be taken in the middle of the split load instructions. If a this_cpu_write() or RMW this_cpu_*() op is made to the same variable in the interrupt handling path, this_cpu_read() will return a torn value. For native word types, we can avoid tearing using READ_ONCE(), but this won't work in all cases (e.g. 64-bit types on most 32-bit platforms). This patch reworks this_cpu_generic_read() to use READ_ONCE() where possible, otherwise falling back to disabling interrupts. Signed-off-by: Mark Rutland Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Peter Zijlstra Cc: Pranith Kumar Cc: Tejun Heo Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- include/asm-generic/percpu.h | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index 0504ef8f3aa3..976f8ac26665 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -115,15 +115,35 @@ do { \ (__ret); \ }) -#define this_cpu_generic_read(pcp) \ +#define __this_cpu_generic_read_nopreempt(pcp) \ ({ \ typeof(pcp) __ret; \ preempt_disable_notrace(); \ - __ret = raw_cpu_generic_read(pcp); \ + __ret = READ_ONCE(*raw_cpu_ptr(&(pcp))); \ preempt_enable_notrace(); \ __ret; \ }) +#define __this_cpu_generic_read_noirq(pcp) \ +({ \ + typeof(pcp) __ret; \ + unsigned long __flags; \ + raw_local_irq_save(__flags); \ + __ret = raw_cpu_generic_read(pcp); \ + raw_local_irq_restore(__flags); \ + __ret; \ +}) + +#define this_cpu_generic_read(pcp) \ +({ \ + typeof(pcp) __ret; \ + if (__native_word(pcp)) \ + __ret = __this_cpu_generic_read_nopreempt(pcp); \ + else \ + __ret = __this_cpu_generic_read_noirq(pcp); \ + __ret; \ +}) + #define this_cpu_generic_to_op(pcp, val, op) \ do { \ unsigned long __flags; \ From a97ca4f7801815753423985b7aa20cd661b7049f Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Mon, 11 Sep 2017 09:45:42 +0200 Subject: [PATCH 334/555] driver core: platform: Don't read past the end of "driver_override" buffer commit bf563b01c2895a4bfd1a29cc5abc67fe706ecffd upstream. When printing the driver_override parameter when it is 4095 and 4094 bytes long, the printing code would access invalid memory because we need count+1 bytes for printing. Reject driver_override values of these lengths in driver_override_store(). This is in close analogy to commit 4efe874aace5 ("PCI: Don't read past the end of sysfs "driver_override" buffer") from Sasha Levin. Fixes: 3d713e0e382e ("driver core: platform: add device binding path 'driver_override'") Signed-off-by: Nicolai Stange Signed-off-by: Greg Kroah-Hartman --- drivers/base/platform.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/base/platform.c b/drivers/base/platform.c index 5eba47815bb6..14ff40371f01 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -858,7 +858,8 @@ static ssize_t driver_override_store(struct device *dev, struct platform_device *pdev = to_platform_device(dev); char *driver_override, *old, *cp; - if (count > PATH_MAX) + /* We need to keep extra room for a newline */ + if (count >= (PAGE_SIZE - 1)) return -EINVAL; driver_override = kstrndup(buf, count, GFP_KERNEL); From c541aaad4ac7830c4ee3d14ed984c423b9e7e479 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Thu, 21 Sep 2017 23:41:48 -0700 Subject: [PATCH 335/555] Drivers: hv: fcopy: restore correct transfer length commit 549e658a0919e355a2b2144dc380b3729bef7f3e upstream. Till recently the expected length of bytes read by the daemon did depend on the context. It was either hv_start_fcopy or hv_do_fcopy. The daemon had a buffer size of two pages, which was much larger than needed. Now the expected length of bytes read by the daemon changed slightly. For START_FILE_COPY it is still the size of hv_start_fcopy. But for WRITE_TO_FILE and the other operations it is as large as the buffer that arrived via vmbus. In case of WRITE_TO_FILE that is slightly larger than a struct hv_do_fcopy. Since the buffer in the daemon was still larger everything was fine. Currently, the daemon reads only what is actually needed. The new buffer layout is as large as a struct hv_do_fcopy, for the WRITE_TO_FILE operation. Since the kernel expects a slightly larger size, hvt_op_read will return -EINVAL because the daemon will read slightly less than expected. Address this by restoring the expected buffer size in case of WRITE_TO_FILE. Fixes: 'c7e490fc23eb ("Drivers: hv: fcopy: convert to hv_utils_transport")' Fixes: '3f2baa8a7d2e ("Tools: hv: update buffer handling in hv_fcopy_daemon")' Signed-off-by: Olaf Hering Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman Signed-off-by: Greg Kroah-Hartman --- drivers/hv/hv_fcopy.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/hv/hv_fcopy.c b/drivers/hv/hv_fcopy.c index e47d8c9db03a..75126e4e3f05 100644 --- a/drivers/hv/hv_fcopy.c +++ b/drivers/hv/hv_fcopy.c @@ -161,6 +161,10 @@ static void fcopy_send_data(struct work_struct *dummy) out_src = smsg_out; break; + case WRITE_TO_FILE: + out_src = fcopy_transaction.fcopy_msg; + out_len = sizeof(struct hv_do_fcopy); + break; default: out_src = fcopy_transaction.fcopy_msg; out_len = fcopy_transaction.recv_len; From 3ff8bc813b1301ee1395b4629b5136559e3fc6ba Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 19 Sep 2017 18:47:40 +0300 Subject: [PATCH 336/555] stm class: Fix a use-after-free commit fd085bb1766d6a598f53af2308374a546a49775a upstream. For reasons unknown, the stm_source removal path uses device_destroy() to kill the underlying device object. Because device_destroy() uses devt to look for the device to destroy and the fact that stm_source devices don't have one (or all have the same one), it just picks the first device in the class, which may well be the wrong one. That is, loading stm_console and stm_heartbeat and then removing both will die in dereferencing a freed object. Since this should have been device_unregister() in the first place, use it instead of device_destroy(). Signed-off-by: Alexander Shishkin Fixes: 7bd1d4093c2 ("stm class: Introduce an abstraction for System Trace Module devices") Signed-off-by: Greg Kroah-Hartman --- drivers/hwtracing/stm/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwtracing/stm/core.c b/drivers/hwtracing/stm/core.c index a6ea387b5b00..877a0ed76abf 100644 --- a/drivers/hwtracing/stm/core.c +++ b/drivers/hwtracing/stm/core.c @@ -1119,7 +1119,7 @@ void stm_source_unregister_device(struct stm_source_data *data) stm_source_link_drop(src); - device_destroy(&stm_source_class, src->dev.devt); + device_unregister(&src->dev); } EXPORT_SYMBOL_GPL(stm_source_unregister_device); From a3ec104976f799808c2c1d8b32005c67b0037adb Mon Sep 17 00:00:00 2001 From: Shu Wang Date: Tue, 12 Sep 2017 10:14:54 +0800 Subject: [PATCH 337/555] ftrace: Fix kmemleak in unregister_ftrace_graph commit 2b0b8499ae75df91455bbeb7491d45affc384fb0 upstream. The trampoline allocated by function tracer was overwriten by function_graph tracer, and caused a memory leak. The save_global_trampoline should have saved the previous trampoline in register_ftrace_graph() and restored it in unregister_ftrace_graph(). But as it is implemented, save_global_trampoline was only used in unregister_ftrace_graph as default value 0, and it overwrote the previous trampoline's value. Causing the previous allocated trampoline to be lost. kmmeleak backtrace: kmemleak_vmalloc+0x77/0xc0 __vmalloc_node_range+0x1b5/0x2c0 module_alloc+0x7c/0xd0 arch_ftrace_update_trampoline+0xb5/0x290 ftrace_startup+0x78/0x210 register_ftrace_function+0x8b/0xd0 function_trace_init+0x4f/0x80 tracing_set_tracer+0xe6/0x170 tracing_set_trace_write+0x90/0xd0 __vfs_write+0x37/0x170 vfs_write+0xb2/0x1b0 SyS_write+0x55/0xc0 do_syscall_64+0x67/0x180 return_from_SYSCALL_64+0x0/0x6a [ Looking further into this, I found that this was left over from when the function and function graph tracers shared the same ftrace_ops. But in commit 5f151b2401 ("ftrace: Fix function_profiler and function tracer together"), the two were separated, and the save_global_trampoline no longer was necessary (and it may have been broken back then too). -- Steven Rostedt ] Link: http://lkml.kernel.org/r/20170912021454.5976-1-shuwang@redhat.com Fixes: 5f151b2401 ("ftrace: Fix function_profiler and function tracer together") Signed-off-by: Shu Wang Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- kernel/trace/ftrace.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 53ed8ae5de1c..5b8d7189e147 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4381,9 +4381,6 @@ static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata; static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer); -static unsigned long save_global_trampoline; -static unsigned long save_global_flags; - static int __init set_graph_function(char *str) { strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); @@ -5981,17 +5978,6 @@ void unregister_ftrace_graph(void) unregister_pm_notifier(&ftrace_suspend_notifier); unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); -#ifdef CONFIG_DYNAMIC_FTRACE - /* - * Function graph does not allocate the trampoline, but - * other global_ops do. We need to reset the ALLOC_TRAMP flag - * if one was used. - */ - global_ops.trampoline = save_global_trampoline; - if (save_global_flags & FTRACE_OPS_FL_ALLOC_TRAMP) - global_ops.flags |= FTRACE_OPS_FL_ALLOC_TRAMP; -#endif - out: mutex_unlock(&ftrace_lock); } From 5abb9cd4ff92c03410679842ba0cf9be4162873b Mon Sep 17 00:00:00 2001 From: Adrian Salido Date: Fri, 8 Sep 2017 10:55:27 -0700 Subject: [PATCH 338/555] HID: i2c-hid: allocate hid buffers for real worst case commit 8320caeeffdefec3b58b9d4a7ed8e1079492fe7b upstream. The buffer allocation is not currently accounting for an extra byte for the report id. This can cause an out of bounds access in function i2c_hid_set_or_send_report() with reportID > 15. Signed-off-by: Adrian Salido Reviewed-by: Benson Leung Signed-off-by: Guenter Roeck Signed-off-by: Dmitry Torokhov Signed-off-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- drivers/hid/i2c-hid/i2c-hid.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/hid/i2c-hid/i2c-hid.c b/drivers/hid/i2c-hid/i2c-hid.c index 8008e06b7efe..865e7c262322 100644 --- a/drivers/hid/i2c-hid/i2c-hid.c +++ b/drivers/hid/i2c-hid/i2c-hid.c @@ -604,7 +604,8 @@ static int i2c_hid_alloc_buffers(struct i2c_hid *ihid, size_t report_size) { /* the worst case is computed from the set_report command with a * reportID > 15 and the maximum report length */ - int args_len = sizeof(__u8) + /* optional ReportID byte */ + int args_len = sizeof(__u8) + /* ReportID */ + sizeof(__u8) + /* optional ReportID byte */ sizeof(__u16) + /* data register */ sizeof(__u16) + /* size of the report */ report_size; /* report */ From 04b54e8ff7d01b6e877f9a7915bc30be67dd5c39 Mon Sep 17 00:00:00 2001 From: Aaron Armstrong Skomra Date: Mon, 28 Aug 2017 14:15:39 -0700 Subject: [PATCH 339/555] HID: wacom: leds: Don't try to control the EKR's read-only LEDs commit 74aebed6dc13425233f2224668353cff7a112776 upstream. Commit a50aac7193f1 introduces 'led.groups' and adds EKR support for these groups. However, unlike the other devices with LEDs, the EKR's LEDs are read-only and we shouldn't attempt to control them in wacom_led_control(). See bug: https://sourceforge.net/p/linuxwacom/bugs/342/ Fixes: a50aac7193f1 ("HID: wacom: leds: dynamically allocate LED groups") Signed-off-by: Aaron Armstrong Skomra Reviewed-by: Jason Gerecke Signed-off-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- drivers/hid/wacom_sys.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/hid/wacom_sys.c b/drivers/hid/wacom_sys.c index 53ac19b3727a..0224df2991af 100644 --- a/drivers/hid/wacom_sys.c +++ b/drivers/hid/wacom_sys.c @@ -712,6 +712,9 @@ static int wacom_led_control(struct wacom *wacom) if (!wacom->led.groups) return -ENOTSUPP; + if (wacom->wacom_wac.features.type == REMOTE) + return -ENOTSUPP; + if (wacom->wacom_wac.pid) { /* wireless connected */ report_id = WAC_CMD_WL_LED_CONTROL; buf_size = 13; From 953f5e7c6216d26d064baa3e0c5ed081a4ca552d Mon Sep 17 00:00:00 2001 From: Jason Gerecke Date: Mon, 18 Sep 2017 09:27:42 -0700 Subject: [PATCH 340/555] HID: wacom: Always increment hdev refcount within wacom_get_hdev_data commit 2a5e597c6bb1b873e473e5f57147e9e5d2755430 upstream. The wacom_get_hdev_data function is used to find and return a reference to the "other half" of a Wacom device (i.e., the touch device associated with a pen, or vice-versa). To ensure these references are properly accounted for, the function is supposed to automatically increment the refcount before returning. This was not done, however, for devices which have pen & touch on different interfaces of the same USB device. This can lead to a WARNING ("refcount_t: underflow; use-after-free") when removing the module or device as we call kref_put() more times than kref_get(). Triggering an "actual" use- after-free would be difficult since both devices will disappear nearly- simultaneously. To silence this warning and prevent the potential error, we need to increment the refcount for all cases within wacom_get_hdev_data. Fixes: 41372d5d40 ("HID: wacom: Augment 'oVid' and 'oPid' with heuristics for HID_GENERIC") Signed-off-by: Jason Gerecke Reviewed-by: Ping Cheng Signed-off-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- drivers/hid/wacom_sys.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/hid/wacom_sys.c b/drivers/hid/wacom_sys.c index 0224df2991af..d72dfb2bbdb8 100644 --- a/drivers/hid/wacom_sys.c +++ b/drivers/hid/wacom_sys.c @@ -611,8 +611,10 @@ static struct wacom_hdev_data *wacom_get_hdev_data(struct hid_device *hdev) /* Try to find an already-probed interface from the same device */ list_for_each_entry(data, &wacom_udev_list, list) { - if (compare_device_paths(hdev, data->dev, '/')) + if (compare_device_paths(hdev, data->dev, '/')) { + kref_get(&data->kref); return data; + } } /* Fallback to finding devices that appear to be "siblings" */ From 50b27486ae8a3b3d9dddbfeaa6d6be843d414de4 Mon Sep 17 00:00:00 2001 From: Ping Cheng Date: Thu, 31 Aug 2017 15:50:03 -0700 Subject: [PATCH 341/555] HID: wacom: bits shifted too much for 9th and 10th buttons commit ce06760ba46b66dae50f2519ae76bd15e89b5710 upstream. Cintiq 12 has 10 expresskey buttons. The bit shift for the last two buttons were off by 5. Fixes: c7f0522 ("HID: wacom: Slim down wacom_intuos_pad processing") Signed-off-by: Ping Cheng Tested-by: Matthieu Robin Signed-off-by: Jiri Kosina Cc: Jason Gerecke Signed-off-by: Greg Kroah-Hartman --- drivers/hid/wacom_wac.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c index c6a922ee5d3b..db951c4fd6dd 100644 --- a/drivers/hid/wacom_wac.c +++ b/drivers/hid/wacom_wac.c @@ -559,8 +559,8 @@ static int wacom_intuos_pad(struct wacom_wac *wacom) keys = data[9] & 0x07; } } else { - buttons = ((data[6] & 0x10) << 10) | - ((data[5] & 0x10) << 9) | + buttons = ((data[6] & 0x10) << 5) | + ((data[5] & 0x10) << 4) | ((data[6] & 0x0F) << 4) | (data[5] & 0x0F); } From 57a77fffb0ffa8d9a56536ab8f6c1028abe83137 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 22 Sep 2017 23:29:18 +0200 Subject: [PATCH 342/555] rocker: fix rocker_tlv_put_* functions for KASAN commit 6098d7ddd62f532f80ee2a4b01aca500a8e4e9e4 upstream. Inlining these functions creates lots of stack variables that each take 64 bytes when KASAN is enabled, leading to this warning about potential stack overflow: drivers/net/ethernet/rocker/rocker_ofdpa.c: In function 'ofdpa_cmd_flow_tbl_add': drivers/net/ethernet/rocker/rocker_ofdpa.c:621:1: error: the frame size of 2752 bytes is larger than 1536 bytes [-Werror=frame-larger-than=] gcc-8 can now consolidate the stack slots itself, but on older versions we get the same behavior by using a temporary variable that holds a copy of the inline function argument. Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81715 Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/rocker/rocker_tlv.h | 48 +++++++++++++++--------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/rocker/rocker_tlv.h b/drivers/net/ethernet/rocker/rocker_tlv.h index a63ef82e7c72..dfae3c9d57c6 100644 --- a/drivers/net/ethernet/rocker/rocker_tlv.h +++ b/drivers/net/ethernet/rocker/rocker_tlv.h @@ -139,40 +139,52 @@ rocker_tlv_start(struct rocker_desc_info *desc_info) int rocker_tlv_put(struct rocker_desc_info *desc_info, int attrtype, int attrlen, const void *data); -static inline int rocker_tlv_put_u8(struct rocker_desc_info *desc_info, - int attrtype, u8 value) +static inline int +rocker_tlv_put_u8(struct rocker_desc_info *desc_info, int attrtype, u8 value) { - return rocker_tlv_put(desc_info, attrtype, sizeof(u8), &value); + u8 tmp = value; /* work around GCC PR81715 */ + + return rocker_tlv_put(desc_info, attrtype, sizeof(u8), &tmp); } -static inline int rocker_tlv_put_u16(struct rocker_desc_info *desc_info, - int attrtype, u16 value) +static inline int +rocker_tlv_put_u16(struct rocker_desc_info *desc_info, int attrtype, u16 value) { - return rocker_tlv_put(desc_info, attrtype, sizeof(u16), &value); + u16 tmp = value; + + return rocker_tlv_put(desc_info, attrtype, sizeof(u16), &tmp); } -static inline int rocker_tlv_put_be16(struct rocker_desc_info *desc_info, - int attrtype, __be16 value) +static inline int +rocker_tlv_put_be16(struct rocker_desc_info *desc_info, int attrtype, __be16 value) { - return rocker_tlv_put(desc_info, attrtype, sizeof(__be16), &value); + __be16 tmp = value; + + return rocker_tlv_put(desc_info, attrtype, sizeof(__be16), &tmp); } -static inline int rocker_tlv_put_u32(struct rocker_desc_info *desc_info, - int attrtype, u32 value) +static inline int +rocker_tlv_put_u32(struct rocker_desc_info *desc_info, int attrtype, u32 value) { - return rocker_tlv_put(desc_info, attrtype, sizeof(u32), &value); + u32 tmp = value; + + return rocker_tlv_put(desc_info, attrtype, sizeof(u32), &tmp); } -static inline int rocker_tlv_put_be32(struct rocker_desc_info *desc_info, - int attrtype, __be32 value) +static inline int +rocker_tlv_put_be32(struct rocker_desc_info *desc_info, int attrtype, __be32 value) { - return rocker_tlv_put(desc_info, attrtype, sizeof(__be32), &value); + __be32 tmp = value; + + return rocker_tlv_put(desc_info, attrtype, sizeof(__be32), &tmp); } -static inline int rocker_tlv_put_u64(struct rocker_desc_info *desc_info, - int attrtype, u64 value) +static inline int +rocker_tlv_put_u64(struct rocker_desc_info *desc_info, int attrtype, u64 value) { - return rocker_tlv_put(desc_info, attrtype, sizeof(u64), &value); + u64 tmp = value; + + return rocker_tlv_put(desc_info, attrtype, sizeof(u64), &tmp); } static inline struct rocker_tlv * From 9a19bc44c63696db85309148609a963970ad9cc9 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 22 Sep 2017 23:29:19 +0200 Subject: [PATCH 343/555] netlink: fix nla_put_{u8,u16,u32} for KASAN commit b4391db42308c9940944b5d7be5ca4b78fb88dd0 upstream. When CONFIG_KASAN is enabled, the "--param asan-stack=1" causes rather large stack frames in some functions. This goes unnoticed normally because CONFIG_FRAME_WARN is disabled with CONFIG_KASAN by default as of commit 3f181b4d8652 ("lib/Kconfig.debug: disable -Wframe-larger-than warnings with KASAN=y"). The kernelci.org build bot however has the warning enabled and that led me to investigate it a little further, as every build produces these warnings: net/wireless/nl80211.c:4389:1: warning: the frame size of 2240 bytes is larger than 2048 bytes [-Wframe-larger-than=] net/wireless/nl80211.c:1895:1: warning: the frame size of 3776 bytes is larger than 2048 bytes [-Wframe-larger-than=] net/wireless/nl80211.c:1410:1: warning: the frame size of 2208 bytes is larger than 2048 bytes [-Wframe-larger-than=] net/bridge/br_netlink.c:1282:1: warning: the frame size of 2544 bytes is larger than 2048 bytes [-Wframe-larger-than=] Most of this problem is now solved in gcc-8, which can consolidate the stack slots for the inline function arguments. On older compilers we can add a workaround by declaring a local variable in each function to pass the inline function argument. Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81715 Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/netlink.h | 73 ++++++++++++++++++++++++++++++++----------- 1 file changed, 55 insertions(+), 18 deletions(-) diff --git a/include/net/netlink.h b/include/net/netlink.h index 254a0fc01800..42adccd6739d 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -756,7 +756,10 @@ static inline int nla_parse_nested(struct nlattr *tb[], int maxtype, */ static inline int nla_put_u8(struct sk_buff *skb, int attrtype, u8 value) { - return nla_put(skb, attrtype, sizeof(u8), &value); + /* temporary variables to work around GCC PR81715 with asan-stack=1 */ + u8 tmp = value; + + return nla_put(skb, attrtype, sizeof(u8), &tmp); } /** @@ -767,7 +770,9 @@ static inline int nla_put_u8(struct sk_buff *skb, int attrtype, u8 value) */ static inline int nla_put_u16(struct sk_buff *skb, int attrtype, u16 value) { - return nla_put(skb, attrtype, sizeof(u16), &value); + u16 tmp = value; + + return nla_put(skb, attrtype, sizeof(u16), &tmp); } /** @@ -778,7 +783,9 @@ static inline int nla_put_u16(struct sk_buff *skb, int attrtype, u16 value) */ static inline int nla_put_be16(struct sk_buff *skb, int attrtype, __be16 value) { - return nla_put(skb, attrtype, sizeof(__be16), &value); + __be16 tmp = value; + + return nla_put(skb, attrtype, sizeof(__be16), &tmp); } /** @@ -789,7 +796,9 @@ static inline int nla_put_be16(struct sk_buff *skb, int attrtype, __be16 value) */ static inline int nla_put_net16(struct sk_buff *skb, int attrtype, __be16 value) { - return nla_put_be16(skb, attrtype | NLA_F_NET_BYTEORDER, value); + __be16 tmp = value; + + return nla_put_be16(skb, attrtype | NLA_F_NET_BYTEORDER, tmp); } /** @@ -800,7 +809,9 @@ static inline int nla_put_net16(struct sk_buff *skb, int attrtype, __be16 value) */ static inline int nla_put_le16(struct sk_buff *skb, int attrtype, __le16 value) { - return nla_put(skb, attrtype, sizeof(__le16), &value); + __le16 tmp = value; + + return nla_put(skb, attrtype, sizeof(__le16), &tmp); } /** @@ -811,7 +822,9 @@ static inline int nla_put_le16(struct sk_buff *skb, int attrtype, __le16 value) */ static inline int nla_put_u32(struct sk_buff *skb, int attrtype, u32 value) { - return nla_put(skb, attrtype, sizeof(u32), &value); + u32 tmp = value; + + return nla_put(skb, attrtype, sizeof(u32), &tmp); } /** @@ -822,7 +835,9 @@ static inline int nla_put_u32(struct sk_buff *skb, int attrtype, u32 value) */ static inline int nla_put_be32(struct sk_buff *skb, int attrtype, __be32 value) { - return nla_put(skb, attrtype, sizeof(__be32), &value); + __be32 tmp = value; + + return nla_put(skb, attrtype, sizeof(__be32), &tmp); } /** @@ -833,7 +848,9 @@ static inline int nla_put_be32(struct sk_buff *skb, int attrtype, __be32 value) */ static inline int nla_put_net32(struct sk_buff *skb, int attrtype, __be32 value) { - return nla_put_be32(skb, attrtype | NLA_F_NET_BYTEORDER, value); + __be32 tmp = value; + + return nla_put_be32(skb, attrtype | NLA_F_NET_BYTEORDER, tmp); } /** @@ -844,7 +861,9 @@ static inline int nla_put_net32(struct sk_buff *skb, int attrtype, __be32 value) */ static inline int nla_put_le32(struct sk_buff *skb, int attrtype, __le32 value) { - return nla_put(skb, attrtype, sizeof(__le32), &value); + __le32 tmp = value; + + return nla_put(skb, attrtype, sizeof(__le32), &tmp); } /** @@ -857,7 +876,9 @@ static inline int nla_put_le32(struct sk_buff *skb, int attrtype, __le32 value) static inline int nla_put_u64_64bit(struct sk_buff *skb, int attrtype, u64 value, int padattr) { - return nla_put_64bit(skb, attrtype, sizeof(u64), &value, padattr); + u64 tmp = value; + + return nla_put_64bit(skb, attrtype, sizeof(u64), &tmp, padattr); } /** @@ -870,7 +891,9 @@ static inline int nla_put_u64_64bit(struct sk_buff *skb, int attrtype, static inline int nla_put_be64(struct sk_buff *skb, int attrtype, __be64 value, int padattr) { - return nla_put_64bit(skb, attrtype, sizeof(__be64), &value, padattr); + __be64 tmp = value; + + return nla_put_64bit(skb, attrtype, sizeof(__be64), &tmp, padattr); } /** @@ -883,7 +906,9 @@ static inline int nla_put_be64(struct sk_buff *skb, int attrtype, __be64 value, static inline int nla_put_net64(struct sk_buff *skb, int attrtype, __be64 value, int padattr) { - return nla_put_be64(skb, attrtype | NLA_F_NET_BYTEORDER, value, + __be64 tmp = value; + + return nla_put_be64(skb, attrtype | NLA_F_NET_BYTEORDER, tmp, padattr); } @@ -897,7 +922,9 @@ static inline int nla_put_net64(struct sk_buff *skb, int attrtype, __be64 value, static inline int nla_put_le64(struct sk_buff *skb, int attrtype, __le64 value, int padattr) { - return nla_put_64bit(skb, attrtype, sizeof(__le64), &value, padattr); + __le64 tmp = value; + + return nla_put_64bit(skb, attrtype, sizeof(__le64), &tmp, padattr); } /** @@ -908,7 +935,9 @@ static inline int nla_put_le64(struct sk_buff *skb, int attrtype, __le64 value, */ static inline int nla_put_s8(struct sk_buff *skb, int attrtype, s8 value) { - return nla_put(skb, attrtype, sizeof(s8), &value); + s8 tmp = value; + + return nla_put(skb, attrtype, sizeof(s8), &tmp); } /** @@ -919,7 +948,9 @@ static inline int nla_put_s8(struct sk_buff *skb, int attrtype, s8 value) */ static inline int nla_put_s16(struct sk_buff *skb, int attrtype, s16 value) { - return nla_put(skb, attrtype, sizeof(s16), &value); + s16 tmp = value; + + return nla_put(skb, attrtype, sizeof(s16), &tmp); } /** @@ -930,7 +961,9 @@ static inline int nla_put_s16(struct sk_buff *skb, int attrtype, s16 value) */ static inline int nla_put_s32(struct sk_buff *skb, int attrtype, s32 value) { - return nla_put(skb, attrtype, sizeof(s32), &value); + s32 tmp = value; + + return nla_put(skb, attrtype, sizeof(s32), &tmp); } /** @@ -943,7 +976,9 @@ static inline int nla_put_s32(struct sk_buff *skb, int attrtype, s32 value) static inline int nla_put_s64(struct sk_buff *skb, int attrtype, s64 value, int padattr) { - return nla_put_64bit(skb, attrtype, sizeof(s64), &value, padattr); + s64 tmp = value; + + return nla_put_64bit(skb, attrtype, sizeof(s64), &tmp, padattr); } /** @@ -993,7 +1028,9 @@ static inline int nla_put_msecs(struct sk_buff *skb, int attrtype, static inline int nla_put_in_addr(struct sk_buff *skb, int attrtype, __be32 addr) { - return nla_put_be32(skb, attrtype, addr); + __be32 tmp = addr; + + return nla_put_be32(skb, attrtype, tmp); } /** From f8895642cf8ec03af47bd7017ecf0b8ff28fe23e Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Fri, 1 Sep 2017 17:59:15 +0300 Subject: [PATCH 344/555] iwlwifi: mvm: use IWL_HCMD_NOCOPY for MCAST_FILTER_CMD commit 97bce57bd7f96e1218751996f549a6e61f18cc8c upstream. The MCAST_FILTER_CMD can get quite large when we have many mcast addresses to set (we support up to 255). So the command should be send as NOCOPY to prevent a warning caused by too-long commands: WARNING: CPU: 0 PID: 9700 at /root/iwlwifi/stack-dev/drivers/net/wireless/intel/iwlwifi/pcie/tx.c:1550 iwl_pcie_enqueue_hcmd+0x8c7/0xb40 [iwlwifi] Command MCAST_FILTER_CMD (0x1d0) is too large (328 bytes) This fixes: https://bugzilla.kernel.org/show_bug.cgi?id=196743 Signed-off-by: Luca Coelho Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index 1db1dc13e988..9789f3c5a785 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -1548,6 +1548,11 @@ static void iwl_mvm_mc_iface_iterator(void *_data, u8 *mac, struct iwl_mvm_mc_iter_data *data = _data; struct iwl_mvm *mvm = data->mvm; struct iwl_mcast_filter_cmd *cmd = mvm->mcast_filter_cmd; + struct iwl_host_cmd hcmd = { + .id = MCAST_FILTER_CMD, + .flags = CMD_ASYNC, + .dataflags[0] = IWL_HCMD_DFL_NOCOPY, + }; int ret, len; /* if we don't have free ports, mcast frames will be dropped */ @@ -1562,7 +1567,10 @@ static void iwl_mvm_mc_iface_iterator(void *_data, u8 *mac, memcpy(cmd->bssid, vif->bss_conf.bssid, ETH_ALEN); len = roundup(sizeof(*cmd) + cmd->count * ETH_ALEN, 4); - ret = iwl_mvm_send_cmd_pdu(mvm, MCAST_FILTER_CMD, CMD_ASYNC, len, cmd); + hcmd.len[0] = len; + hcmd.data[0] = cmd; + + ret = iwl_mvm_send_cmd(mvm, &hcmd); if (ret) IWL_ERR(mvm, "mcast filter cmd error. ret=%d\n", ret); } From aee20f321daf95f47183d4af8b0c3dcd0096fe25 Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Tue, 15 Aug 2017 20:48:41 +0300 Subject: [PATCH 345/555] iwlwifi: add workaround to disable wide channels in 5GHz commit 01a9c948a09348950515bf2abb6113ed83e696d8 upstream. The OTP in some SKUs have erroneously allowed 40MHz and 80MHz channels in the 5.2GHz band. The firmware has been modified to not allow this in those SKUs, so the driver needs to do the same otherwise the firmware will assert when we try to use it. Signed-off-by: Luca Coelho Signed-off-by: Greg Kroah-Hartman --- .../wireless/intel/iwlwifi/iwl-nvm-parse.c | 66 +++++++++++++++---- 1 file changed, 55 insertions(+), 11 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c index 3bd6fc1b76d4..33f4d7c7b53a 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c +++ b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c @@ -78,6 +78,7 @@ /* NVM offsets (in words) definitions */ enum wkp_nvm_offsets { /* NVM HW-Section offset (in words) definitions */ + SUBSYSTEM_ID = 0x0A, HW_ADDR = 0x15, /* NVM SW-Section offset (in words) definitions */ @@ -262,13 +263,12 @@ static u32 iwl_get_channel_flags(u8 ch_num, int ch_idx, bool is_5ghz, static int iwl_init_channel_map(struct device *dev, const struct iwl_cfg *cfg, struct iwl_nvm_data *data, const __le16 * const nvm_ch_flags, - bool lar_supported) + bool lar_supported, bool no_wide_in_5ghz) { int ch_idx; int n_channels = 0; struct ieee80211_channel *channel; u16 ch_flags; - bool is_5ghz; int num_of_ch, num_2ghz_channels; const u8 *nvm_chan; @@ -283,12 +283,20 @@ static int iwl_init_channel_map(struct device *dev, const struct iwl_cfg *cfg, } for (ch_idx = 0; ch_idx < num_of_ch; ch_idx++) { + bool is_5ghz = (ch_idx >= num_2ghz_channels); + ch_flags = __le16_to_cpup(nvm_ch_flags + ch_idx); - if (ch_idx >= num_2ghz_channels && - !data->sku_cap_band_52GHz_enable) + if (is_5ghz && !data->sku_cap_band_52GHz_enable) continue; + /* workaround to disable wide channels in 5GHz */ + if (no_wide_in_5ghz && is_5ghz) { + ch_flags &= ~(NVM_CHANNEL_40MHZ | + NVM_CHANNEL_80MHZ | + NVM_CHANNEL_160MHZ); + } + if (ch_flags & NVM_CHANNEL_160MHZ) data->vht160_supported = true; @@ -311,8 +319,8 @@ static int iwl_init_channel_map(struct device *dev, const struct iwl_cfg *cfg, n_channels++; channel->hw_value = nvm_chan[ch_idx]; - channel->band = (ch_idx < num_2ghz_channels) ? - NL80211_BAND_2GHZ : NL80211_BAND_5GHZ; + channel->band = is_5ghz ? + NL80211_BAND_5GHZ : NL80211_BAND_2GHZ; channel->center_freq = ieee80211_channel_to_frequency( channel->hw_value, channel->band); @@ -324,7 +332,6 @@ static int iwl_init_channel_map(struct device *dev, const struct iwl_cfg *cfg, * is not used in mvm, and is used for backwards compatibility */ channel->max_power = IWL_DEFAULT_MAX_TX_POWER; - is_5ghz = channel->band == NL80211_BAND_5GHZ; /* don't put limitations in case we're using LAR */ if (!lar_supported) @@ -441,7 +448,8 @@ static void iwl_init_vht_hw_capab(const struct iwl_cfg *cfg, static void iwl_init_sbands(struct device *dev, const struct iwl_cfg *cfg, struct iwl_nvm_data *data, const __le16 *ch_section, - u8 tx_chains, u8 rx_chains, bool lar_supported) + u8 tx_chains, u8 rx_chains, bool lar_supported, + bool no_wide_in_5ghz) { int n_channels; int n_used = 0; @@ -450,12 +458,14 @@ static void iwl_init_sbands(struct device *dev, const struct iwl_cfg *cfg, if (cfg->device_family != IWL_DEVICE_FAMILY_8000) n_channels = iwl_init_channel_map( dev, cfg, data, - &ch_section[NVM_CHANNELS], lar_supported); + &ch_section[NVM_CHANNELS], lar_supported, + no_wide_in_5ghz); else n_channels = iwl_init_channel_map( dev, cfg, data, &ch_section[NVM_CHANNELS_FAMILY_8000], - lar_supported); + lar_supported, + no_wide_in_5ghz); sband = &data->bands[NL80211_BAND_2GHZ]; sband->band = NL80211_BAND_2GHZ; @@ -658,6 +668,39 @@ static int iwl_set_hw_address(struct iwl_trans *trans, return 0; } +static bool +iwl_nvm_no_wide_in_5ghz(struct device *dev, const struct iwl_cfg *cfg, + const __le16 *nvm_hw) +{ + /* + * Workaround a bug in Indonesia SKUs where the regulatory in + * some 7000-family OTPs erroneously allow wide channels in + * 5GHz. To check for Indonesia, we take the SKU value from + * bits 1-4 in the subsystem ID and check if it is either 5 or + * 9. In those cases, we need to force-disable wide channels + * in 5GHz otherwise the FW will throw a sysassert when we try + * to use them. + */ + if (cfg->device_family == IWL_DEVICE_FAMILY_7000) { + /* + * Unlike the other sections in the NVM, the hw + * section uses big-endian. + */ + u16 subsystem_id = be16_to_cpup((const __be16 *)nvm_hw + + SUBSYSTEM_ID); + u8 sku = (subsystem_id & 0x1e) >> 1; + + if (sku == 5 || sku == 9) { + IWL_DEBUG_EEPROM(dev, + "disabling wide channels in 5GHz (0x%0x %d)\n", + subsystem_id, sku); + return true; + } + } + + return false; +} + struct iwl_nvm_data * iwl_parse_nvm_data(struct iwl_trans *trans, const struct iwl_cfg *cfg, const __le16 *nvm_hw, const __le16 *nvm_sw, @@ -668,6 +711,7 @@ iwl_parse_nvm_data(struct iwl_trans *trans, const struct iwl_cfg *cfg, struct device *dev = trans->dev; struct iwl_nvm_data *data; bool lar_enabled; + bool no_wide_in_5ghz = iwl_nvm_no_wide_in_5ghz(dev, cfg, nvm_hw); u32 sku, radio_cfg; u16 lar_config; const __le16 *ch_section; @@ -738,7 +782,7 @@ iwl_parse_nvm_data(struct iwl_trans *trans, const struct iwl_cfg *cfg, } iwl_init_sbands(dev, cfg, data, ch_section, tx_chains, rx_chains, - lar_fw_supported && lar_enabled); + lar_fw_supported && lar_enabled, no_wide_in_5ghz); data->calib_version = 255; return data; From 12b182a35f459efdadeca230a2d365d938c5c510 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Wed, 27 Sep 2017 21:38:59 -0400 Subject: [PATCH 346/555] scsi: sd: Do not override max_sectors_kb sysfs setting commit 77082ca503bed061f7fbda7cfd7c93beda967a41 upstream. A user may lower the max_sectors_kb setting in sysfs to accommodate certain workloads. Previously we would always set the max I/O size to either the block layer default or the optional preferred I/O size reported by the device. Keep the current heuristics for the initial setting of max_sectors_kb. For subsequent invocations, only update the current queue limit if it exceeds the capabilities of the hardware. Reported-by: Don Brace Reviewed-by: Martin Wilck Tested-by: Don Brace Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/sd.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 13ac7e57a35d..09fa1fd0c4ce 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -2867,8 +2867,6 @@ static int sd_revalidate_disk(struct gendisk *disk) sd_read_write_same(sdkp, buffer); } - sdkp->first_scan = 0; - /* * We now have all cache related info, determine how we deal * with flush requests. @@ -2883,7 +2881,7 @@ static int sd_revalidate_disk(struct gendisk *disk) q->limits.max_dev_sectors = logical_to_sectors(sdp, dev_max); /* - * Use the device's preferred I/O size for reads and writes + * Determine the device's preferred I/O size for reads and writes * unless the reported value is unreasonably small, large, or * garbage. */ @@ -2897,8 +2895,19 @@ static int sd_revalidate_disk(struct gendisk *disk) rw_max = min_not_zero(logical_to_sectors(sdp, dev_max), (sector_t)BLK_DEF_MAX_SECTORS); - /* Combine with controller limits */ - q->limits.max_sectors = min(rw_max, queue_max_hw_sectors(q)); + /* Do not exceed controller limit */ + rw_max = min(rw_max, queue_max_hw_sectors(q)); + + /* + * Only update max_sectors if previously unset or if the current value + * exceeds the capabilities of the hardware. + */ + if (sdkp->first_scan || + q->limits.max_sectors > q->limits.max_dev_sectors || + q->limits.max_sectors > q->limits.max_hw_sectors) + q->limits.max_sectors = rw_max; + + sdkp->first_scan = 0; set_capacity(disk, logical_to_sectors(sdp, sdkp->capacity)); sd_config_write_same(sdkp); From 4d3132d97aa753104ee35722352a895750a0fca5 Mon Sep 17 00:00:00 2001 From: Arend Van Spriel Date: Tue, 12 Sep 2017 10:47:53 +0200 Subject: [PATCH 347/555] brcmfmac: add length check in brcmf_cfg80211_escan_handler() commit 17df6453d4be17910456e99c5a85025aa1b7a246 upstream. Upon handling the firmware notification for scans the length was checked properly and may result in corrupting kernel heap memory due to buffer overruns. This fix addresses CVE-2017-0786. Cc: Kevin Cernekee Reviewed-by: Hante Meuleman Reviewed-by: Pieter-Paul Giesberts Reviewed-by: Franky Lin Signed-off-by: Arend van Spriel Signed-off-by: Kalle Valo Signed-off-by: Greg Kroah-Hartman --- .../broadcom/brcm80211/brcmfmac/cfg80211.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index 1d4352e1ac81..8460c07686e8 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -3097,6 +3097,7 @@ brcmf_cfg80211_escan_handler(struct brcmf_if *ifp, struct brcmf_cfg80211_info *cfg = ifp->drvr->config; s32 status; struct brcmf_escan_result_le *escan_result_le; + u32 escan_buflen; struct brcmf_bss_info_le *bss_info_le; struct brcmf_bss_info_le *bss = NULL; u32 bi_length; @@ -3113,11 +3114,23 @@ brcmf_cfg80211_escan_handler(struct brcmf_if *ifp, if (status == BRCMF_E_STATUS_PARTIAL) { brcmf_dbg(SCAN, "ESCAN Partial result\n"); + if (e->datalen < sizeof(*escan_result_le)) { + brcmf_err("invalid event data length\n"); + goto exit; + } escan_result_le = (struct brcmf_escan_result_le *) data; if (!escan_result_le) { brcmf_err("Invalid escan result (NULL pointer)\n"); goto exit; } + escan_buflen = le32_to_cpu(escan_result_le->buflen); + if (escan_buflen > BRCMF_ESCAN_BUF_SIZE || + escan_buflen > e->datalen || + escan_buflen < sizeof(*escan_result_le)) { + brcmf_err("Invalid escan buffer length: %d\n", + escan_buflen); + goto exit; + } if (le16_to_cpu(escan_result_le->bss_count) != 1) { brcmf_err("Invalid bss_count %d: ignoring\n", escan_result_le->bss_count); @@ -3134,9 +3147,8 @@ brcmf_cfg80211_escan_handler(struct brcmf_if *ifp, } bi_length = le32_to_cpu(bss_info_le->length); - if (bi_length != (le32_to_cpu(escan_result_le->buflen) - - WL_ESCAN_RESULTS_FIXED_SIZE)) { - brcmf_err("Invalid bss_info length %d: ignoring\n", + if (bi_length != escan_buflen - WL_ESCAN_RESULTS_FIXED_SIZE) { + brcmf_err("Ignoring invalid bss_info length: %d\n", bi_length); goto exit; } From 54aa832c8744bc98021a139226c1c5ff2a172298 Mon Sep 17 00:00:00 2001 From: Arend Van Spriel Date: Tue, 12 Sep 2017 10:47:54 +0200 Subject: [PATCH 348/555] brcmfmac: setup passive scan if requested by user-space commit 35f62727df0ed8e5e4857e162d94fd46d861f1cf upstream. The driver was not properly configuring firmware with regard to the type of scan. It always performed an active scan even when user-space was requesting for passive scan, ie. the scan request was done without any SSIDs specified. Reported-by: Huang, Jiangyang Reviewed-by: Hante Meuleman Reviewed-by: Pieter-Paul Giesberts Reviewed-by: Franky Lin Signed-off-by: Arend van Spriel Signed-off-by: Kalle Valo Signed-off-by: Greg Kroah-Hartman --- .../broadcom/brcm80211/brcmfmac/cfg80211.c | 19 ++++--------------- .../broadcom/brcm80211/brcmfmac/fwil_types.h | 5 +++++ 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index 8460c07686e8..27960b0bfbcd 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -978,7 +978,7 @@ static void brcmf_escan_prep(struct brcmf_cfg80211_info *cfg, eth_broadcast_addr(params_le->bssid); params_le->bss_type = DOT11_BSSTYPE_ANY; - params_le->scan_type = 0; + params_le->scan_type = BRCMF_SCANTYPE_ACTIVE; params_le->channel_num = 0; params_le->nprobes = cpu_to_le32(-1); params_le->active_time = cpu_to_le32(-1); @@ -986,12 +986,9 @@ static void brcmf_escan_prep(struct brcmf_cfg80211_info *cfg, params_le->home_time = cpu_to_le32(-1); memset(¶ms_le->ssid_le, 0, sizeof(params_le->ssid_le)); - /* if request is null exit so it will be all channel broadcast scan */ - if (!request) - return; - n_ssids = request->n_ssids; n_channels = request->n_channels; + /* Copy channel array if applicable */ brcmf_dbg(SCAN, "### List of channelspecs to scan ### %d\n", n_channels); @@ -1028,16 +1025,8 @@ static void brcmf_escan_prep(struct brcmf_cfg80211_info *cfg, ptr += sizeof(ssid_le); } } else { - brcmf_dbg(SCAN, "Broadcast scan %p\n", request->ssids); - if ((request->ssids) && request->ssids->ssid_len) { - brcmf_dbg(SCAN, "SSID %s len=%d\n", - params_le->ssid_le.SSID, - request->ssids->ssid_len); - params_le->ssid_le.SSID_len = - cpu_to_le32(request->ssids->ssid_len); - memcpy(¶ms_le->ssid_le.SSID, request->ssids->ssid, - request->ssids->ssid_len); - } + brcmf_dbg(SCAN, "Performing passive scan\n"); + params_le->scan_type = BRCMF_SCANTYPE_PASSIVE; } /* Adding mask to channel numbers */ params_le->channel_num = diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/fwil_types.h b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/fwil_types.h index a4118c0ef6ca..59013572fbe3 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/fwil_types.h +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/fwil_types.h @@ -45,6 +45,11 @@ #define BRCMF_SCAN_PARAMS_COUNT_MASK 0x0000ffff #define BRCMF_SCAN_PARAMS_NSSID_SHIFT 16 +/* scan type definitions */ +#define BRCMF_SCANTYPE_DEFAULT 0xFF +#define BRCMF_SCANTYPE_ACTIVE 0 +#define BRCMF_SCANTYPE_PASSIVE 1 + /* primary (ie tx) key */ #define BRCMF_PRIMARY_KEY (1 << 1) #define DOT11_BSSTYPE_ANY 2 From acf64334817ceded9810b4d45951fff4cf9fc661 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Thu, 21 Sep 2017 17:19:20 +0300 Subject: [PATCH 349/555] drm/i915/bios: ignore HDMI on port A MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 2ba7d7e0437127314864238f8bfcb8369d81075c upstream. The hardware state readout oopses after several warnings when trying to use HDMI on port A, if such a combination is configured in VBT. Filter the combo out already at the VBT parsing phase. v2: also ignore DVI (Ville) Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=102889 Cc: Imre Deak Reviewed-by: Ville Syrjälä Tested-by: Daniel Drake Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20170921141920.18172-1-jani.nikula@intel.com (cherry picked from commit d27ffc1d00327c29b3aa97f941b42f0949f9e99f) Signed-off-by: Rodrigo Vivi Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/intel_bios.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpu/drm/i915/intel_bios.c b/drivers/gpu/drm/i915/intel_bios.c index 4ac36e3c341f..8aeb7f8ee59c 100644 --- a/drivers/gpu/drm/i915/intel_bios.c +++ b/drivers/gpu/drm/i915/intel_bios.c @@ -1152,6 +1152,13 @@ static void parse_ddi_port(struct drm_i915_private *dev_priv, enum port port, is_hdmi = is_dvi && (child->common.device_type & DEVICE_TYPE_NOT_HDMI_OUTPUT) == 0; is_edp = is_dp && (child->common.device_type & DEVICE_TYPE_INTERNAL_CONNECTOR); + if (port == PORT_A && is_dvi) { + DRM_DEBUG_KMS("VBT claims port A supports DVI%s, ignoring\n", + is_hdmi ? "/HDMI" : ""); + is_dvi = false; + is_hdmi = false; + } + info->supports_dvi = is_dvi; info->supports_hdmi = is_hdmi; info->supports_dp = is_dp; From c83bbed2341925c167b3b0560232e0d39333c692 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 1 Oct 2017 09:37:35 +0200 Subject: [PATCH 350/555] nvme-pci: Use PCI bus address for data/queues in CMB commit 8969f1f8291762c13147c1ba89d46238af01675b upstream. Currently, NVMe PCI host driver is programming CMB dma address as I/O SQs addresses. This results in failures on systems where 1:1 outbound mapping is not used (example Broadcom iProc SOCs) because CMB BAR will be progammed with PCI bus address but NVMe PCI EP will try to access CMB using dma address. To have CMB working on systems without 1:1 outbound mapping, we program PCI bus address for I/O SQs instead of dma address. This approach will work on systems with/without 1:1 outbound mapping. Based on a report and previous patch from Abhishek Shah. Fixes: 8ffaadf7 ("NVMe: Use CMB for the IO SQes if available") Reported-by: Abhishek Shah Tested-by: Abhishek Shah Reviewed-by: Keith Busch Signed-off-by: Christoph Hellwig Signed-off-by: Greg Kroah-Hartman --- drivers/nvme/host/pci.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 14eac73e8dbc..54ea90f89b70 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -96,7 +96,7 @@ struct nvme_dev { struct mutex shutdown_lock; bool subsystem; void __iomem *cmb; - dma_addr_t cmb_dma_addr; + pci_bus_addr_t cmb_bus_addr; u64 cmb_size; u32 cmbsz; u32 cmbloc; @@ -1037,7 +1037,7 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) { unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth), dev->ctrl.page_size); - nvmeq->sq_dma_addr = dev->cmb_dma_addr + offset; + nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset; nvmeq->sq_cmds_io = dev->cmb + offset; } else { nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth), @@ -1343,7 +1343,7 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev) resource_size_t bar_size; struct pci_dev *pdev = to_pci_dev(dev->dev); void __iomem *cmb; - dma_addr_t dma_addr; + int bar; dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ); if (!(NVME_CMB_SZ(dev->cmbsz))) @@ -1356,7 +1356,8 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev) szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz)); size = szu * NVME_CMB_SZ(dev->cmbsz); offset = szu * NVME_CMB_OFST(dev->cmbloc); - bar_size = pci_resource_len(pdev, NVME_CMB_BIR(dev->cmbloc)); + bar = NVME_CMB_BIR(dev->cmbloc); + bar_size = pci_resource_len(pdev, bar); if (offset > bar_size) return NULL; @@ -1369,12 +1370,11 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev) if (size > bar_size - offset) size = bar_size - offset; - dma_addr = pci_resource_start(pdev, NVME_CMB_BIR(dev->cmbloc)) + offset; - cmb = ioremap_wc(dma_addr, size); + cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size); if (!cmb) return NULL; - dev->cmb_dma_addr = dma_addr; + dev->cmb_bus_addr = pci_bus_address(pdev, bar) + offset; dev->cmb_size = size; return cmb; } From d9aaef32f32c46b05c4d3bb96fe70e7c4346846e Mon Sep 17 00:00:00 2001 From: Chanho Min Date: Tue, 26 Sep 2017 09:03:40 +0900 Subject: [PATCH 351/555] mmc: core: add driver strength selection when selecting hs400es commit fb458864d9a78cc433fec7979acbe4078c82d7a8 upstream. The driver strength selection is missed and required when selecting hs400es. So, It is added here. Fixes: 81ac2af65793ecf ("mmc: core: implement enhanced strobe support") Signed-off-by: Hankyung Yu Signed-off-by: Chanho Min Reviewed-by: Adrian Hunter Reviewed-by: Shawn Lin Signed-off-by: Ulf Hansson Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/core/mmc.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index b2ca10c4d2a5..4f4a627f6b20 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -1255,6 +1255,23 @@ out_err: return err; } +static void mmc_select_driver_type(struct mmc_card *card) +{ + int card_drv_type, drive_strength, drv_type; + + card_drv_type = card->ext_csd.raw_driver_strength | + mmc_driver_type_mask(0); + + drive_strength = mmc_select_drive_strength(card, + card->ext_csd.hs200_max_dtr, + card_drv_type, &drv_type); + + card->drive_strength = drive_strength; + + if (drv_type) + mmc_set_driver_type(card->host, drv_type); +} + static int mmc_select_hs400es(struct mmc_card *card) { struct mmc_host *host = card->host; @@ -1303,6 +1320,8 @@ static int mmc_select_hs400es(struct mmc_card *card) goto out_err; } + mmc_select_driver_type(card); + /* Switch card to HS400 */ val = EXT_CSD_TIMING_HS400 | card->drive_strength << EXT_CSD_DRV_STR_SHIFT; @@ -1336,23 +1355,6 @@ out_err: return err; } -static void mmc_select_driver_type(struct mmc_card *card) -{ - int card_drv_type, drive_strength, drv_type; - - card_drv_type = card->ext_csd.raw_driver_strength | - mmc_driver_type_mask(0); - - drive_strength = mmc_select_drive_strength(card, - card->ext_csd.hs200_max_dtr, - card_drv_type, &drv_type); - - card->drive_strength = drive_strength; - - if (drv_type) - mmc_set_driver_type(card->host, drv_type); -} - /* * For device supporting HS200 mode, the following sequence * should be done before executing the tuning process. From ba15518c2610e777f141b55363b75f410eda7822 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Sep 2017 11:13:38 +0200 Subject: [PATCH 352/555] sched/cpuset/pm: Fix cpuset vs. suspend-resume bugs commit 50e76632339d4655859523a39249dd95ee5e93e7 upstream. Cpusets vs. suspend-resume is _completely_ broken. And it got noticed because it now resulted in non-cpuset usage breaking too. On suspend cpuset_cpu_inactive() doesn't call into cpuset_update_active_cpus() because it doesn't want to move tasks about, there is no need, all tasks are frozen and won't run again until after we've resumed everything. But this means that when we finally do call into cpuset_update_active_cpus() after resuming the last frozen cpu in cpuset_cpu_active(), the top_cpuset will not have any difference with the cpu_active_mask and this it will not in fact do _anything_. So the cpuset configuration will not be restored. This was largely hidden because we would unconditionally create identity domains and mobile users would not in fact use cpusets much. And servers what do use cpusets tend to not suspend-resume much. An addition problem is that we'd not in fact wait for the cpuset work to finish before resuming the tasks, allowing spurious migrations outside of the specified domains. Fix the rebuild by introducing cpuset_force_rebuild() and fix the ordering with cpuset_wait_for_hotplug(). Reported-by: Andy Lutomirski Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Tejun Heo Cc: Thomas Gleixner Fixes: deb7aa308ea2 ("cpuset: reorganize CPU / memory hotplug handling") Link: http://lkml.kernel.org/r/20170907091338.orwxrqkbfkki3c24@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar Signed-off-by: Mike Galbraith Signed-off-by: Greg Kroah-Hartman --- include/linux/cpuset.h | 6 ++++++ kernel/cpuset.c | 16 +++++++++++++++- kernel/power/process.c | 5 ++++- kernel/sched/core.c | 7 +++---- 4 files changed, 28 insertions(+), 6 deletions(-) diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index cd32a49ae81e..d807fa9b2051 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -55,7 +55,9 @@ static inline void cpuset_dec(void) extern int cpuset_init(void); extern void cpuset_init_smp(void); +extern void cpuset_force_rebuild(void); extern void cpuset_update_active_cpus(bool cpu_online); +extern void cpuset_wait_for_hotplug(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); extern void cpuset_cpus_allowed_fallback(struct task_struct *p); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); @@ -168,11 +170,15 @@ static inline bool cpusets_enabled(void) { return false; } static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} +static inline void cpuset_force_rebuild(void) { } + static inline void cpuset_update_active_cpus(bool cpu_online) { partition_sched_domains(1, NULL, NULL); } +static inline void cpuset_wait_for_hotplug(void) { } + static inline void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask) { diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 03a3a6e94eb9..511b1dd8ff09 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2276,6 +2276,13 @@ retry: mutex_unlock(&cpuset_mutex); } +static bool force_rebuild; + +void cpuset_force_rebuild(void) +{ + force_rebuild = true; +} + /** * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset * @@ -2350,8 +2357,10 @@ static void cpuset_hotplug_workfn(struct work_struct *work) } /* rebuild sched domains if cpus_allowed has changed */ - if (cpus_updated) + if (cpus_updated || force_rebuild) { + force_rebuild = false; rebuild_sched_domains(); + } } void cpuset_update_active_cpus(bool cpu_online) @@ -2370,6 +2379,11 @@ void cpuset_update_active_cpus(bool cpu_online) schedule_work(&cpuset_hotplug_work); } +void cpuset_wait_for_hotplug(void) +{ + flush_work(&cpuset_hotplug_work); +} + /* * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. * Call this routine anytime after node_states[N_MEMORY] changes. diff --git a/kernel/power/process.c b/kernel/power/process.c index 2fba066e125f..8ea24ded1dab 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -18,8 +18,9 @@ #include #include #include +#include -/* +/* * Timeout for stopping processes */ unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC; @@ -200,6 +201,8 @@ void thaw_processes(void) __usermodehelper_set_disable_depth(UMH_FREEZING); thaw_workqueues(); + cpuset_wait_for_hotplug(); + read_lock(&tasklist_lock); for_each_process_thread(g, p) { /* No other threads should have PF_SUSPEND_TASK set */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2098954c690f..d7dda36fbc7b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7292,16 +7292,15 @@ static void cpuset_cpu_active(void) * operation in the resume sequence, just build a single sched * domain, ignoring cpusets. */ - num_cpus_frozen--; - if (likely(num_cpus_frozen)) { - partition_sched_domains(1, NULL, NULL); + partition_sched_domains(1, NULL, NULL); + if (--num_cpus_frozen) return; - } /* * This is the last CPU online operation. So fall through and * restore the original sched domains by considering the * cpuset configurations. */ + cpuset_force_rebuild(); } cpuset_update_active_cpus(true); } From 27db1f020373a0681d483cc2d304e018cbd15723 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 31 Jan 2017 10:34:56 +0200 Subject: [PATCH 353/555] vfs: deny copy_file_range() for non regular files commit 11cbfb10775aa2a01cee966d118049ede9d0bdf2 upstream. There is no in-tree file system that implements copy_file_range() for non regular files. Deny an attempt to copy_file_range() a directory with EISDIR and any other non regualr file with EINVAL to conform with behavior of vfs_{clone,dedup}_file_range(). This change is needed prior to converting sb_start_write() to file_start_write() in the vfs helper. Cc: linux-api@vger.kernel.org Cc: Al Viro Signed-off-by: Amir Goldstein Reviewed-by: Christoph Hellwig Signed-off-by: Miklos Szeredi Cc: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/read_write.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/read_write.c b/fs/read_write.c index 09a8757efd34..ba280596ec78 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1518,6 +1518,11 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, if (flags != 0) return -EINVAL; + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) + return -EISDIR; + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) + return -EINVAL; + ret = rw_verify_area(READ, file_in, &pos_in, len); if (unlikely(ret)) return ret; From 2d605d9188d64e7d2bd96b4ec5bf91434024a25a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 26 May 2017 17:45:45 -0400 Subject: [PATCH 354/555] ext4: fix data corruption for mmap writes commit a056bdaae7a181f7dcc876cfab2f94538e508709 upstream. mpage_submit_page() can race with another process growing i_size and writing data via mmap to the written-back page. As mpage_submit_page() samples i_size too early, it may happen that ext4_bio_write_page() zeroes out too large tail of the page and thus corrupts user data. Fix the problem by sampling i_size only after the page has been write-protected in page tables by clear_page_dirty_for_io() call. Reported-by: Michael Zimmer Fixes: cb20d5188366f04d96d2e07b1240cc92170ade40 Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inode.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1b29efcab3dc..ec28e8ebb984 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2107,15 +2107,29 @@ static int ext4_writepage(struct page *page, static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) { int len; - loff_t size = i_size_read(mpd->inode); + loff_t size; int err; BUG_ON(page->index != mpd->first_page); + clear_page_dirty_for_io(page); + /* + * We have to be very careful here! Nothing protects writeback path + * against i_size changes and the page can be writeably mapped into + * page tables. So an application can be growing i_size and writing + * data through mmap while writeback runs. clear_page_dirty_for_io() + * write-protects our page in page tables and the page cannot get + * written to again until we release page lock. So only after + * clear_page_dirty_for_io() we are safe to sample i_size for + * ext4_bio_write_page() to zero-out tail of the written page. We rely + * on the barrier provided by TestClearPageDirty in + * clear_page_dirty_for_io() to make sure i_size is really sampled only + * after page tables are updated. + */ + size = i_size_read(mpd->inode); if (page->index == size >> PAGE_SHIFT) len = size & ~PAGE_MASK; else len = PAGE_SIZE; - clear_page_dirty_for_io(page); err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false); if (!err) mpd->wbc->nr_to_write--; From 6007f0f7a47d2a92aa8122a12e7b08eb5cfe3d53 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 30 Jul 2017 23:33:01 -0400 Subject: [PATCH 355/555] ext4: Don't clear SGID when inheriting ACLs commit a3bb2d5587521eea6dab2d05326abb0afb460abd upstream. When new directory 'DIR1' is created in a directory 'DIR0' with SGID bit set, DIR1 is expected to have SGID bit set (and owning group equal to the owning group of 'DIR0'). However when 'DIR0' also has some default ACLs that 'DIR1' inherits, setting these ACLs will result in SGID bit on 'DIR1' to get cleared if user is not member of the owning group. Fix the problem by moving posix_acl_update_mode() out of __ext4_set_acl() into ext4_set_acl(). That way the function will not be called when inheriting ACLs which is what we want as it prevents SGID bit clearing and the mode has been properly set by posix_acl_create() anyway. Fixes: 073931017b49d9458aa351605b43a7e34598caef Signed-off-by: Theodore Ts'o Signed-off-by: Jan Kara Reviewed-by: Andreas Gruenbacher Signed-off-by: Greg Kroah-Hartman --- fs/ext4/acl.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index dfa519979038..dfd01ca1a60a 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -192,13 +192,6 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type, switch (type) { case ACL_TYPE_ACCESS: name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; - if (acl) { - error = posix_acl_update_mode(inode, &inode->i_mode, &acl); - if (error) - return error; - inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - } break; case ACL_TYPE_DEFAULT: @@ -231,6 +224,8 @@ ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type) { handle_t *handle; int error, retries = 0; + umode_t mode = inode->i_mode; + int update_mode = 0; retry: handle = ext4_journal_start(inode, EXT4_HT_XATTR, @@ -238,7 +233,20 @@ retry: if (IS_ERR(handle)) return PTR_ERR(handle); + if ((type == ACL_TYPE_ACCESS) && acl) { + error = posix_acl_update_mode(inode, &mode, &acl); + if (error) + goto out_stop; + update_mode = 1; + } + error = __ext4_set_acl(handle, inode, type, acl); + if (!error && update_mode) { + inode->i_mode = mode; + inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + } +out_stop: ext4_journal_stop(handle); if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; From 48d7b5a8879002aeea63201d8fab598848cce0dd Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 28 Dec 2016 00:22:52 -0500 Subject: [PATCH 356/555] ext4: don't allow encrypted operations without keys commit 173b8439e1ba362007315868928bf9d26e5cc5a6 upstream. While we allow deletes without the key, the following should not be permitted: # cd /vdc/encrypted-dir-without-key # ls -l total 4 -rw-r--r-- 1 root root 0 Dec 27 22:35 6,LKNRJsp209FbXoSvJWzB -rw-r--r-- 1 root root 286 Dec 27 22:35 uRJ5vJh9gE7vcomYMqTAyD # mv uRJ5vJh9gE7vcomYMqTAyD 6,LKNRJsp209FbXoSvJWzB Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/namei.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 423a21cd077c..00b8a5a66961 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3527,6 +3527,12 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, EXT4_I(old_dentry->d_inode)->i_projid))) return -EXDEV; + if ((ext4_encrypted_inode(old_dir) && + !fscrypt_has_encryption_key(old_dir)) || + (ext4_encrypted_inode(new_dir) && + !fscrypt_has_encryption_key(new_dir))) + return -ENOKEY; + retval = dquot_initialize(old.dir); if (retval) return retval; @@ -3726,6 +3732,12 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, u8 new_file_type; int retval; + if ((ext4_encrypted_inode(old_dir) && + !fscrypt_has_encryption_key(old_dir)) || + (ext4_encrypted_inode(new_dir) && + !fscrypt_has_encryption_key(new_dir))) + return -ENOKEY; + if ((ext4_encrypted_inode(old_dir) || ext4_encrypted_inode(new_dir)) && (old_dir != new_dir) && From ec86c1ca8fbb3e6bfb6f67b3b3697600fa5bb0d7 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 28 Dec 2016 17:31:15 -0800 Subject: [PATCH 357/555] f2fs: don't allow encrypted operations without keys commit 363fa4e078cbdc97a172c19d19dc04b41b52ebc8 upstream. This patch fixes the renaming bug on encrypted filenames, which was pointed by (ext4: don't allow encrypted operations without keys) Cc: Theodore Ts'o Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Greg Kroah-Hartman --- fs/f2fs/namei.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 489fa0d5f914..08d7dc99042e 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -663,6 +663,12 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, bool is_old_inline = f2fs_has_inline_dentry(old_dir); int err = -ENOENT; + if ((f2fs_encrypted_inode(old_dir) && + !fscrypt_has_encryption_key(old_dir)) || + (f2fs_encrypted_inode(new_dir) && + !fscrypt_has_encryption_key(new_dir))) + return -ENOKEY; + if ((old_dir != new_dir) && f2fs_encrypted_inode(new_dir) && !fscrypt_has_permitted_context(new_dir, old_inode)) { err = -EPERM; @@ -843,6 +849,12 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, int old_nlink = 0, new_nlink = 0; int err = -ENOENT; + if ((f2fs_encrypted_inode(old_dir) && + !fscrypt_has_encryption_key(old_dir)) || + (f2fs_encrypted_inode(new_dir) && + !fscrypt_has_encryption_key(new_dir))) + return -ENOKEY; + if ((f2fs_encrypted_inode(old_dir) || f2fs_encrypted_inode(new_dir)) && (old_dir != new_dir) && (!fscrypt_has_permitted_context(new_dir, old_inode) || From 922e562b2613ae713d661c4fc0f92662f4fe6c41 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 7 Jun 2017 15:13:14 +0200 Subject: [PATCH 358/555] KVM: x86: fix singlestepping over syscall MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit c8401dda2f0a00cd25c0af6a95ed50e478d25de4 upstream. TF is handled a bit differently for syscall and sysret, compared to the other instructions: TF is checked after the instruction completes, so that the OS can disable #DB at a syscall by adding TF to FMASK. When the sysret is executed the #DB is taken "as if" the syscall insn just completed. KVM emulates syscall so that it can trap 32-bit syscall on Intel processors. Fix the behavior, otherwise you could get #DB on a user stack which is not nice. This does not affect Linux guests, as they use an IST or task gate for #DB. This fixes CVE-2017-7518. Cc: stable@vger.kernel.org Reported-by: Andy Lutomirski Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář [bwh: Backported to 4.9: - kvm_vcpu_check_singlestep() sets some flags differently - Drop changes to kvm_skip_emulated_instruction()] Cc: Ben Hutchings Cc: Salvatore Bonaccorso Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/kvm/emulate.c | 1 + arch/x86/kvm/x86.c | 52 +++++++++++++----------------- 3 files changed, 24 insertions(+), 30 deletions(-) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 19d14ac23ef9..fc3c7e49c8e4 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -296,6 +296,7 @@ struct x86_emulate_ctxt { bool perm_ok; /* do not check permissions if true */ bool ud; /* inject an #UD if host doesn't support insn */ + bool tf; /* TF value before instruction (after for syscall/sysret) */ bool have_exception; struct x86_exception exception; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index de36660751b5..72b737b8c9d6 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2738,6 +2738,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF); } + ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0; return X86EMUL_CONTINUE; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3dbcb09c19cf..595f8149c0d9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5250,6 +5250,8 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); ctxt->eflags = kvm_get_rflags(vcpu); + ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0; + ctxt->eip = kvm_rip_read(vcpu); ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 : @@ -5465,37 +5467,26 @@ static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, return dr6; } -static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflags, int *r) +static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r) { struct kvm_run *kvm_run = vcpu->run; - /* - * rflags is the old, "raw" value of the flags. The new value has - * not been saved yet. - * - * This is correct even for TF set by the guest, because "the - * processor will not generate this exception after the instruction - * that sets the TF flag". - */ - if (unlikely(rflags & X86_EFLAGS_TF)) { - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { - kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | - DR6_RTM; - kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip; - kvm_run->debug.arch.exception = DB_VECTOR; - kvm_run->exit_reason = KVM_EXIT_DEBUG; - *r = EMULATE_USER_EXIT; - } else { - vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF; - /* - * "Certain debug exceptions may clear bit 0-3. The - * remaining contents of the DR6 register are never - * cleared by the processor". - */ - vcpu->arch.dr6 &= ~15; - vcpu->arch.dr6 |= DR6_BS | DR6_RTM; - kvm_queue_exception(vcpu, DB_VECTOR); - } + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { + kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM; + kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip; + kvm_run->debug.arch.exception = DB_VECTOR; + kvm_run->exit_reason = KVM_EXIT_DEBUG; + *r = EMULATE_USER_EXIT; + } else { + vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF; + /* + * "Certain debug exceptions may clear bit 0-3. The + * remaining contents of the DR6 register are never + * cleared by the processor". + */ + vcpu->arch.dr6 &= ~15; + vcpu->arch.dr6 |= DR6_BS | DR6_RTM; + kvm_queue_exception(vcpu, DB_VECTOR); } } @@ -5650,8 +5641,9 @@ restart: toggle_interruptibility(vcpu, ctxt->interruptibility); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; kvm_rip_write(vcpu, ctxt->eip); - if (r == EMULATE_DONE) - kvm_vcpu_check_singlestep(vcpu, rflags, &r); + if (r == EMULATE_DONE && + (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) + kvm_vcpu_do_singlestep(vcpu, &r); if (!ctxt->have_exception || exception_type(ctxt->exception.vector) == EXCPT_TRAP) __kvm_set_rflags(vcpu, ctxt->eflags); From f82786d7a94f06a35ab273002cedc1385bae8e9f Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 12 Oct 2017 11:51:27 +0200 Subject: [PATCH 359/555] Linux 4.9.55 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 8370937bbb22..2a995675d6bf 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 9 -SUBLEVEL = 54 +SUBLEVEL = 55 EXTRAVERSION = NAME = Roaring Lionus From 6cd263155017036dc2f48cc5dc35ddc4fa5ce61e Mon Sep 17 00:00:00 2001 From: Ganesh Mahendran Date: Tue, 26 Sep 2017 17:56:25 +0800 Subject: [PATCH 360/555] ANDROID: binder: fix node sched policy calculation We should use FLAT_BINDER_FLAG_SCHED_POLICY_MASK as the mask to calculate sched policy. Change-Id: Ic252fd7c68495830690130d792802c02f99fc8fc Signed-off-by: Ganesh Mahendran --- drivers/android/binder.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index cde60960fe65..e57ef44be54c 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -1302,7 +1302,7 @@ static struct binder_node *binder_init_node_ilocked( node->cookie = cookie; node->work.type = BINDER_WORK_NODE; priority = flags & FLAT_BINDER_FLAG_PRIORITY_MASK; - node->sched_policy = (flags & FLAT_BINDER_FLAG_PRIORITY_MASK) >> + node->sched_policy = (flags & FLAT_BINDER_FLAG_SCHED_POLICY_MASK) >> FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT; node->min_priority = to_kernel_prio(node->sched_policy, priority); node->accept_fds = !!(flags & FLAT_BINDER_FLAG_ACCEPTS_FDS); From 00449628f3526aef5b682cc4c18f6c422d3be810 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 12 Oct 2017 21:21:39 +0200 Subject: [PATCH 361/555] Revert "socket, bpf: fix possible use after free" This reverts commit 02f7e4101092b88e57c73171174976c8a72a3eba, which was commit 02f7e4101092b88e57c73171174976c8a72a3eba upstream Turns out the backport to 4.9 was broken. Reported-by: Shuah Khan Signed-off-by: Greg Kroah-Hartman --- net/core/filter.c | 15 ++------------- net/core/sock.c | 5 +---- 2 files changed, 3 insertions(+), 17 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index bfeedbbde214..4eb4ce0aeef4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -937,31 +937,20 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) /* try to charge the socket memory if there is space available * return true on success */ -static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp) +bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) { u32 filter_size = bpf_prog_size(fp->prog->len); /* same check as in sock_kmalloc() */ if (filter_size <= sysctl_optmem_max && atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) { + atomic_inc(&fp->refcnt); atomic_add(filter_size, &sk->sk_omem_alloc); return true; } return false; } -bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) -{ - if (!atomic_inc_not_zero(&fp->refcnt)) - return false; - - if (!__sk_filter_charge(sk, fp)) { - sk_filter_release(fp); - return false; - } - return true; -} - static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) { struct sock_filter *old_prog; diff --git a/net/core/sock.c b/net/core/sock.c index 2a77cc50f021..231c38d91855 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1528,16 +1528,13 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) sock_reset_flag(newsk, SOCK_DONE); skb_queue_head_init(&newsk->sk_error_queue); - rcu_read_lock(); - filter = rcu_dereference(sk->sk_filter); + filter = rcu_dereference_protected(newsk->sk_filter, 1); if (filter != NULL) /* though it's an empty new sock, the charging may fail * if sysctl_optmem_max was changed between creation of * original socket and cloning */ is_charged = sk_filter_charge(newsk, filter); - RCU_INIT_POINTER(newsk->sk_filter, filter); - rcu_read_unlock(); if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { /* We need to make sure that we don't uncharge the new From 9d36d3eff2f85efad0a3b0c6031081654ae33928 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 12 Oct 2017 21:24:22 +0200 Subject: [PATCH 362/555] Linux 4.9.56 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 2a995675d6bf..feab5f5a507c 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 9 -SUBLEVEL = 55 +SUBLEVEL = 56 EXTRAVERSION = NAME = Roaring Lionus From 63da4200cbc27828bb9439b84059aa48cda5adf9 Mon Sep 17 00:00:00 2001 From: Hyojun Kim Date: Fri, 6 Oct 2017 17:10:08 -0700 Subject: [PATCH 363/555] f2fs: catch up to v4.14-rc1 Cherry-picked from upstream-f2fs-stable-linux-4.9.y Changes include: commit 30da3a4de96733 ("f2fs: hurry up to issue discard after io interruption") commit d1c363b48398d4 ("f2fs: fix to show correct discard_granularity in sysfs") ... commit e6b120d4d01ab0 ("f2fs/fscrypt: catch up to v4.12") commit 4d7931d72758db ("KEYS: Differentiate uses of rcu_dereference_key() and user_key_payload()") Signed-off-by: Hyojun Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 39 + Documentation/filesystems/f2fs.txt | 23 + Documentation/security/keys.txt | 17 +- fs/cifs/connect.c | 2 +- fs/crypto/Kconfig | 3 - fs/crypto/Makefile | 1 + fs/crypto/bio.c | 145 ++ fs/crypto/crypto.c | 269 ++-- fs/crypto/fname.c | 109 +- fs/crypto/fscrypt_private.h | 97 ++ fs/crypto/keyinfo.c | 222 +++- fs/crypto/policy.c | 151 +-- fs/ecryptfs/ecryptfs_kernel.h | 2 +- fs/ext4/ext4.h | 57 +- fs/ext4/ialloc.c | 20 +- fs/ext4/inline.c | 2 +- fs/ext4/inode.c | 6 +- fs/ext4/ioctl.c | 37 +- fs/ext4/namei.c | 133 +- fs/ext4/page-io.c | 4 +- fs/ext4/super.c | 68 +- fs/f2fs/Makefile | 2 +- fs/f2fs/acl.c | 9 +- fs/f2fs/checkpoint.c | 268 ++-- fs/f2fs/data.c | 903 +++++++++---- fs/f2fs/debug.c | 82 +- fs/f2fs/dir.c | 123 +- fs/f2fs/extent_cache.c | 379 +++--- fs/f2fs/f2fs.h | 1496 ++++++++++++++------- fs/f2fs/file.c | 553 +++++--- fs/f2fs/gc.c | 270 ++-- fs/f2fs/gc.h | 27 +- fs/f2fs/inline.c | 186 +-- fs/f2fs/inode.c | 211 ++- fs/f2fs/namei.c | 190 ++- fs/f2fs/node.c | 977 ++++++++++---- fs/f2fs/node.h | 81 +- fs/f2fs/recovery.c | 148 ++- fs/f2fs/segment.c | 1548 +++++++++++++++++----- fs/f2fs/segment.h | 248 ++-- fs/f2fs/shrinker.c | 10 +- fs/f2fs/super.c | 1457 ++++++++++++++------ fs/f2fs/sysfs.c | 556 ++++++++ fs/f2fs/trace.c | 4 +- fs/f2fs/xattr.c | 182 ++- fs/f2fs/xattr.h | 11 +- fs/fscache/object-list.c | 2 +- fs/nfs/nfs4idmap.c | 2 +- include/keys/user-type.h | 9 +- include/linux/f2fs_fs.h | 69 +- include/linux/fscrypt_common.h | 138 ++ include/linux/fscrypt_notsupp.h | 177 +++ include/linux/fscrypt_supp.h | 145 ++ include/linux/fscrypto.h | 409 ------ include/linux/key.h | 5 +- include/trace/events/f2fs.h | 324 ++++- include/uapi/linux/fs.h | 31 +- lib/digsig.c | 2 +- net/dns_resolver/dns_query.c | 4 +- security/keys/dh.c | 2 +- security/keys/encrypted-keys/encrypted.c | 4 +- security/keys/trusted.c | 4 +- security/keys/user_defined.c | 6 +- 63 files changed, 8842 insertions(+), 3819 deletions(-) create mode 100644 fs/crypto/bio.c create mode 100644 fs/crypto/fscrypt_private.h create mode 100644 fs/f2fs/sysfs.c create mode 100644 include/linux/fscrypt_common.h create mode 100644 include/linux/fscrypt_notsupp.h create mode 100644 include/linux/fscrypt_supp.h delete mode 100644 include/linux/fscrypto.h diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index a809f6005f14..cecb93af8967 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -57,6 +57,15 @@ Contact: "Jaegeuk Kim" Description: Controls the issue rate of small discard commands. +What: /sys/fs/f2fs//discard_granularity +Date: July 2017 +Contact: "Chao Yu" +Description: + Controls discard granularity of inner discard thread, inner thread + will not issue discards with size that is smaller than granularity. + The unit size is one block, now only support configuring in range + of [1, 512]. + What: /sys/fs/f2fs//max_victim_search Date: January 2014 Contact: "Jaegeuk Kim" @@ -112,3 +121,33 @@ Date: January 2016 Contact: "Shuoran Liu" Description: Shows total written kbytes issued to disk. + +What: /sys/fs/f2fs//inject_rate +Date: May 2016 +Contact: "Sheng Yong" +Description: + Controls the injection rate. + +What: /sys/fs/f2fs//inject_type +Date: May 2016 +Contact: "Sheng Yong" +Description: + Controls the injection type. + +What: /sys/fs/f2fs//reserved_blocks +Date: June 2017 +Contact: "Chao Yu" +Description: + Controls current reserved blocks in system. + +What: /sys/fs/f2fs//gc_urgent +Date: August 2017 +Contact: "Jaegeuk Kim" +Description: + Do background GC agressively + +What: /sys/fs/f2fs//gc_urgent_sleep_time +Date: August 2017 +Contact: "Jaegeuk Kim" +Description: + Controls sleep time of GC urgent mode diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index 753dd4f96afe..e7fb61ee6583 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -157,6 +157,20 @@ data_flush Enable data flushing before checkpoint in order to mode=%s Control block allocation mode which supports "adaptive" and "lfs". In "lfs" mode, there should be no random writes towards main area. +io_bits=%u Set the bit size of write IO requests. It should be set + with "mode=lfs". +usrquota Enable plain user disk quota accounting. +grpquota Enable plain group disk quota accounting. +prjquota Enable plain project quota accounting. +usrjquota= Appoint specified file and type during mount, so that quota +grpjquota= information can be properly updated during recovery flow, +prjjquota= : must be in root directory; +jqfmt= : [vfsold,vfsv0,vfsv1]. +offusrjquota Turn off user journelled quota. +offgrpjquota Turn off group journelled quota. +offprjjquota Turn off project journelled quota. +quota Enable plain user disk quota accounting. +noquota Disable all plain disk quota option. ================================================================================ DEBUGFS ENTRIES @@ -202,6 +216,15 @@ Files in /sys/fs/f2fs/ gc_idle = 1 will select the Cost Benefit approach & setting gc_idle = 2 will select the greedy approach. + gc_urgent This parameter controls triggering background GCs + urgently or not. Setting gc_urgent = 0 [default] + makes back to default behavior, while if it is set + to 1, background thread starts to do GC by given + gc_urgent_sleep_time interval. + + gc_urgent_sleep_time This parameter controls sleep time for gc_urgent. + 500 ms is set by default. See above gc_urgent. + reclaim_segments This parameter controls the number of prefree segments to be reclaimed. If the number of prefree segments is larger than the number of segments diff --git a/Documentation/security/keys.txt b/Documentation/security/keys.txt index 3849814bfe6d..0e03baf271bd 100644 --- a/Documentation/security/keys.txt +++ b/Documentation/security/keys.txt @@ -1151,8 +1151,21 @@ access the data: usage. This is called key->payload.rcu_data0. The following accessors wrap the RCU calls to this element: - rcu_assign_keypointer(struct key *key, void *data); - void *rcu_dereference_key(struct key *key); + (a) Set or change the first payload pointer: + + rcu_assign_keypointer(struct key *key, void *data); + + (b) Read the first payload pointer with the key semaphore held: + + [const] void *dereference_key_locked([const] struct key *key); + + Note that the return value will inherit its constness from the key + parameter. Static analysis will give an error if it things the lock + isn't held. + + (c) Read the first payload pointer with the RCU read lock held: + + const void *dereference_key_rcu(const struct key *key); =================== diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index f6712b6128d8..c0b4d728f1d1 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2432,7 +2432,7 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses) } down_read(&key->sem); - upayload = user_key_payload(key); + upayload = user_key_payload_locked(key); if (IS_ERR_OR_NULL(upayload)) { rc = upayload ? PTR_ERR(upayload) : -EINVAL; goto out_key_put; diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig index 92348faf9865..08b46e6e3995 100644 --- a/fs/crypto/Kconfig +++ b/fs/crypto/Kconfig @@ -1,6 +1,5 @@ config FS_ENCRYPTION tristate "FS Encryption (Per-file encryption)" - depends on BLOCK select CRYPTO select CRYPTO_AES select CRYPTO_CBC @@ -8,9 +7,7 @@ config FS_ENCRYPTION select CRYPTO_XTS select CRYPTO_CTS select CRYPTO_CTR - select CRYPTO_SHA256 select KEYS - select ENCRYPTED_KEYS help Enable encryption of files and directories. This feature is similar to ecryptfs, but it is more memory diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile index f17684c48739..9f6607f17b53 100644 --- a/fs/crypto/Makefile +++ b/fs/crypto/Makefile @@ -1,3 +1,4 @@ obj-$(CONFIG_FS_ENCRYPTION) += fscrypto.o fscrypto-y := crypto.o fname.o policy.o keyinfo.o +fscrypto-$(CONFIG_BLOCK) += bio.o diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c new file mode 100644 index 000000000000..2596c9ad6d77 --- /dev/null +++ b/fs/crypto/bio.c @@ -0,0 +1,145 @@ +/* + * This contains encryption functions for per-file encryption. + * + * Copyright (C) 2015, Google, Inc. + * Copyright (C) 2015, Motorola Mobility + * + * Written by Michael Halcrow, 2014. + * + * Filename encryption additions + * Uday Savagaonkar, 2014 + * Encryption policy handling additions + * Ildar Muslukhov, 2014 + * Add fscrypt_pullback_bio_page() + * Jaegeuk Kim, 2015. + * + * This has not yet undergone a rigorous security audit. + * + * The usage of AES-XTS should conform to recommendations in NIST + * Special Publication 800-38E and IEEE P1619/D16. + */ + +#include +#include +#include +#include +#include "fscrypt_private.h" + +/* + * Call fscrypt_decrypt_page on every single page, reusing the encryption + * context. + */ +static void completion_pages(struct work_struct *work) +{ + struct fscrypt_ctx *ctx = + container_of(work, struct fscrypt_ctx, r.work); + struct bio *bio = ctx->r.bio; + struct bio_vec *bv; + int i; + + bio_for_each_segment_all(bv, bio, i) { + struct page *page = bv->bv_page; + int ret = fscrypt_decrypt_page(page->mapping->host, page, + PAGE_SIZE, 0, page->index); + + if (ret) { + WARN_ON_ONCE(1); + SetPageError(page); + } else { + SetPageUptodate(page); + } + unlock_page(page); + } + fscrypt_release_ctx(ctx); + bio_put(bio); +} + +void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio) +{ + INIT_WORK(&ctx->r.work, completion_pages); + ctx->r.bio = bio; + queue_work(fscrypt_read_workqueue, &ctx->r.work); +} +EXPORT_SYMBOL(fscrypt_decrypt_bio_pages); + +void fscrypt_pullback_bio_page(struct page **page, bool restore) +{ + struct fscrypt_ctx *ctx; + struct page *bounce_page; + + /* The bounce data pages are unmapped. */ + if ((*page)->mapping) + return; + + /* The bounce data page is unmapped. */ + bounce_page = *page; + ctx = (struct fscrypt_ctx *)page_private(bounce_page); + + /* restore control page */ + *page = ctx->w.control_page; + + if (restore) + fscrypt_restore_control_page(bounce_page); +} +EXPORT_SYMBOL(fscrypt_pullback_bio_page); + +int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, + sector_t pblk, unsigned int len) +{ + struct fscrypt_ctx *ctx; + struct page *ciphertext_page = NULL; + struct bio *bio; + int ret, err = 0; + + BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE); + + ctx = fscrypt_get_ctx(inode, GFP_NOFS); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ciphertext_page = fscrypt_alloc_bounce_page(ctx, GFP_NOWAIT); + if (IS_ERR(ciphertext_page)) { + err = PTR_ERR(ciphertext_page); + goto errout; + } + + while (len--) { + err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk, + ZERO_PAGE(0), ciphertext_page, + PAGE_SIZE, 0, GFP_NOFS); + if (err) + goto errout; + + bio = bio_alloc(GFP_NOWAIT, 1); + if (!bio) { + err = -ENOMEM; + goto errout; + } + bio->bi_bdev = inode->i_sb->s_bdev; + bio->bi_iter.bi_sector = + pblk << (inode->i_sb->s_blocksize_bits - 9); + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + ret = bio_add_page(bio, ciphertext_page, + inode->i_sb->s_blocksize, 0); + if (ret != inode->i_sb->s_blocksize) { + /* should never happen! */ + WARN_ON(1); + bio_put(bio); + err = -EIO; + goto errout; + } + err = submit_bio_wait(bio); + if (err == 0 && bio->bi_error) + err = -EIO; + bio_put(bio); + if (err) + goto errout; + lblk++; + pblk++; + } + err = 0; +errout: + fscrypt_release_ctx(ctx); + return err; +} +EXPORT_SYMBOL(fscrypt_zeroout_range); diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 61cfccea77bc..c7835df7e7b8 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -24,10 +24,10 @@ #include #include #include -#include #include #include -#include +#include +#include "fscrypt_private.h" static unsigned int num_prealloc_crypto_pages = 32; static unsigned int num_prealloc_crypto_ctxs = 128; @@ -44,7 +44,7 @@ static mempool_t *fscrypt_bounce_page_pool = NULL; static LIST_HEAD(fscrypt_free_ctxs); static DEFINE_SPINLOCK(fscrypt_ctx_lock); -static struct workqueue_struct *fscrypt_read_workqueue; +struct workqueue_struct *fscrypt_read_workqueue; static DEFINE_MUTEX(fscrypt_init_mutex); static struct kmem_cache *fscrypt_ctx_cachep; @@ -63,7 +63,7 @@ void fscrypt_release_ctx(struct fscrypt_ctx *ctx) { unsigned long flags; - if (ctx->flags & FS_WRITE_PATH_FL && ctx->w.bounce_page) { + if (ctx->flags & FS_CTX_HAS_BOUNCE_BUFFER_FL && ctx->w.bounce_page) { mempool_free(ctx->w.bounce_page, fscrypt_bounce_page_pool); ctx->w.bounce_page = NULL; } @@ -88,7 +88,7 @@ EXPORT_SYMBOL(fscrypt_release_ctx); * Return: An allocated and initialized encryption context on success; error * value or NULL otherwise. */ -struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) +struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, gfp_t gfp_flags) { struct fscrypt_ctx *ctx = NULL; struct fscrypt_info *ci = inode->i_crypt_info; @@ -121,7 +121,7 @@ struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) } else { ctx->flags &= ~FS_CTX_REQUIRES_FREE_ENCRYPT_FL; } - ctx->flags &= ~FS_WRITE_PATH_FL; + ctx->flags &= ~FS_CTX_HAS_BOUNCE_BUFFER_FL; return ctx; } EXPORT_SYMBOL(fscrypt_get_ctx); @@ -141,20 +141,15 @@ static void page_crypt_complete(struct crypto_async_request *req, int res) complete(&ecr->completion); } -typedef enum { - FS_DECRYPT = 0, - FS_ENCRYPT, -} fscrypt_direction_t; - -static int do_page_crypto(struct inode *inode, - fscrypt_direction_t rw, pgoff_t index, - struct page *src_page, struct page *dest_page, - gfp_t gfp_flags) +int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, + u64 lblk_num, struct page *src_page, + struct page *dest_page, unsigned int len, + unsigned int offs, gfp_t gfp_flags) { struct { __le64 index; - u8 padding[FS_XTS_TWEAK_SIZE - sizeof(__le64)]; - } xts_tweak; + u8 padding[FS_IV_SIZE - sizeof(__le64)]; + } iv; struct skcipher_request *req = NULL; DECLARE_FS_COMPLETION_RESULT(ecr); struct scatterlist dst, src; @@ -162,6 +157,18 @@ static int do_page_crypto(struct inode *inode, struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; + BUG_ON(len == 0); + + BUILD_BUG_ON(sizeof(iv) != FS_IV_SIZE); + BUILD_BUG_ON(AES_BLOCK_SIZE != FS_IV_SIZE); + iv.index = cpu_to_le64(lblk_num); + memset(iv.padding, 0, sizeof(iv.padding)); + + if (ci->ci_essiv_tfm != NULL) { + crypto_cipher_encrypt_one(ci->ci_essiv_tfm, (u8 *)&iv, + (u8 *)&iv); + } + req = skcipher_request_alloc(tfm, gfp_flags); if (!req) { printk_ratelimited(KERN_ERR @@ -174,15 +181,11 @@ static int do_page_crypto(struct inode *inode, req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, page_crypt_complete, &ecr); - BUILD_BUG_ON(sizeof(xts_tweak) != FS_XTS_TWEAK_SIZE); - xts_tweak.index = cpu_to_le64(index); - memset(xts_tweak.padding, 0, sizeof(xts_tweak.padding)); - sg_init_table(&dst, 1); - sg_set_page(&dst, dest_page, PAGE_SIZE, 0); + sg_set_page(&dst, dest_page, len, offs); sg_init_table(&src, 1); - sg_set_page(&src, src_page, PAGE_SIZE, 0); - skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, &xts_tweak); + sg_set_page(&src, src_page, len, offs); + skcipher_request_set_crypt(req, &src, &dst, len, &iv); if (rw == FS_DECRYPT) res = crypto_skcipher_decrypt(req); else @@ -202,53 +205,86 @@ static int do_page_crypto(struct inode *inode, return 0; } -static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags) +struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx, + gfp_t gfp_flags) { ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool, gfp_flags); if (ctx->w.bounce_page == NULL) return ERR_PTR(-ENOMEM); - ctx->flags |= FS_WRITE_PATH_FL; + ctx->flags |= FS_CTX_HAS_BOUNCE_BUFFER_FL; return ctx->w.bounce_page; } /** * fscypt_encrypt_page() - Encrypts a page - * @inode: The inode for which the encryption should take place - * @plaintext_page: The page to encrypt. Must be locked. - * @gfp_flags: The gfp flag for memory allocation + * @inode: The inode for which the encryption should take place + * @page: The page to encrypt. Must be locked for bounce-page + * encryption. + * @len: Length of data to encrypt in @page and encrypted + * data in returned page. + * @offs: Offset of data within @page and returned + * page holding encrypted data. + * @lblk_num: Logical block number. This must be unique for multiple + * calls with same inode, except when overwriting + * previously written data. + * @gfp_flags: The gfp flag for memory allocation * - * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx - * encryption context. + * Encrypts @page using the ctx encryption context. Performs encryption + * either in-place or into a newly allocated bounce page. + * Called on the page write path. * - * Called on the page write path. The caller must call + * Bounce page allocation is the default. + * In this case, the contents of @page are encrypted and stored in an + * allocated bounce page. @page has to be locked and the caller must call * fscrypt_restore_control_page() on the returned ciphertext page to * release the bounce buffer and the encryption context. * - * Return: An allocated page with the encrypted content on success. Else, an + * In-place encryption is used by setting the FS_CFLG_OWN_PAGES flag in + * fscrypt_operations. Here, the input-page is returned with its content + * encrypted. + * + * Return: A page with the encrypted content on success. Else, an * error value or NULL. */ -struct page *fscrypt_encrypt_page(struct inode *inode, - struct page *plaintext_page, gfp_t gfp_flags) +struct page *fscrypt_encrypt_page(const struct inode *inode, + struct page *page, + unsigned int len, + unsigned int offs, + u64 lblk_num, gfp_t gfp_flags) + { struct fscrypt_ctx *ctx; - struct page *ciphertext_page = NULL; + struct page *ciphertext_page = page; int err; - BUG_ON(!PageLocked(plaintext_page)); + BUG_ON(len % FS_CRYPTO_BLOCK_SIZE != 0); + + if (inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES) { + /* with inplace-encryption we just encrypt the page */ + err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk_num, page, + ciphertext_page, len, offs, + gfp_flags); + if (err) + return ERR_PTR(err); + + return ciphertext_page; + } + + BUG_ON(!PageLocked(page)); ctx = fscrypt_get_ctx(inode, gfp_flags); if (IS_ERR(ctx)) return (struct page *)ctx; /* The encryption operation will require a bounce page. */ - ciphertext_page = alloc_bounce_page(ctx, gfp_flags); + ciphertext_page = fscrypt_alloc_bounce_page(ctx, gfp_flags); if (IS_ERR(ciphertext_page)) goto errout; - ctx->w.control_page = plaintext_page; - err = do_page_crypto(inode, FS_ENCRYPT, plaintext_page->index, - plaintext_page, ciphertext_page, - gfp_flags); + ctx->w.control_page = page; + err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk_num, + page, ciphertext_page, len, offs, + gfp_flags); if (err) { ciphertext_page = ERR_PTR(err); goto errout; @@ -265,8 +301,13 @@ errout: EXPORT_SYMBOL(fscrypt_encrypt_page); /** - * f2crypt_decrypt_page() - Decrypts a page in-place - * @page: The page to decrypt. Must be locked. + * fscrypt_decrypt_page() - Decrypts a page in-place + * @inode: The corresponding inode for the page to decrypt. + * @page: The page to decrypt. Must be locked in case + * it is a writeback page (FS_CFLG_OWN_PAGES unset). + * @len: Number of bytes in @page to be decrypted. + * @offs: Start of data in @page. + * @lblk_num: Logical block number. * * Decrypts page in-place using the ctx encryption context. * @@ -274,76 +315,17 @@ EXPORT_SYMBOL(fscrypt_encrypt_page); * * Return: Zero on success, non-zero otherwise. */ -int fscrypt_decrypt_page(struct page *page) +int fscrypt_decrypt_page(const struct inode *inode, struct page *page, + unsigned int len, unsigned int offs, u64 lblk_num) { - BUG_ON(!PageLocked(page)); + if (!(inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES)) + BUG_ON(!PageLocked(page)); - return do_page_crypto(page->mapping->host, - FS_DECRYPT, page->index, page, page, GFP_NOFS); + return fscrypt_do_page_crypto(inode, FS_DECRYPT, lblk_num, page, page, + len, offs, GFP_NOFS); } EXPORT_SYMBOL(fscrypt_decrypt_page); -int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk, - sector_t pblk, unsigned int len) -{ - struct fscrypt_ctx *ctx; - struct page *ciphertext_page = NULL; - struct bio *bio; - int ret, err = 0; - - BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE); - - ctx = fscrypt_get_ctx(inode, GFP_NOFS); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - - ciphertext_page = alloc_bounce_page(ctx, GFP_NOWAIT); - if (IS_ERR(ciphertext_page)) { - err = PTR_ERR(ciphertext_page); - goto errout; - } - - while (len--) { - err = do_page_crypto(inode, FS_ENCRYPT, lblk, - ZERO_PAGE(0), ciphertext_page, - GFP_NOFS); - if (err) - goto errout; - - bio = bio_alloc(GFP_NOWAIT, 1); - if (!bio) { - err = -ENOMEM; - goto errout; - } - bio->bi_bdev = inode->i_sb->s_bdev; - bio->bi_iter.bi_sector = - pblk << (inode->i_sb->s_blocksize_bits - 9); - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - ret = bio_add_page(bio, ciphertext_page, - inode->i_sb->s_blocksize, 0); - if (ret != inode->i_sb->s_blocksize) { - /* should never happen! */ - WARN_ON(1); - bio_put(bio); - err = -EIO; - goto errout; - } - err = submit_bio_wait(bio); - if ((err == 0) && bio->bi_error) - err = -EIO; - bio_put(bio); - if (err) - goto errout; - lblk++; - pblk++; - } - err = 0; -errout: - fscrypt_release_ctx(ctx); - return err; -} -EXPORT_SYMBOL(fscrypt_zeroout_range); - /* * Validate dentries for encrypted directories to make sure we aren't * potentially caching stale data after a key has been added or @@ -392,63 +374,6 @@ const struct dentry_operations fscrypt_d_ops = { }; EXPORT_SYMBOL(fscrypt_d_ops); -/* - * Call fscrypt_decrypt_page on every single page, reusing the encryption - * context. - */ -static void completion_pages(struct work_struct *work) -{ - struct fscrypt_ctx *ctx = - container_of(work, struct fscrypt_ctx, r.work); - struct bio *bio = ctx->r.bio; - struct bio_vec *bv; - int i; - - bio_for_each_segment_all(bv, bio, i) { - struct page *page = bv->bv_page; - int ret = fscrypt_decrypt_page(page); - - if (ret) { - WARN_ON_ONCE(1); - SetPageError(page); - } else { - SetPageUptodate(page); - } - unlock_page(page); - } - fscrypt_release_ctx(ctx); - bio_put(bio); -} - -void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio) -{ - INIT_WORK(&ctx->r.work, completion_pages); - ctx->r.bio = bio; - queue_work(fscrypt_read_workqueue, &ctx->r.work); -} -EXPORT_SYMBOL(fscrypt_decrypt_bio_pages); - -void fscrypt_pullback_bio_page(struct page **page, bool restore) -{ - struct fscrypt_ctx *ctx; - struct page *bounce_page; - - /* The bounce data pages are unmapped. */ - if ((*page)->mapping) - return; - - /* The bounce data page is unmapped. */ - bounce_page = *page; - ctx = (struct fscrypt_ctx *)page_private(bounce_page); - - /* restore control page */ - *page = ctx->w.control_page; - - if (restore) - fscrypt_restore_control_page(bounce_page); -} -EXPORT_SYMBOL(fscrypt_pullback_bio_page); - void fscrypt_restore_control_page(struct page *page) { struct fscrypt_ctx *ctx; @@ -474,17 +399,22 @@ static void fscrypt_destroy(void) /** * fscrypt_initialize() - allocate major buffers for fs encryption. + * @cop_flags: fscrypt operations flags * * We only call this when we start accessing encrypted files, since it * results in memory getting allocated that wouldn't otherwise be used. * * Return: Zero on success, non-zero otherwise. */ -int fscrypt_initialize(void) +int fscrypt_initialize(unsigned int cop_flags) { int i, res = -ENOMEM; - if (fscrypt_bounce_page_pool) + /* + * No need to allocate a bounce page pool if there already is one or + * this FS won't use it. + */ + if (cop_flags & FS_CFLG_OWN_PAGES || fscrypt_bounce_page_pool) return 0; mutex_lock(&fscrypt_init_mutex); @@ -513,7 +443,6 @@ fail: mutex_unlock(&fscrypt_init_mutex); return res; } -EXPORT_SYMBOL(fscrypt_initialize); /** * fscrypt_init() - Set up for fs encryption. @@ -555,6 +484,8 @@ static void __exit fscrypt_exit(void) destroy_workqueue(fscrypt_read_workqueue); kmem_cache_destroy(fscrypt_ctx_cachep); kmem_cache_destroy(fscrypt_info_cachep); + + fscrypt_essiv_cleanup(); } module_exit(fscrypt_exit); diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index d1bbdc9dda76..ad9f814fdead 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -12,7 +12,7 @@ #include #include -#include +#include "fscrypt_private.h" /** * fname_crypt_complete() - completion callback for filename crypto @@ -159,6 +159,8 @@ static int fname_decrypt(struct inode *inode, static const char *lookup_table = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; +#define BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3) + /** * digest_encode() - * @@ -209,7 +211,7 @@ static int digest_decode(const char *src, int len, char *dst) return cp - dst; } -u32 fscrypt_fname_encrypted_size(struct inode *inode, u32 ilen) +u32 fscrypt_fname_encrypted_size(const struct inode *inode, u32 ilen) { int padding = 32; struct fscrypt_info *ci = inode->i_crypt_info; @@ -227,14 +229,17 @@ EXPORT_SYMBOL(fscrypt_fname_encrypted_size); * Allocates an output buffer that is sufficient for the crypto operation * specified by the context and the direction. */ -int fscrypt_fname_alloc_buffer(struct inode *inode, +int fscrypt_fname_alloc_buffer(const struct inode *inode, u32 ilen, struct fscrypt_str *crypto_str) { - unsigned int olen = fscrypt_fname_encrypted_size(inode, ilen); + u32 olen = fscrypt_fname_encrypted_size(inode, ilen); + const u32 max_encoded_len = + max_t(u32, BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE), + 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name))); crypto_str->len = olen; - if (olen < FS_FNAME_CRYPTO_DIGEST_SIZE * 2) - olen = FS_FNAME_CRYPTO_DIGEST_SIZE * 2; + olen = max(olen, max_encoded_len); + /* * Allocated buffer can hold one more character to null-terminate the * string @@ -266,6 +271,10 @@ EXPORT_SYMBOL(fscrypt_fname_free_buffer); * * The caller must have allocated sufficient memory for the @oname string. * + * If the key is available, we'll decrypt the disk name; otherwise, we'll encode + * it for presentation. Short names are directly base64-encoded, while long + * names are encoded in fscrypt_digested_name format. + * * Return: 0 on success, -errno on failure */ int fscrypt_fname_disk_to_usr(struct inode *inode, @@ -274,7 +283,7 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, struct fscrypt_str *oname) { const struct qstr qname = FSTR_TO_QSTR(iname); - char buf[24]; + struct fscrypt_digested_name digested_name; if (fscrypt_is_dot_dotdot(&qname)) { oname->name[0] = '.'; @@ -289,20 +298,24 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, if (inode->i_crypt_info) return fname_decrypt(inode, iname, oname); - if (iname->len <= FS_FNAME_CRYPTO_DIGEST_SIZE) { + if (iname->len <= FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE) { oname->len = digest_encode(iname->name, iname->len, oname->name); return 0; } if (hash) { - memcpy(buf, &hash, 4); - memcpy(buf + 4, &minor_hash, 4); + digested_name.hash = hash; + digested_name.minor_hash = minor_hash; } else { - memset(buf, 0, 8); + digested_name.hash = 0; + digested_name.minor_hash = 0; } - memcpy(buf + 8, iname->name + ((iname->len - 17) & ~15), 16); + memcpy(digested_name.digest, + FSCRYPT_FNAME_DIGEST(iname->name, iname->len), + FSCRYPT_FNAME_DIGEST_SIZE); oname->name[0] = '_'; - oname->len = 1 + digest_encode(buf, 24, oname->name + 1); + oname->len = 1 + digest_encode((const char *)&digested_name, + sizeof(digested_name), oname->name + 1); return 0; } EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); @@ -332,14 +345,39 @@ int fscrypt_fname_usr_to_disk(struct inode *inode, * in a directory. Consequently, a user space name cannot be mapped to * a disk-space name */ - return -EACCES; + return -ENOKEY; } EXPORT_SYMBOL(fscrypt_fname_usr_to_disk); +/** + * fscrypt_setup_filename() - prepare to search a possibly encrypted directory + * @dir: the directory that will be searched + * @iname: the user-provided filename being searched for + * @lookup: 1 if we're allowed to proceed without the key because it's + * ->lookup() or we're finding the dir_entry for deletion; 0 if we cannot + * proceed without the key because we're going to create the dir_entry. + * @fname: the filename information to be filled in + * + * Given a user-provided filename @iname, this function sets @fname->disk_name + * to the name that would be stored in the on-disk directory entry, if possible. + * If the directory is unencrypted this is simply @iname. Else, if we have the + * directory's encryption key, then @iname is the plaintext, so we encrypt it to + * get the disk_name. + * + * Else, for keyless @lookup operations, @iname is the presented ciphertext, so + * we decode it to get either the ciphertext disk_name (for short names) or the + * fscrypt_digested_name (for long names). Non-@lookup operations will be + * impossible in this case, so we fail them with ENOKEY. + * + * If successful, fscrypt_free_filename() must be called later to clean up. + * + * Return: 0 on success, -errno on failure + */ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct fscrypt_name *fname) { - int ret = 0, bigname = 0; + int ret; + int digested; memset(fname, 0, sizeof(struct fscrypt_name)); fname->usr_fname = iname; @@ -367,31 +405,43 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, return 0; } if (!lookup) - return -EACCES; + return -ENOKEY; /* * We don't have the key and we are doing a lookup; decode the * user-supplied name */ - if (iname->name[0] == '_') - bigname = 1; - if ((bigname && (iname->len != 33)) || (!bigname && (iname->len > 43))) - return -ENOENT; + if (iname->name[0] == '_') { + if (iname->len != + 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name))) + return -ENOENT; + digested = 1; + } else { + if (iname->len > + BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE)) + return -ENOENT; + digested = 0; + } - fname->crypto_buf.name = kmalloc(32, GFP_KERNEL); + fname->crypto_buf.name = + kmalloc(max_t(size_t, FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE, + sizeof(struct fscrypt_digested_name)), + GFP_KERNEL); if (fname->crypto_buf.name == NULL) return -ENOMEM; - ret = digest_decode(iname->name + bigname, iname->len - bigname, + ret = digest_decode(iname->name + digested, iname->len - digested, fname->crypto_buf.name); if (ret < 0) { ret = -ENOENT; goto errout; } fname->crypto_buf.len = ret; - if (bigname) { - memcpy(&fname->hash, fname->crypto_buf.name, 4); - memcpy(&fname->minor_hash, fname->crypto_buf.name + 4, 4); + if (digested) { + const struct fscrypt_digested_name *n = + (const void *)fname->crypto_buf.name; + fname->hash = n->hash; + fname->minor_hash = n->minor_hash; } else { fname->disk_name.name = fname->crypto_buf.name; fname->disk_name.len = fname->crypto_buf.len; @@ -403,12 +453,3 @@ errout: return ret; } EXPORT_SYMBOL(fscrypt_setup_filename); - -void fscrypt_free_filename(struct fscrypt_name *fname) -{ - kfree(fname->crypto_buf.name); - fname->crypto_buf.name = NULL; - fname->usr_fname = NULL; - fname->disk_name.name = NULL; -} -EXPORT_SYMBOL(fscrypt_free_filename); diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h new file mode 100644 index 000000000000..a1d5021c31ef --- /dev/null +++ b/fs/crypto/fscrypt_private.h @@ -0,0 +1,97 @@ +/* + * fscrypt_private.h + * + * Copyright (C) 2015, Google, Inc. + * + * This contains encryption key functions. + * + * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. + */ + +#ifndef _FSCRYPT_PRIVATE_H +#define _FSCRYPT_PRIVATE_H + +#include +#include + +/* Encryption parameters */ +#define FS_IV_SIZE 16 +#define FS_AES_128_ECB_KEY_SIZE 16 +#define FS_AES_128_CBC_KEY_SIZE 16 +#define FS_AES_128_CTS_KEY_SIZE 16 +#define FS_AES_256_GCM_KEY_SIZE 32 +#define FS_AES_256_CBC_KEY_SIZE 32 +#define FS_AES_256_CTS_KEY_SIZE 32 +#define FS_AES_256_XTS_KEY_SIZE 64 + +#define FS_KEY_DERIVATION_NONCE_SIZE 16 + +/** + * Encryption context for inode + * + * Protector format: + * 1 byte: Protector format (1 = this version) + * 1 byte: File contents encryption mode + * 1 byte: File names encryption mode + * 1 byte: Flags + * 8 bytes: Master Key descriptor + * 16 bytes: Encryption Key derivation nonce + */ +struct fscrypt_context { + u8 format; + u8 contents_encryption_mode; + u8 filenames_encryption_mode; + u8 flags; + u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; + u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE]; +} __packed; + +#define FS_ENCRYPTION_CONTEXT_FORMAT_V1 1 + +/* + * A pointer to this structure is stored in the file system's in-core + * representation of an inode. + */ +struct fscrypt_info { + u8 ci_data_mode; + u8 ci_filename_mode; + u8 ci_flags; + struct crypto_skcipher *ci_ctfm; + struct crypto_cipher *ci_essiv_tfm; + u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE]; +}; + +typedef enum { + FS_DECRYPT = 0, + FS_ENCRYPT, +} fscrypt_direction_t; + +#define FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 +#define FS_CTX_HAS_BOUNCE_BUFFER_FL 0x00000002 + +struct fscrypt_completion_result { + struct completion completion; + int res; +}; + +#define DECLARE_FS_COMPLETION_RESULT(ecr) \ + struct fscrypt_completion_result ecr = { \ + COMPLETION_INITIALIZER_ONSTACK((ecr).completion), 0 } + + +/* crypto.c */ +extern int fscrypt_initialize(unsigned int cop_flags); +extern struct workqueue_struct *fscrypt_read_workqueue; +extern int fscrypt_do_page_crypto(const struct inode *inode, + fscrypt_direction_t rw, u64 lblk_num, + struct page *src_page, + struct page *dest_page, + unsigned int len, unsigned int offs, + gfp_t gfp_flags); +extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx, + gfp_t gfp_flags); + +/* keyinfo.c */ +extern void __exit fscrypt_essiv_cleanup(void); + +#endif /* _FSCRYPT_PRIVATE_H */ diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index bb4606368eb1..018c588c7ac3 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -10,7 +10,12 @@ #include #include -#include +#include +#include +#include +#include "fscrypt_private.h" + +static struct crypto_shash *essiv_hash_tfm; static void derive_crypt_complete(struct crypto_async_request *req, int rc) { @@ -27,13 +32,13 @@ static void derive_crypt_complete(struct crypto_async_request *req, int rc) * derive_key_aes() - Derive a key using AES-128-ECB * @deriving_key: Encryption key used for derivation. * @source_key: Source key to which to apply derivation. - * @derived_key: Derived key. + * @derived_raw_key: Derived raw key. * * Return: Zero on success; non-zero otherwise. */ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], - u8 source_key[FS_AES_256_XTS_KEY_SIZE], - u8 derived_key[FS_AES_256_XTS_KEY_SIZE]) + const struct fscrypt_key *source_key, + u8 derived_raw_key[FS_MAX_KEY_SIZE]) { int res = 0; struct skcipher_request *req = NULL; @@ -60,10 +65,10 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], if (res < 0) goto out; - sg_init_one(&src_sg, source_key, FS_AES_256_XTS_KEY_SIZE); - sg_init_one(&dst_sg, derived_key, FS_AES_256_XTS_KEY_SIZE); - skcipher_request_set_crypt(req, &src_sg, &dst_sg, - FS_AES_256_XTS_KEY_SIZE, NULL); + sg_init_one(&src_sg, source_key->raw, source_key->size); + sg_init_one(&dst_sg, derived_raw_key, source_key->size); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, source_key->size, + NULL); res = crypto_skcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { wait_for_completion(&ecr.completion); @@ -77,26 +82,22 @@ out: static int validate_user_key(struct fscrypt_info *crypt_info, struct fscrypt_context *ctx, u8 *raw_key, - u8 *prefix, int prefix_size) + const char *prefix, int min_keysize) { - u8 *full_key_descriptor; + char *description; struct key *keyring_key; struct fscrypt_key *master_key; const struct user_key_payload *ukp; - int full_key_len = prefix_size + (FS_KEY_DESCRIPTOR_SIZE * 2) + 1; int res; - full_key_descriptor = kmalloc(full_key_len, GFP_NOFS); - if (!full_key_descriptor) + description = kasprintf(GFP_NOFS, "%s%*phN", prefix, + FS_KEY_DESCRIPTOR_SIZE, + ctx->master_key_descriptor); + if (!description) return -ENOMEM; - memcpy(full_key_descriptor, prefix, prefix_size); - sprintf(full_key_descriptor + prefix_size, - "%*phN", FS_KEY_DESCRIPTOR_SIZE, - ctx->master_key_descriptor); - full_key_descriptor[full_key_len - 1] = '\0'; - keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL); - kfree(full_key_descriptor); + keyring_key = request_key(&key_type_logon, description, NULL); + kfree(description); if (IS_ERR(keyring_key)) return PTR_ERR(keyring_key); down_read(&keyring_key->sem); @@ -107,7 +108,7 @@ static int validate_user_key(struct fscrypt_info *crypt_info, res = -ENOKEY; goto out; } - ukp = user_key_payload(keyring_key); + ukp = user_key_payload_locked(keyring_key); if (ukp->datalen != sizeof(struct fscrypt_key)) { res = -EINVAL; goto out; @@ -115,50 +116,60 @@ static int validate_user_key(struct fscrypt_info *crypt_info, master_key = (struct fscrypt_key *)ukp->data; BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE); - if (master_key->size != FS_AES_256_XTS_KEY_SIZE) { + if (master_key->size < min_keysize || master_key->size > FS_MAX_KEY_SIZE + || master_key->size % AES_BLOCK_SIZE != 0) { printk_once(KERN_WARNING "%s: key size incorrect: %d\n", __func__, master_key->size); res = -ENOKEY; goto out; } - res = derive_key_aes(ctx->nonce, master_key->raw, raw_key); + res = derive_key_aes(ctx->nonce, master_key, raw_key); out: up_read(&keyring_key->sem); key_put(keyring_key); return res; } +static const struct { + const char *cipher_str; + int keysize; +} available_modes[] = { + [FS_ENCRYPTION_MODE_AES_256_XTS] = { "xts(aes)", + FS_AES_256_XTS_KEY_SIZE }, + [FS_ENCRYPTION_MODE_AES_256_CTS] = { "cts(cbc(aes))", + FS_AES_256_CTS_KEY_SIZE }, + [FS_ENCRYPTION_MODE_AES_128_CBC] = { "cbc(aes)", + FS_AES_128_CBC_KEY_SIZE }, + [FS_ENCRYPTION_MODE_AES_128_CTS] = { "cts(cbc(aes))", + FS_AES_128_CTS_KEY_SIZE }, +}; + static int determine_cipher_type(struct fscrypt_info *ci, struct inode *inode, const char **cipher_str_ret, int *keysize_ret) { + u32 mode; + + if (!fscrypt_valid_enc_modes(ci->ci_data_mode, ci->ci_filename_mode)) { + pr_warn_ratelimited("fscrypt: inode %lu uses unsupported encryption modes (contents mode %d, filenames mode %d)\n", + inode->i_ino, + ci->ci_data_mode, ci->ci_filename_mode); + return -EINVAL; + } + if (S_ISREG(inode->i_mode)) { - if (ci->ci_data_mode == FS_ENCRYPTION_MODE_AES_256_XTS) { - *cipher_str_ret = "xts(aes)"; - *keysize_ret = FS_AES_256_XTS_KEY_SIZE; - return 0; - } - pr_warn_once("fscrypto: unsupported contents encryption mode " - "%d for inode %lu\n", - ci->ci_data_mode, inode->i_ino); - return -ENOKEY; + mode = ci->ci_data_mode; + } else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) { + mode = ci->ci_filename_mode; + } else { + WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n", + inode->i_ino, (inode->i_mode & S_IFMT)); + return -EINVAL; } - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) { - if (ci->ci_filename_mode == FS_ENCRYPTION_MODE_AES_256_CTS) { - *cipher_str_ret = "cts(cbc(aes))"; - *keysize_ret = FS_AES_256_CTS_KEY_SIZE; - return 0; - } - pr_warn_once("fscrypto: unsupported filenames encryption mode " - "%d for inode %lu\n", - ci->ci_filename_mode, inode->i_ino); - return -ENOKEY; - } - - pr_warn_once("fscrypto: unsupported file type %d for inode %lu\n", - (inode->i_mode & S_IFMT), inode->i_ino); - return -ENOKEY; + *cipher_str_ret = available_modes[mode].cipher_str; + *keysize_ret = available_modes[mode].keysize; + return 0; } static void put_crypt_info(struct fscrypt_info *ci) @@ -167,9 +178,76 @@ static void put_crypt_info(struct fscrypt_info *ci) return; crypto_free_skcipher(ci->ci_ctfm); + crypto_free_cipher(ci->ci_essiv_tfm); kmem_cache_free(fscrypt_info_cachep, ci); } +static int derive_essiv_salt(const u8 *key, int keysize, u8 *salt) +{ + struct crypto_shash *tfm = READ_ONCE(essiv_hash_tfm); + + /* init hash transform on demand */ + if (unlikely(!tfm)) { + struct crypto_shash *prev_tfm; + + tfm = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(tfm)) { + pr_warn_ratelimited("fscrypt: error allocating SHA-256 transform: %ld\n", + PTR_ERR(tfm)); + return PTR_ERR(tfm); + } + prev_tfm = cmpxchg(&essiv_hash_tfm, NULL, tfm); + if (prev_tfm) { + crypto_free_shash(tfm); + tfm = prev_tfm; + } + } + + { + SHASH_DESC_ON_STACK(desc, tfm); + desc->tfm = tfm; + desc->flags = 0; + + return crypto_shash_digest(desc, key, keysize, salt); + } +} + +static int init_essiv_generator(struct fscrypt_info *ci, const u8 *raw_key, + int keysize) +{ + int err; + struct crypto_cipher *essiv_tfm; + u8 salt[SHA256_DIGEST_SIZE]; + + essiv_tfm = crypto_alloc_cipher("aes", 0, 0); + if (IS_ERR(essiv_tfm)) + return PTR_ERR(essiv_tfm); + + ci->ci_essiv_tfm = essiv_tfm; + + err = derive_essiv_salt(raw_key, keysize, salt); + if (err) + goto out; + + /* + * Using SHA256 to derive the salt/key will result in AES-256 being + * used for IV generation. File contents encryption will still use the + * configured keysize (AES-128) nevertheless. + */ + err = crypto_cipher_setkey(essiv_tfm, salt, sizeof(salt)); + if (err) + goto out; + +out: + memzero_explicit(salt, sizeof(salt)); + return err; +} + +void __exit fscrypt_essiv_cleanup(void) +{ + crypto_free_shash(essiv_hash_tfm); +} + int fscrypt_get_encryption_info(struct inode *inode) { struct fscrypt_info *crypt_info; @@ -183,21 +261,21 @@ int fscrypt_get_encryption_info(struct inode *inode) if (inode->i_crypt_info) return 0; - res = fscrypt_initialize(); + res = fscrypt_initialize(inode->i_sb->s_cop->flags); if (res) return res; - if (!inode->i_sb->s_cop->get_context) - return -EOPNOTSUPP; - res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); if (res < 0) { - if (!fscrypt_dummy_context_enabled(inode)) + if (!fscrypt_dummy_context_enabled(inode) || + inode->i_sb->s_cop->is_encrypted(inode)) return res; + /* Fake up a context for an unencrypted directory */ + memset(&ctx, 0, sizeof(ctx)); ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS; ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS; - ctx.flags = 0; + memset(ctx.master_key_descriptor, 0x42, FS_KEY_DESCRIPTOR_SIZE); } else if (res != sizeof(ctx)) { return -EINVAL; } @@ -216,6 +294,7 @@ int fscrypt_get_encryption_info(struct inode *inode) crypt_info->ci_data_mode = ctx.contents_encryption_mode; crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; crypt_info->ci_ctfm = NULL; + crypt_info->ci_essiv_tfm = NULL; memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, sizeof(crypt_info->ci_master_key)); @@ -232,20 +311,12 @@ int fscrypt_get_encryption_info(struct inode *inode) if (!raw_key) goto out; - if (fscrypt_dummy_context_enabled(inode)) { - memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE); - goto got_key; - } - - res = validate_user_key(crypt_info, &ctx, raw_key, - FS_KEY_DESC_PREFIX, FS_KEY_DESC_PREFIX_SIZE); + res = validate_user_key(crypt_info, &ctx, raw_key, FS_KEY_DESC_PREFIX, + keysize); if (res && inode->i_sb->s_cop->key_prefix) { - u8 *prefix = NULL; - int prefix_size, res2; - - prefix_size = inode->i_sb->s_cop->key_prefix(inode, &prefix); - res2 = validate_user_key(crypt_info, &ctx, raw_key, - prefix, prefix_size); + int res2 = validate_user_key(crypt_info, &ctx, raw_key, + inode->i_sb->s_cop->key_prefix, + keysize); if (res2) { if (res2 == -ENOKEY) res = -ENOKEY; @@ -254,22 +325,33 @@ int fscrypt_get_encryption_info(struct inode *inode) } else if (res) { goto out; } -got_key: ctfm = crypto_alloc_skcipher(cipher_str, 0, 0); if (!ctfm || IS_ERR(ctfm)) { res = ctfm ? PTR_ERR(ctfm) : -ENOMEM; - printk(KERN_DEBUG - "%s: error %d (inode %u) allocating crypto tfm\n", - __func__, res, (unsigned) inode->i_ino); + pr_debug("%s: error %d (inode %lu) allocating crypto tfm\n", + __func__, res, inode->i_ino); goto out; } crypt_info->ci_ctfm = ctfm; crypto_skcipher_clear_flags(ctfm, ~0); crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY); + /* + * if the provided key is longer than keysize, we use the first + * keysize bytes of the derived key only + */ res = crypto_skcipher_setkey(ctfm, raw_key, keysize); if (res) goto out; + if (S_ISREG(inode->i_mode) && + crypt_info->ci_data_mode == FS_ENCRYPTION_MODE_AES_128_CBC) { + res = init_essiv_generator(crypt_info, raw_key, keysize); + if (res) { + pr_debug("%s: error %d (inode %lu) allocating essiv tfm\n", + __func__, res, inode->i_ino); + goto out; + } + } if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) == NULL) crypt_info = NULL; out: diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index bb4e209bd809..9914d51dff86 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -10,76 +10,37 @@ #include #include -#include #include - -static int inode_has_encryption_context(struct inode *inode) -{ - if (!inode->i_sb->s_cop->get_context) - return 0; - return (inode->i_sb->s_cop->get_context(inode, NULL, 0L) > 0); -} +#include "fscrypt_private.h" /* - * check whether the policy is consistent with the encryption context - * for the inode + * check whether an encryption policy is consistent with an encryption context */ -static int is_encryption_context_consistent_with_policy(struct inode *inode, +static bool is_encryption_context_consistent_with_policy( + const struct fscrypt_context *ctx, const struct fscrypt_policy *policy) { - struct fscrypt_context ctx; - int res; - - if (!inode->i_sb->s_cop->get_context) - return 0; - - res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); - if (res != sizeof(ctx)) - return 0; - - return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor, - FS_KEY_DESCRIPTOR_SIZE) == 0 && - (ctx.flags == policy->flags) && - (ctx.contents_encryption_mode == - policy->contents_encryption_mode) && - (ctx.filenames_encryption_mode == - policy->filenames_encryption_mode)); + return memcmp(ctx->master_key_descriptor, policy->master_key_descriptor, + FS_KEY_DESCRIPTOR_SIZE) == 0 && + (ctx->flags == policy->flags) && + (ctx->contents_encryption_mode == + policy->contents_encryption_mode) && + (ctx->filenames_encryption_mode == + policy->filenames_encryption_mode); } static int create_encryption_context_from_policy(struct inode *inode, const struct fscrypt_policy *policy) { struct fscrypt_context ctx; - int res; - - if (!inode->i_sb->s_cop->set_context) - return -EOPNOTSUPP; - - if (inode->i_sb->s_cop->prepare_context) { - res = inode->i_sb->s_cop->prepare_context(inode); - if (res) - return res; - } ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, FS_KEY_DESCRIPTOR_SIZE); - if (!fscrypt_valid_contents_enc_mode( - policy->contents_encryption_mode)) { - printk(KERN_WARNING - "%s: Invalid contents encryption mode %d\n", __func__, - policy->contents_encryption_mode); + if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode, + policy->filenames_encryption_mode)) return -EINVAL; - } - - if (!fscrypt_valid_filenames_enc_mode( - policy->filenames_encryption_mode)) { - printk(KERN_WARNING - "%s: Invalid filenames encryption mode %d\n", __func__, - policy->filenames_encryption_mode); - return -EINVAL; - } if (policy->flags & ~FS_POLICY_FLAGS_VALID) return -EINVAL; @@ -93,16 +54,20 @@ static int create_encryption_context_from_policy(struct inode *inode, return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL); } -int fscrypt_process_policy(struct file *filp, - const struct fscrypt_policy *policy) +int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) { + struct fscrypt_policy policy; struct inode *inode = file_inode(filp); int ret; + struct fscrypt_context ctx; + + if (copy_from_user(&policy, arg, sizeof(policy))) + return -EFAULT; if (!inode_owner_or_capable(inode)) return -EACCES; - if (policy->version != 0) + if (policy.version != 0) return -EINVAL; ret = mnt_want_write_file(filp); @@ -111,22 +76,23 @@ int fscrypt_process_policy(struct file *filp, inode_lock(inode); - if (!inode_has_encryption_context(inode)) { + ret = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if (ret == -ENODATA) { if (!S_ISDIR(inode->i_mode)) - ret = -EINVAL; - else if (!inode->i_sb->s_cop->empty_dir) - ret = -EOPNOTSUPP; + ret = -ENOTDIR; else if (!inode->i_sb->s_cop->empty_dir(inode)) ret = -ENOTEMPTY; else ret = create_encryption_context_from_policy(inode, - policy); - } else if (!is_encryption_context_consistent_with_policy(inode, - policy)) { - printk(KERN_WARNING - "%s: Policy inconsistent with encryption context\n", - __func__); - ret = -EINVAL; + &policy); + } else if (ret == sizeof(ctx) && + is_encryption_context_consistent_with_policy(&ctx, + &policy)) { + /* The file already uses the same encryption policy. */ + ret = 0; + } else if (ret >= 0 || ret == -ERANGE) { + /* The file already uses a different encryption policy. */ + ret = -EEXIST; } inode_unlock(inode); @@ -134,32 +100,38 @@ int fscrypt_process_policy(struct file *filp, mnt_drop_write_file(filp); return ret; } -EXPORT_SYMBOL(fscrypt_process_policy); +EXPORT_SYMBOL(fscrypt_ioctl_set_policy); -int fscrypt_get_policy(struct inode *inode, struct fscrypt_policy *policy) +int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) { + struct inode *inode = file_inode(filp); struct fscrypt_context ctx; + struct fscrypt_policy policy; int res; - if (!inode->i_sb->s_cop->get_context || - !inode->i_sb->s_cop->is_encrypted(inode)) + if (!inode->i_sb->s_cop->is_encrypted(inode)) return -ENODATA; res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if (res < 0 && res != -ERANGE) + return res; if (res != sizeof(ctx)) - return -ENODATA; + return -EINVAL; if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1) return -EINVAL; - policy->version = 0; - policy->contents_encryption_mode = ctx.contents_encryption_mode; - policy->filenames_encryption_mode = ctx.filenames_encryption_mode; - policy->flags = ctx.flags; - memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor, + policy.version = 0; + policy.contents_encryption_mode = ctx.contents_encryption_mode; + policy.filenames_encryption_mode = ctx.filenames_encryption_mode; + policy.flags = ctx.flags; + memcpy(policy.master_key_descriptor, ctx.master_key_descriptor, FS_KEY_DESCRIPTOR_SIZE); + + if (copy_to_user(arg, &policy, sizeof(policy))) + return -EFAULT; return 0; } -EXPORT_SYMBOL(fscrypt_get_policy); +EXPORT_SYMBOL(fscrypt_ioctl_get_policy); /** * fscrypt_has_permitted_context() - is a file's encryption policy permitted @@ -258,9 +230,9 @@ EXPORT_SYMBOL(fscrypt_has_permitted_context); * @parent: Parent inode from which the context is inherited. * @child: Child inode that inherits the context from @parent. * @fs_data: private data given by FS. - * @preload: preload child i_crypt_info + * @preload: preload child i_crypt_info if true * - * Return: Zero on success, non-zero otherwise + * Return: 0 on success, -errno on failure */ int fscrypt_inherit_context(struct inode *parent, struct inode *child, void *fs_data, bool preload) @@ -269,9 +241,6 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child, struct fscrypt_info *ci; int res; - if (!parent->i_sb->s_cop->set_context) - return -EOPNOTSUPP; - res = fscrypt_get_encryption_info(parent); if (res < 0) return res; @@ -281,19 +250,11 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child, return -ENOKEY; ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; - if (fscrypt_dummy_context_enabled(parent)) { - ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS; - ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS; - ctx.flags = 0; - memset(ctx.master_key_descriptor, 0x42, FS_KEY_DESCRIPTOR_SIZE); - res = 0; - } else { - ctx.contents_encryption_mode = ci->ci_data_mode; - ctx.filenames_encryption_mode = ci->ci_filename_mode; - ctx.flags = ci->ci_flags; - memcpy(ctx.master_key_descriptor, ci->ci_master_key, - FS_KEY_DESCRIPTOR_SIZE); - } + ctx.contents_encryption_mode = ci->ci_data_mode; + ctx.filenames_encryption_mode = ci->ci_filename_mode; + ctx.flags = ci->ci_flags; + memcpy(ctx.master_key_descriptor, ci->ci_master_key, + FS_KEY_DESCRIPTOR_SIZE); get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE); res = parent->i_sb->s_cop->set_context(child, &ctx, sizeof(ctx), fs_data); diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 599a29237cfe..95c1c8d34539 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -117,7 +117,7 @@ ecryptfs_get_key_payload_data(struct key *key) auth_tok = ecryptfs_get_encrypted_key_payload_data(key); if (!auth_tok) - return (struct ecryptfs_auth_tok *)user_key_payload(key)->data; + return (struct ecryptfs_auth_tok *)user_key_payload_locked(key)->data; else return auth_tok; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 20ee0e4a5829..be2d9ae0a749 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -32,7 +32,11 @@ #include #include #include -#include +#ifdef CONFIG_EXT4_FS_ENCRYPTION +#include +#else +#include +#endif #include #include #ifdef __KERNEL__ @@ -1342,11 +1346,6 @@ struct ext4_super_block { /* Number of quota types we support */ #define EXT4_MAXQUOTAS 3 -#ifdef CONFIG_EXT4_FS_ENCRYPTION -#define EXT4_KEY_DESC_PREFIX "ext4:" -#define EXT4_KEY_DESC_PREFIX_SIZE 5 -#endif - /* * fourth extended-fs super-block data in memory */ @@ -1516,12 +1515,6 @@ struct ext4_sb_info { /* Barrier between changing inodes' journal flags and writepages ops. */ struct percpu_rw_semaphore s_journal_flag_rwsem; - - /* Encryption support */ -#ifdef CONFIG_EXT4_FS_ENCRYPTION - u8 key_prefix[EXT4_KEY_DESC_PREFIX_SIZE]; - u8 key_prefix_size; -#endif }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -2277,11 +2270,6 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb, struct ext4_group_desc *gdp); ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); -static inline int ext4_sb_has_crypto(struct super_block *sb) -{ - return ext4_has_feature_encrypt(sb); -} - static inline bool ext4_encrypted_inode(struct inode *inode) { return ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT); @@ -2330,28 +2318,6 @@ static inline int ext4_fname_setup_filename(struct inode *dir, } static inline void ext4_fname_free_filename(struct ext4_filename *fname) { } -#define fscrypt_set_d_op(i) -#define fscrypt_get_ctx fscrypt_notsupp_get_ctx -#define fscrypt_release_ctx fscrypt_notsupp_release_ctx -#define fscrypt_encrypt_page fscrypt_notsupp_encrypt_page -#define fscrypt_decrypt_page fscrypt_notsupp_decrypt_page -#define fscrypt_decrypt_bio_pages fscrypt_notsupp_decrypt_bio_pages -#define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page -#define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page -#define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range -#define fscrypt_process_policy fscrypt_notsupp_process_policy -#define fscrypt_get_policy fscrypt_notsupp_get_policy -#define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context -#define fscrypt_inherit_context fscrypt_notsupp_inherit_context -#define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info -#define fscrypt_put_encryption_info fscrypt_notsupp_put_encryption_info -#define fscrypt_setup_filename fscrypt_notsupp_setup_filename -#define fscrypt_free_filename fscrypt_notsupp_free_filename -#define fscrypt_fname_encrypted_size fscrypt_notsupp_fname_encrypted_size -#define fscrypt_fname_alloc_buffer fscrypt_notsupp_fname_alloc_buffer -#define fscrypt_fname_free_buffer fscrypt_notsupp_fname_free_buffer -#define fscrypt_fname_disk_to_usr fscrypt_notsupp_fname_disk_to_usr -#define fscrypt_fname_usr_to_disk fscrypt_notsupp_fname_usr_to_disk #endif /* dir.c */ @@ -2373,17 +2339,16 @@ extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, void *buf, int buf_size, struct ext4_filename *fname, struct ext4_dir_entry_2 **dest_de); -int ext4_insert_dentry(struct inode *dir, - struct inode *inode, - struct ext4_dir_entry_2 *de, - int buf_size, - struct ext4_filename *fname); +void ext4_insert_dentry(struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, + struct ext4_filename *fname); static inline void ext4_update_dx_flag(struct inode *inode) { if (!ext4_has_feature_dir_index(inode->i_sb)) ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); } -static unsigned char ext4_filetype_table[] = { +static const unsigned char ext4_filetype_table[] = { DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK }; @@ -3071,7 +3036,7 @@ extern int ext4_handle_dirty_dirent_node(handle_t *handle, struct inode *inode, struct buffer_head *bh); #define S_SHIFT 12 -static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { +static const unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 170421edfdfe..2cec6059e886 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -771,7 +771,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, if (err) return ERR_PTR(err); if (!fscrypt_has_encryption_key(dir)) - return ERR_PTR(-EPERM); + return ERR_PTR(-ENOKEY); if (!handle) nblocks += EXT4_DATA_TRANS_BLOCKS(dir->i_sb); encrypt = 1; @@ -1093,6 +1093,17 @@ got: if (err) goto fail_drop; + /* + * Since the encryption xattr will always be unique, create it first so + * that it's less likely to end up in an external xattr block and + * prevent its deduplication. + */ + if (encrypt) { + err = fscrypt_inherit_context(dir, inode, handle, true); + if (err) + goto fail_free_drop; + } + err = ext4_init_acl(handle, inode, dir); if (err) goto fail_free_drop; @@ -1114,13 +1125,6 @@ got: ei->i_datasync_tid = handle->h_transaction->t_tid; } - if (encrypt) { - /* give pointer to avoid set_context with journal ops. */ - err = fscrypt_inherit_context(dir, inode, &encrypt, true); - if (err) - goto fail_free_drop; - } - err = ext4_mark_inode_dirty(handle, inode); if (err) { ext4_std_error(sb, err); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index a5807fd03b34..4d78b932c320 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1035,7 +1035,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, err = ext4_journal_get_write_access(handle, iloc->bh); if (err) return err; - ext4_insert_dentry(dir, inode, de, inline_size, fname); + ext4_insert_dentry(inode, de, inline_size, fname); ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ea248df06af9..d887e32e88a0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1166,7 +1166,8 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, if (unlikely(err)) page_zero_new_buffers(page, from, to); else if (decrypt) - err = fscrypt_decrypt_page(page); + err = fscrypt_decrypt_page(page->mapping->host, page, + PAGE_SIZE, 0, page->index); return err; } #endif @@ -3824,7 +3825,8 @@ static int __ext4_block_zero_page_range(handle_t *handle, /* We expect the key to be set. */ BUG_ON(!fscrypt_has_encryption_key(inode)); BUG_ON(blocksize != PAGE_SIZE); - WARN_ON_ONCE(fscrypt_decrypt_page(page)); + WARN_ON_ONCE(fscrypt_decrypt_page(page->mapping->host, + page, PAGE_SIZE, 0, page->index)); } } if (ext4_should_journal_data(inode)) { diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index cec9280950b8..15ca15cb9f28 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -191,6 +191,7 @@ journal_err_out: return err; } +#ifdef CONFIG_EXT4_FS_ENCRYPTION static int uuid_is_zero(__u8 u[16]) { int i; @@ -200,6 +201,7 @@ static int uuid_is_zero(__u8 u[16]) return 0; return 1; } +#endif static int ext4_ioctl_setflags(struct inode *inode, unsigned int flags) @@ -770,28 +772,19 @@ resizefs_out: } case EXT4_IOC_PRECACHE_EXTENTS: return ext4_ext_precache(inode); - case EXT4_IOC_SET_ENCRYPTION_POLICY: { -#ifdef CONFIG_EXT4_FS_ENCRYPTION - struct fscrypt_policy policy; + case EXT4_IOC_SET_ENCRYPTION_POLICY: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; + return fscrypt_ioctl_set_policy(filp, (const void __user *)arg); - if (copy_from_user(&policy, - (struct fscrypt_policy __user *)arg, - sizeof(policy))) - return -EFAULT; - return fscrypt_process_policy(filp, &policy); -#else - return -EOPNOTSUPP; -#endif - } case EXT4_IOC_GET_ENCRYPTION_PWSALT: { +#ifdef CONFIG_EXT4_FS_ENCRYPTION int err, err2; struct ext4_sb_info *sbi = EXT4_SB(sb); handle_t *handle; - if (!ext4_sb_has_crypto(sb)) + if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) { err = mnt_want_write_file(filp); @@ -821,30 +814,18 @@ resizefs_out: sbi->s_es->s_encrypt_pw_salt, 16)) return -EFAULT; return 0; - } - case EXT4_IOC_GET_ENCRYPTION_POLICY: { -#ifdef CONFIG_EXT4_FS_ENCRYPTION - struct fscrypt_policy policy; - int err = 0; - - if (!ext4_encrypted_inode(inode)) - return -ENOENT; - err = fscrypt_get_policy(inode, &policy); - if (err) - return err; - if (copy_to_user((void __user *)arg, &policy, sizeof(policy))) - return -EFAULT; - return 0; #else return -EOPNOTSUPP; #endif } + case EXT4_IOC_GET_ENCRYPTION_POLICY: + return fscrypt_ioctl_get_policy(filp, (void __user *)arg); + case EXT4_IOC_FSGETXATTR: { struct fsxattr fa; memset(&fa, 0, sizeof(struct fsxattr)); - ext4_get_inode_flags(ei); fa.fsx_xflags = ext4_iflags_to_xflags(ei->i_flags & EXT4_FL_USER_VISIBLE); if (ext4_has_feature_project(inode->i_sb)) { diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 00b8a5a66961..c30889b60848 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1237,37 +1237,24 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) } /* - * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure. + * Test whether a directory entry matches the filename being searched for. * - * `len <= EXT4_NAME_LEN' is guaranteed by caller. - * `de != NULL' is guaranteed by caller. + * Return: %true if the directory entry matches, otherwise %false. */ -static inline int ext4_match(struct ext4_filename *fname, - struct ext4_dir_entry_2 *de) +static inline bool ext4_match(const struct ext4_filename *fname, + const struct ext4_dir_entry_2 *de) { - const void *name = fname_name(fname); - u32 len = fname_len(fname); + struct fscrypt_name f; if (!de->inode) - return 0; + return false; + f.usr_fname = fname->usr_fname; + f.disk_name = fname->disk_name; #ifdef CONFIG_EXT4_FS_ENCRYPTION - if (unlikely(!name)) { - if (fname->usr_fname->name[0] == '_') { - int ret; - if (de->name_len <= 32) - return 0; - ret = memcmp(de->name + ((de->name_len - 17) & ~15), - fname->crypto_buf.name + 8, 16); - return (ret == 0) ? 1 : 0; - } - name = fname->crypto_buf.name; - len = fname->crypto_buf.len; - } + f.crypto_buf = fname->crypto_buf; #endif - if (de->name_len != len) - return 0; - return (memcmp(de->name, name, len) == 0) ? 1 : 0; + return fscrypt_match_name(&f, de->name, de->name_len); } /* @@ -1281,48 +1268,31 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, struct ext4_dir_entry_2 * de; char * dlimit; int de_len; - int res; de = (struct ext4_dir_entry_2 *)search_buf; dlimit = search_buf + buf_size; while ((char *) de < dlimit) { /* this code is executed quadratically often */ /* do minimal checking `by hand' */ - if ((char *) de + de->name_len <= dlimit) { - res = ext4_match(fname, de); - if (res < 0) { - res = -1; - goto return_result; - } - if (res > 0) { - /* found a match - just to be sure, do - * a full check */ - if (ext4_check_dir_entry(dir, NULL, de, bh, - bh->b_data, - bh->b_size, offset)) { - res = -1; - goto return_result; - } - *res_dir = de; - res = 1; - goto return_result; - } - + if ((char *) de + de->name_len <= dlimit && + ext4_match(fname, de)) { + /* found a match - just to be sure, do + * a full check */ + if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, + bh->b_size, offset)) + return -1; + *res_dir = de; + return 1; } /* prevent looping on a bad block */ de_len = ext4_rec_len_from_disk(de->rec_len, dir->i_sb->s_blocksize); - if (de_len <= 0) { - res = -1; - goto return_result; - } + if (de_len <= 0) + return -1; offset += de_len; de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); } - - res = 0; -return_result: - return res; + return 0; } static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block, @@ -1378,6 +1348,8 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, return NULL; retval = ext4_fname_setup_filename(dir, d_name, 1, &fname); + if (retval == -ENOENT) + return NULL; if (retval) return ERR_PTR(retval); @@ -1614,16 +1586,9 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi if (!IS_ERR(inode) && ext4_encrypted_inode(dir) && (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && !fscrypt_has_permitted_context(dir, inode)) { - int nokey = ext4_encrypted_inode(inode) && - !fscrypt_has_encryption_key(inode); - if (nokey) { - iput(inode); - return ERR_PTR(-ENOKEY); - } ext4_warning(inode->i_sb, "Inconsistent encryption contexts: %lu/%lu", - (unsigned long) dir->i_ino, - (unsigned long) inode->i_ino); + dir->i_ino, inode->i_ino); iput(inode); return ERR_PTR(-EPERM); } @@ -1831,24 +1796,15 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, int nlen, rlen; unsigned int offset = 0; char *top; - int res; de = (struct ext4_dir_entry_2 *)buf; top = buf + buf_size - reclen; while ((char *) de <= top) { if (ext4_check_dir_entry(dir, NULL, de, bh, - buf, buf_size, offset)) { - res = -EFSCORRUPTED; - goto return_result; - } - /* Provide crypto context and crypto buffer to ext4 match */ - res = ext4_match(fname, de); - if (res < 0) - goto return_result; - if (res > 0) { - res = -EEXIST; - goto return_result; - } + buf, buf_size, offset)) + return -EFSCORRUPTED; + if (ext4_match(fname, de)) + return -EEXIST; nlen = EXT4_DIR_REC_LEN(de->name_len); rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); if ((de->inode ? rlen - nlen : rlen) >= reclen) @@ -1856,22 +1812,17 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, de = (struct ext4_dir_entry_2 *)((char *)de + rlen); offset += rlen; } - if ((char *) de > top) - res = -ENOSPC; - else { - *dest_de = de; - res = 0; - } -return_result: - return res; + return -ENOSPC; + + *dest_de = de; + return 0; } -int ext4_insert_dentry(struct inode *dir, - struct inode *inode, - struct ext4_dir_entry_2 *de, - int buf_size, - struct ext4_filename *fname) +void ext4_insert_dentry(struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, + struct ext4_filename *fname) { int nlen, rlen; @@ -1890,7 +1841,6 @@ int ext4_insert_dentry(struct inode *dir, ext4_set_de_type(inode->i_sb, de, inode->i_mode); de->name_len = fname_len(fname); memcpy(de->name, fname_name(fname), fname_len(fname)); - return 0; } /* @@ -1926,11 +1876,8 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, return err; } - /* By now the buffer is marked for journaling. Due to crypto operations, - * the following function call may fail */ - err = ext4_insert_dentry(dir, inode, de, blocksize, fname); - if (err < 0) - return err; + /* By now the buffer is marked for journaling */ + ext4_insert_dentry(inode, de, blocksize, fname); /* * XXX shouldn't update any times until successful @@ -3090,7 +3037,7 @@ static int ext4_symlink(struct inode *dir, if (err) return err; if (!fscrypt_has_encryption_key(dir)) - return -EPERM; + return -ENOKEY; disk_link.len = (fscrypt_fname_encrypted_size(dir, len) + sizeof(struct fscrypt_symlink_data)); sd = kzalloc(disk_link.len, GFP_KERNEL); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 0094923e5ebf..0718a8667f0c 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -24,7 +24,6 @@ #include #include #include -#include #include "ext4_jbd2.h" #include "xattr.h" @@ -470,7 +469,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io, gfp_t gfp_flags = GFP_NOFS; retry_encrypt: - data_page = fscrypt_encrypt_page(inode, page, gfp_flags); + data_page = fscrypt_encrypt_page(inode, page, PAGE_SIZE, 0, + page->index, gfp_flags); if (IS_ERR(data_page)) { ret = PTR_ERR(data_page); if (ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f72535e1898f..01b39e7447d6 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1102,51 +1102,65 @@ static int ext4_get_context(struct inode *inode, void *ctx, size_t len) EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len); } -static int ext4_key_prefix(struct inode *inode, u8 **key) -{ - *key = EXT4_SB(inode->i_sb)->key_prefix; - return EXT4_SB(inode->i_sb)->key_prefix_size; -} - -static int ext4_prepare_context(struct inode *inode) -{ - return ext4_convert_inline_data(inode); -} - static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, void *fs_data) { - handle_t *handle; - int res, res2; + handle_t *handle = fs_data; + int res, res2, retries = 0; - /* fs_data is null when internally used. */ - if (fs_data) { - res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, - len, 0); + res = ext4_convert_inline_data(inode); + if (res) + return res; + + /* + * If a journal handle was specified, then the encryption context is + * being set on a new inode via inheritance and is part of a larger + * transaction to create the inode. Otherwise the encryption context is + * being set on an existing inode in its own transaction. Only in the + * latter case should the "retry on ENOSPC" logic be used. + */ + + if (handle) { + res = ext4_xattr_set_handle(handle, inode, + EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, 0); if (!res) { ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + /* + * Update inode->i_flags - e.g. S_DAX may get disabled + */ + ext4_set_inode_flags(inode); } return res; } + res = dquot_initialize(inode); + if (res) + return res; +retry: handle = ext4_journal_start(inode, EXT4_HT_MISC, ext4_jbd2_credits_xattr(inode)); if (IS_ERR(handle)) return PTR_ERR(handle); - res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, - len, 0); + res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, 0); if (!res) { ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); + /* Update inode->i_flags - e.g. S_DAX may get disabled */ + ext4_set_inode_flags(inode); res = ext4_mark_inode_dirty(handle, inode); if (res) EXT4_ERROR_INODE(inode, "Failed to mark inode dirty"); } res2 = ext4_journal_stop(handle); + + if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; if (!res) res = res2; return res; @@ -1163,10 +1177,9 @@ static unsigned ext4_max_namelen(struct inode *inode) EXT4_NAME_LEN; } -static struct fscrypt_operations ext4_cryptops = { +static const struct fscrypt_operations ext4_cryptops = { + .key_prefix = "ext4:", .get_context = ext4_get_context, - .key_prefix = ext4_key_prefix, - .prepare_context = ext4_prepare_context, .set_context = ext4_set_context, .dummy_context = ext4_dummy_context, .is_encrypted = ext4_encrypted_inode, @@ -1174,7 +1187,7 @@ static struct fscrypt_operations ext4_cryptops = { .max_namelen = ext4_max_namelen, }; #else -static struct fscrypt_operations ext4_cryptops = { +static const struct fscrypt_operations ext4_cryptops = { .is_encrypted = ext4_encrypted_inode, }; #endif @@ -4220,11 +4233,6 @@ no_journal: ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10); kfree(orig_data); -#ifdef CONFIG_EXT4_FS_ENCRYPTION - memcpy(sbi->key_prefix, EXT4_KEY_DESC_PREFIX, - EXT4_KEY_DESC_PREFIX_SIZE); - sbi->key_prefix_size = EXT4_KEY_DESC_PREFIX_SIZE; -#endif return 0; cantfind_ext4: diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index ca949ea7c02f..a0dc559b1b47 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_F2FS_FS) += f2fs.o f2fs-y := dir.o file.o inode.o namei.o hash.o super.o inline.o f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o -f2fs-y += shrinker.o extent_cache.o +f2fs-y += shrinker.o extent_cache.o sysfs.o f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 55aa29c0c78d..436b3a1464d9 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -207,15 +207,16 @@ static int __f2fs_set_acl(struct inode *inode, int type, void *value = NULL; size_t size = 0; int error; + umode_t mode = inode->i_mode; switch (type) { case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl && !ipage) { - error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + error = posix_acl_update_mode(inode, &mode, &acl); if (error) return error; - set_acl_inode(inode, inode->i_mode); + set_acl_inode(inode, mode); } break; @@ -233,7 +234,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, value = f2fs_acl_to_disk(F2FS_I_SB(inode), acl, &size); if (IS_ERR(value)) { clear_inode_flag(inode, FI_ACL_MODE); - return (int)PTR_ERR(value); + return PTR_ERR(value); } } @@ -384,7 +385,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, if (error) return error; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (default_acl) { error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl, diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index b4dbc2f59656..04fe1df052b2 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -31,7 +31,7 @@ void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) set_ckpt_flags(sbi, CP_ERROR_FLAG); sbi->sb->s_flags |= MS_RDONLY; if (!end_io) - f2fs_flush_merged_bios(sbi); + f2fs_flush_merged_writes(sbi); } /* @@ -65,7 +65,7 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, .sbi = sbi, .type = META, .op = REQ_OP_READ, - .op_flags = READ_SYNC | REQ_META | REQ_PRIO, + .op_flags = REQ_META | REQ_PRIO, .old_blkaddr = index, .new_blkaddr = index, .encrypted_page = NULL, @@ -160,8 +160,9 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, .sbi = sbi, .type = META, .op = REQ_OP_READ, - .op_flags = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : REQ_RAHEAD, + .op_flags = sync ? (REQ_META | REQ_PRIO) : REQ_RAHEAD, .encrypted_page = NULL, + .in_list = false, }; struct blk_plug plug; @@ -207,12 +208,10 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, } fio.page = page; - fio.old_blkaddr = fio.new_blkaddr; - f2fs_submit_page_mbio(&fio); + f2fs_submit_page_bio(&fio); f2fs_put_page(page, 0); } out: - f2fs_submit_merged_bio(sbi, META, READ); blk_finish_plug(&plug); return blkno - start; } @@ -228,11 +227,12 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) f2fs_put_page(page, 0); if (readahead) - ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR, true); + ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true); } -static int f2fs_write_meta_page(struct page *page, - struct writeback_control *wbc) +static int __f2fs_write_meta_page(struct page *page, + struct writeback_control *wbc, + enum iostat_type io_type) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); @@ -245,16 +245,17 @@ static int f2fs_write_meta_page(struct page *page, if (unlikely(f2fs_cp_error(sbi))) goto redirty_out; - write_meta_page(sbi, page); + write_meta_page(sbi, page, io_type); dec_page_count(sbi, F2FS_DIRTY_META); if (wbc->for_reclaim) - f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, META, WRITE); + f2fs_submit_merged_write_cond(sbi, page->mapping->host, + 0, page->index, META); unlock_page(page); if (unlikely(f2fs_cp_error(sbi))) - f2fs_submit_merged_bio(sbi, META, WRITE); + f2fs_submit_merged_write(sbi, META); return 0; @@ -263,23 +264,33 @@ redirty_out: return AOP_WRITEPAGE_ACTIVATE; } +static int f2fs_write_meta_page(struct page *page, + struct writeback_control *wbc) +{ + return __f2fs_write_meta_page(page, wbc, FS_META_IO); +} + static int f2fs_write_meta_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); long diff, written; + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto skip_write; + /* collect a number of dirty meta pages and write together */ if (wbc->for_kupdate || get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META)) goto skip_write; - trace_f2fs_writepages(mapping->host, wbc, META); + /* if locked failed, cp will flush dirty pages instead */ + if (!mutex_trylock(&sbi->cp_mutex)) + goto skip_write; - /* if mounting is failed, skip writing node pages */ - mutex_lock(&sbi->cp_mutex); + trace_f2fs_writepages(mapping->host, wbc, META); diff = nr_pages_to_write(sbi, META, wbc); - written = sync_meta_pages(sbi, META, wbc->nr_to_write); + written = sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO); mutex_unlock(&sbi->cp_mutex); wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff); return 0; @@ -291,7 +302,7 @@ skip_write: } long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, - long nr_to_write) + long nr_to_write, enum iostat_type io_type) { struct address_space *mapping = META_MAPPING(sbi); pgoff_t index = 0, end = ULONG_MAX, prev = ULONG_MAX; @@ -342,7 +353,7 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - if (mapping->a_ops->writepage(page, &wbc)) { + if (__f2fs_write_meta_page(page, &wbc, io_type)) { unlock_page(page); break; } @@ -356,7 +367,7 @@ continue_unlock: } stop: if (nwritten) - f2fs_submit_merged_bio(sbi, type, WRITE); + f2fs_submit_merged_write(sbi, type); blk_finish_plug(&plug); @@ -493,6 +504,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi) #ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_ORPHAN)) { spin_unlock(&im->ino_lock); + f2fs_show_injection_info(FAULT_ORPHAN); return -ENOSPC; } #endif @@ -565,7 +577,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) if (ni.blk_addr != NULL_ADDR) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_msg(sbi->sb, KERN_WARNING, - "%s: orphan failed (ino=%x), run fsck to fix.", + "%s: orphan failed (ino=%x) by kernel, retry mount.", __func__, ino); return -EIO; } @@ -576,11 +588,24 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) int recover_orphan_inodes(struct f2fs_sb_info *sbi) { block_t start_blk, orphan_blocks, i, j; - int err; + unsigned int s_flags = sbi->sb->s_flags; + int err = 0; if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG)) return 0; + if (s_flags & MS_RDONLY) { + f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs"); + sbi->sb->s_flags &= ~MS_RDONLY; + } + +#ifdef CONFIG_QUOTA + /* Needed for iput() to work correctly and not trash data */ + sbi->sb->s_flags |= MS_ACTIVE; + /* Turn on quotas so that they are updated correctly */ + f2fs_enable_quota_files(sbi); +#endif + start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi); @@ -596,14 +621,21 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) err = recover_orphan_inode(sbi, ino); if (err) { f2fs_put_page(page, 1); - return err; + goto out; } } f2fs_put_page(page, 1); } /* clear Orphan Flag */ clear_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG); - return 0; +out: +#ifdef CONFIG_QUOTA + /* Turn quotas off */ + f2fs_quota_off_umount(sbi->sb); +#endif + sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + + return err; } static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) @@ -675,14 +707,13 @@ static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr, *cp_block = (struct f2fs_checkpoint *)page_address(*cp_page); crc_offset = le32_to_cpu((*cp_block)->checksum_offset); - if (crc_offset >= blk_size) { + if (crc_offset > (blk_size - sizeof(__le32))) { f2fs_msg(sbi->sb, KERN_WARNING, "invalid crc_offset: %zu", crc_offset); return -EINVAL; } - crc = le32_to_cpu(*((__le32 *)((unsigned char *)*cp_block - + crc_offset))); + crc = cur_cp_crc(*cp_block); if (!f2fs_crc_valid(sbi, crc, *cp_block, crc_offset)) { f2fs_msg(sbi->sb, KERN_WARNING, "invalid crc value"); return -EINVAL; @@ -770,7 +801,7 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) /* Sanity checking of checkpoint */ if (sanity_check_ckpt(sbi)) - goto fail_no_cp; + goto free_fail_no_cp; if (cur_page == cp1) sbi->cur_cp_pack = 1; @@ -798,6 +829,9 @@ done: f2fs_put_page(cp2, 1); return 0; +free_fail_no_cp: + f2fs_put_page(cp1, 1); + f2fs_put_page(cp2, 1); fail_no_cp: kfree(sbi->ckpt); return -EINVAL; @@ -812,7 +846,9 @@ static void __add_dirty_inode(struct inode *inode, enum inode_type type) return; set_inode_flag(inode, flag); - list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]); + if (!f2fs_is_volatile_file(inode)) + list_add_tail(&F2FS_I(inode)->dirty_list, + &sbi->inode_list[type]); stat_inc_dirty_inode(sbi, type); } @@ -870,6 +906,7 @@ int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) struct inode *inode; struct f2fs_inode_info *fi; bool is_dir = (type == DIR_INODE); + unsigned long ino = 0; trace_f2fs_sync_dirty_inodes_enter(sbi->sb, is_dir, get_pages(sbi, is_dir ? @@ -888,18 +925,34 @@ retry: F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); return 0; } - fi = list_entry(head->next, struct f2fs_inode_info, dirty_list); + fi = list_first_entry(head, struct f2fs_inode_info, dirty_list); inode = igrab(&fi->vfs_inode); spin_unlock(&sbi->inode_lock[type]); if (inode) { + unsigned long cur_ino = inode->i_ino; + + if (is_dir) + F2FS_I(inode)->cp_task = current; + filemap_fdatawrite(inode->i_mapping); + + if (is_dir) + F2FS_I(inode)->cp_task = NULL; + iput(inode); + /* We need to give cpu to another writers. */ + if (ino == cur_ino) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + cond_resched(); + } else { + ino = cur_ino; + } } else { /* * We should submit bio, since it exists several * wribacking dentry pages in the freeing inode. */ - f2fs_submit_merged_bio(sbi, DATA, WRITE); + f2fs_submit_merged_write(sbi, DATA); cond_resched(); } goto retry; @@ -921,18 +974,35 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) spin_unlock(&sbi->inode_lock[DIRTY_META]); return 0; } - fi = list_entry(head->next, struct f2fs_inode_info, + fi = list_first_entry(head, struct f2fs_inode_info, gdirty_list); inode = igrab(&fi->vfs_inode); spin_unlock(&sbi->inode_lock[DIRTY_META]); if (inode) { - update_inode_page(inode); + sync_inode_metadata(inode, 0); + + /* it's on eviction */ + if (is_inode_flag_set(inode, FI_DIRTY_INODE)) + update_inode_page(inode); iput(inode); } }; return 0; } +static void __prepare_cp_block(struct f2fs_sb_info *sbi) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_nm_info *nm_i = NM_I(sbi); + nid_t last_nid = nm_i->next_scan_nid; + + next_free_nid(sbi, &last_nid); + ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi)); + ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi)); + ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi)); + ckpt->next_free_nid = cpu_to_le32(last_nid); +} + /* * Freeze all the FS-operations for checkpoint. */ @@ -956,33 +1026,47 @@ retry_flush_dents: err = sync_dirty_inodes(sbi, DIR_INODE); if (err) goto out; - goto retry_flush_dents; - } - - if (get_pages(sbi, F2FS_DIRTY_IMETA)) { - f2fs_unlock_all(sbi); - err = f2fs_sync_inode_meta(sbi); - if (err) - goto out; + cond_resched(); goto retry_flush_dents; } /* * POR: we should ensure that there are no dirty node pages - * until finishing nat/sit flush. + * until finishing nat/sit flush. inode->i_blocks can be updated. */ + down_write(&sbi->node_change); + + if (get_pages(sbi, F2FS_DIRTY_IMETA)) { + up_write(&sbi->node_change); + f2fs_unlock_all(sbi); + err = f2fs_sync_inode_meta(sbi); + if (err) + goto out; + cond_resched(); + goto retry_flush_dents; + } + retry_flush_nodes: down_write(&sbi->node_write); if (get_pages(sbi, F2FS_DIRTY_NODES)) { up_write(&sbi->node_write); - err = sync_node_pages(sbi, &wbc); + err = sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO); if (err) { + up_write(&sbi->node_change); f2fs_unlock_all(sbi); goto out; } + cond_resched(); goto retry_flush_nodes; } + + /* + * sbi->node_change is used only for AIO write_begin path which produces + * dirty node blocks and some checkpoint values by block allocation. + */ + __prepare_cp_block(sbi); + up_write(&sbi->node_change); out: blk_finish_plug(&plug); return err; @@ -991,8 +1075,6 @@ out: static void unblock_operations(struct f2fs_sb_info *sbi) { up_write(&sbi->node_write); - - build_free_nids(sbi); f2fs_unlock_all(sbi); } @@ -1003,7 +1085,7 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) for (;;) { prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE); - if (!atomic_read(&sbi->nr_wb_bios)) + if (!get_pages(sbi, F2FS_WB_CP_DATA)) break; io_schedule_timeout(5*HZ); @@ -1015,15 +1097,24 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) { unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + unsigned long flags; - spin_lock(&sbi->cp_lock); + spin_lock_irqsave(&sbi->cp_lock, flags); - if (cpc->reason == CP_UMOUNT) + if ((cpc->reason & CP_UMOUNT) && + le32_to_cpu(ckpt->cp_pack_total_block_count) > + sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) + disable_nat_bits(sbi, false); + + if (cpc->reason & CP_TRIMMED) + __set_ckpt_flags(ckpt, CP_TRIMMED_FLAG); + + if (cpc->reason & CP_UMOUNT) __set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); else __clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); - if (cpc->reason == CP_FASTBOOT) + if (cpc->reason & CP_FASTBOOT) __set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); else __clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); @@ -1039,15 +1130,14 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* set this flag to activate crc|cp_ver for recovery */ __set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG); - spin_unlock(&sbi->cp_lock); + spin_unlock_irqrestore(&sbi->cp_lock, flags); } static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct f2fs_nm_info *nm_i = NM_I(sbi); - unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; - nid_t last_nid = nm_i->next_scan_nid; + unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num, flags; block_t start_blk; unsigned int data_sum_blocks, orphan_blocks; __u32 crc32 = 0; @@ -1059,19 +1149,16 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) { - sync_meta_pages(sbi, META, LONG_MAX); + sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); if (unlikely(f2fs_cp_error(sbi))) return -EIO; } - next_free_nid(sbi, &last_nid); - /* * modify checkpoint * version number is already updated */ ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi)); - ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi)); ckpt->free_segment_count = cpu_to_le32(free_segments(sbi)); for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { ckpt->cur_node_segno[i] = @@ -1090,18 +1177,14 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) curseg_alloc_type(sbi, i + CURSEG_HOT_DATA); } - ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi)); - ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi)); - ckpt->next_free_nid = cpu_to_le32(last_nid); - /* 2 cp + n data seg summary + orphan inode blocks */ data_sum_blocks = npages_for_summary_flush(sbi, false); - spin_lock(&sbi->cp_lock); + spin_lock_irqsave(&sbi->cp_lock, flags); if (data_sum_blocks < NR_CURSEG_DATA_TYPE) __set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); else __clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); - spin_unlock(&sbi->cp_lock); + spin_unlock_irqrestore(&sbi->cp_lock, flags); orphan_blocks = GET_ORPHAN_BLOCKS(orphan_num); ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + @@ -1130,6 +1213,27 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) start_blk = __start_cp_next_addr(sbi); + /* write nat bits */ + if (enabled_nat_bits(sbi, cpc)) { + __u64 cp_ver = cur_cp_version(ckpt); + block_t blk; + + cp_ver |= ((__u64)crc32 << 32); + *(__le64 *)nm_i->nat_bits = cpu_to_le64(cp_ver); + + blk = start_blk + sbi->blocks_per_seg - nm_i->nat_bits_blocks; + for (i = 0; i < nm_i->nat_bits_blocks; i++) + update_meta_page(sbi, nm_i->nat_bits + + (i << F2FS_BLKSIZE_BITS), blk + i); + + /* Flush all the NAT BITS pages */ + while (get_pages(sbi, F2FS_DIRTY_META)) { + sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + } + } + /* need to wait for end_io results */ wait_on_all_pages_writeback(sbi); if (unlikely(f2fs_cp_error(sbi))) @@ -1179,7 +1283,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) percpu_counter_set(&sbi->alloc_valid_block_count, 0); /* Here, we only have one bio having CP pack */ - sync_meta_pages(sbi, META_FLUSH, LONG_MAX); + sync_meta_pages(sbi, META_FLUSH, LONG_MAX, FS_CP_META_IO); /* wait for previous submitted meta pages writeback */ wait_on_all_pages_writeback(sbi); @@ -1189,7 +1293,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (unlikely(f2fs_cp_error(sbi))) return -EIO; - clear_prefree_segments(sbi, cpc); clear_sbi_flag(sbi, SBI_IS_DIRTY); clear_sbi_flag(sbi, SBI_NEED_CP); __set_cp_next_pack(sbi); @@ -1219,8 +1322,8 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) mutex_lock(&sbi->cp_mutex); if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) && - (cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC || - (cpc->reason == CP_DISCARD && !sbi->discard_blks))) + ((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) || + ((cpc->reason & CP_DISCARD) && !sbi->discard_blks))) goto out; if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; @@ -1239,18 +1342,23 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops"); - f2fs_flush_merged_bios(sbi); + f2fs_flush_merged_writes(sbi); /* this is the case of multiple fstrims without any changes */ - if (cpc->reason == CP_DISCARD && !is_sbi_flag_set(sbi, SBI_IS_DIRTY)) { - f2fs_bug_on(sbi, NM_I(sbi)->dirty_nat_cnt); - f2fs_bug_on(sbi, SIT_I(sbi)->dirty_sentries); - f2fs_bug_on(sbi, prefree_segments(sbi)); - flush_sit_entries(sbi, cpc); - clear_prefree_segments(sbi, cpc); - f2fs_wait_all_discard_bio(sbi); - unblock_operations(sbi); - goto out; + if (cpc->reason & CP_DISCARD) { + if (!exist_trim_candidates(sbi, cpc)) { + unblock_operations(sbi); + goto out; + } + + if (NM_I(sbi)->dirty_nat_cnt == 0 && + SIT_I(sbi)->dirty_sentries == 0 && + prefree_segments(sbi) == 0) { + flush_sit_entries(sbi, cpc); + clear_prefree_segments(sbi, cpc); + unblock_operations(sbi); + goto out; + } } /* @@ -1262,18 +1370,20 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver); /* write cached NAT/SIT entries to NAT/SIT area */ - flush_nat_entries(sbi); + flush_nat_entries(sbi, cpc); flush_sit_entries(sbi, cpc); /* unlock all the fs_lock[] in do_checkpoint() */ err = do_checkpoint(sbi, cpc); - - f2fs_wait_all_discard_bio(sbi); + if (err) + release_discard_addrs(sbi); + else + clear_prefree_segments(sbi, cpc); unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); - if (cpc->reason == CP_RECOVERY) + if (cpc->reason & CP_RECOVERY) f2fs_msg(sbi->sb, KERN_NOTICE, "checkpoint: version = %llx", ckpt_ver); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 2c5ae0b41ada..16f86668fa4e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -30,14 +30,36 @@ #include #include +static bool __is_cp_guaranteed(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode; + struct f2fs_sb_info *sbi; + + if (!mapping) + return false; + + inode = mapping->host; + sbi = F2FS_I_SB(inode); + + if (inode->i_ino == F2FS_META_INO(sbi) || + inode->i_ino == F2FS_NODE_INO(sbi) || + S_ISDIR(inode->i_mode) || + is_cold_data(page)) + return true; + return false; +} + static void f2fs_read_end_io(struct bio *bio) { struct bio_vec *bvec; int i; #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) + if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) { + f2fs_show_injection_info(FAULT_IO); bio->bi_error = -EIO; + } #endif if (f2fs_bio_encrypted(bio)) { @@ -72,6 +94,18 @@ static void f2fs_write_end_io(struct bio *bio) bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; + enum count_type type = WB_DATA_TYPE(page); + + if (IS_DUMMY_WRITTEN_PAGE(page)) { + set_page_private(page, (unsigned long)NULL); + ClearPagePrivate(page); + unlock_page(page); + mempool_free(page, sbi->write_io_dummy); + + if (unlikely(bio->bi_error)) + f2fs_stop_checkpoint(sbi, true); + continue; + } fscrypt_pullback_bio_page(&page, true); @@ -79,15 +113,57 @@ static void f2fs_write_end_io(struct bio *bio) mapping_set_error(page->mapping, -EIO); f2fs_stop_checkpoint(sbi, true); } + dec_page_count(sbi, type); + clear_cold_data(page); end_page_writeback(page); } - if (atomic_dec_and_test(&sbi->nr_wb_bios) && + if (!get_pages(sbi, F2FS_WB_CP_DATA) && wq_has_sleeper(&sbi->cp_wait)) wake_up(&sbi->cp_wait); bio_put(bio); } +/* + * Return true, if pre_bio's bdev is same as its target device. + */ +struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, + block_t blk_addr, struct bio *bio) +{ + struct block_device *bdev = sbi->sb->s_bdev; + int i; + + for (i = 0; i < sbi->s_ndevs; i++) { + if (FDEV(i).start_blk <= blk_addr && + FDEV(i).end_blk >= blk_addr) { + blk_addr -= FDEV(i).start_blk; + bdev = FDEV(i).bdev; + break; + } + } + if (bio) { + bio->bi_bdev = bdev; + bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr); + } + return bdev; +} + +int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + int i; + + for (i = 0; i < sbi->s_ndevs; i++) + if (FDEV(i).start_blk <= blkaddr && FDEV(i).end_blk >= blkaddr) + return i; + return 0; +} + +static bool __same_bdev(struct f2fs_sb_info *sbi, + block_t blk_addr, struct bio *bio) +{ + return f2fs_target_device(sbi, blk_addr, NULL) == bio->bi_bdev; +} + /* * Low-level block read/write IO operations. */ @@ -98,8 +174,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, bio = f2fs_bio_alloc(npages); - bio->bi_bdev = sbi->sb->s_bdev; - bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr); + f2fs_target_device(sbi, blk_addr, bio); bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; bio->bi_private = is_read ? NULL : sbi; @@ -110,11 +185,46 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, struct bio *bio, enum page_type type) { if (!is_read_io(bio_op(bio))) { - atomic_inc(&sbi->nr_wb_bios); - if (f2fs_sb_mounted_hmsmr(sbi->sb) && + unsigned int start; + + if (f2fs_sb_mounted_blkzoned(sbi->sb) && current->plug && (type == DATA || type == NODE)) blk_finish_plug(current->plug); + + if (type != DATA && type != NODE) + goto submit_io; + + start = bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS; + start %= F2FS_IO_SIZE(sbi); + + if (start == 0) + goto submit_io; + + /* fill dummy pages */ + for (; start < F2FS_IO_SIZE(sbi); start++) { + struct page *page = + mempool_alloc(sbi->write_io_dummy, + GFP_NOIO | __GFP_ZERO | __GFP_NOFAIL); + f2fs_bug_on(sbi, !page); + + SetPagePrivate(page); + set_page_private(page, (unsigned long)DUMMY_WRITTEN_PAGE); + lock_page(page); + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) + f2fs_bug_on(sbi, 1); + } + /* + * In the NODE case, we lose next block address chain. So, we + * need to do checkpoint in f2fs_sync_file. + */ + if (type == NODE) + set_sbi_flag(sbi, SBI_NEED_CP); } +submit_io: + if (is_read_io(bio_op(bio))) + trace_f2fs_submit_read_bio(sbi->sb, type, bio); + else + trace_f2fs_submit_write_bio(sbi->sb, type, bio); submit_bio(bio); } @@ -125,19 +235,19 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) if (!io->bio) return; - if (is_read_io(fio->op)) - trace_f2fs_submit_read_bio(io->sbi->sb, fio, io->bio); - else - trace_f2fs_submit_write_bio(io->sbi->sb, fio, io->bio); - bio_set_op_attrs(io->bio, fio->op, fio->op_flags); + if (is_read_io(fio->op)) + trace_f2fs_prepare_read_bio(io->sbi->sb, fio->type, io->bio); + else + trace_f2fs_prepare_write_bio(io->sbi->sb, fio->type, io->bio); + __submit_bio(io->sbi, io->bio, fio->type); io->bio = NULL; } -static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, - struct page *page, nid_t ino) +static bool __has_merged_page(struct f2fs_bio_info *io, + struct inode *inode, nid_t ino, pgoff_t idx) { struct bio_vec *bvec; struct page *target; @@ -146,7 +256,7 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, if (!io->bio) return false; - if (!inode && !page && !ino) + if (!inode && !ino) return true; bio_for_each_segment_all(bvec, io->bio, i) { @@ -156,10 +266,11 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, else target = fscrypt_control_page(bvec->bv_page); + if (idx != target->index) + continue; + if (inode && inode == target->mapping->host) return true; - if (page && page == target) - return true; if (ino && ino == ino_of_node(target)) return true; } @@ -168,72 +279,88 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, } static bool has_merged_page(struct f2fs_sb_info *sbi, struct inode *inode, - struct page *page, nid_t ino, - enum page_type type) + nid_t ino, pgoff_t idx, enum page_type type) { enum page_type btype = PAGE_TYPE_OF_BIO(type); - struct f2fs_bio_info *io = &sbi->write_io[btype]; - bool ret; + enum temp_type temp; + struct f2fs_bio_info *io; + bool ret = false; - down_read(&io->io_rwsem); - ret = __has_merged_page(io, inode, page, ino); - up_read(&io->io_rwsem); + for (temp = HOT; temp < NR_TEMP_TYPE; temp++) { + io = sbi->write_io[btype] + temp; + + down_read(&io->io_rwsem); + ret = __has_merged_page(io, inode, ino, idx); + up_read(&io->io_rwsem); + + /* TODO: use HOT temp only for meta pages now. */ + if (ret || btype == META) + break; + } return ret; } -static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, - struct inode *inode, struct page *page, - nid_t ino, enum page_type type, int rw) +static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, + enum page_type type, enum temp_type temp) { enum page_type btype = PAGE_TYPE_OF_BIO(type); - struct f2fs_bio_info *io; - - io = is_read_io(rw) ? &sbi->read_io : &sbi->write_io[btype]; + struct f2fs_bio_info *io = sbi->write_io[btype] + temp; down_write(&io->io_rwsem); - if (!__has_merged_page(io, inode, page, ino)) - goto out; - /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { io->fio.type = META_FLUSH; io->fio.op = REQ_OP_WRITE; - if (test_opt(sbi, NOBARRIER)) - io->fio.op_flags = WRITE_FLUSH | REQ_META | REQ_PRIO; - else - io->fio.op_flags = WRITE_FLUSH_FUA | REQ_META | - REQ_PRIO; + io->fio.op_flags = REQ_META | REQ_PRIO | REQ_SYNC; + if (!test_opt(sbi, NOBARRIER)) + io->fio.op_flags |= REQ_PREFLUSH | REQ_FUA; } __submit_merged_bio(io); -out: up_write(&io->io_rwsem); } -void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, enum page_type type, - int rw) +static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, + struct inode *inode, nid_t ino, pgoff_t idx, + enum page_type type, bool force) { - __f2fs_submit_merged_bio(sbi, NULL, NULL, 0, type, rw); + enum temp_type temp; + + if (!force && !has_merged_page(sbi, inode, ino, idx, type)) + return; + + for (temp = HOT; temp < NR_TEMP_TYPE; temp++) { + + __f2fs_submit_merged_write(sbi, type, temp); + + /* TODO: use HOT temp only for meta pages now. */ + if (type >= META) + break; + } } -void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *sbi, - struct inode *inode, struct page *page, - nid_t ino, enum page_type type, int rw) +void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type) { - if (has_merged_page(sbi, inode, page, ino, type)) - __f2fs_submit_merged_bio(sbi, inode, page, ino, type, rw); + __submit_merged_write_cond(sbi, NULL, 0, 0, type, true); } -void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi) +void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, + struct inode *inode, nid_t ino, pgoff_t idx, + enum page_type type) { - f2fs_submit_merged_bio(sbi, DATA, WRITE); - f2fs_submit_merged_bio(sbi, NODE, WRITE); - f2fs_submit_merged_bio(sbi, META, WRITE); + __submit_merged_write_cond(sbi, inode, ino, idx, type, false); +} + +void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi) +{ + f2fs_submit_merged_write(sbi, DATA); + f2fs_submit_merged_write(sbi, NODE); + f2fs_submit_merged_write(sbi, META); } /* * Fill the locked page with data located in the block address. - * Return unlocked page. + * A caller needs to unlock the page on failure. */ int f2fs_submit_page_bio(struct f2fs_io_info *fio) { @@ -254,41 +381,65 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) bio_set_op_attrs(bio, fio->op, fio->op_flags); __submit_bio(fio->sbi, bio, fio->type); + + if (!is_read_io(fio->op)) + inc_page_count(fio->sbi, WB_DATA_TYPE(fio->page)); return 0; } -void f2fs_submit_page_mbio(struct f2fs_io_info *fio) +int f2fs_submit_page_write(struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = fio->sbi; enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); - struct f2fs_bio_info *io; - bool is_read = is_read_io(fio->op); + struct f2fs_bio_info *io = sbi->write_io[btype] + fio->temp; struct page *bio_page; + int err = 0; - io = is_read ? &sbi->read_io : &sbi->write_io[btype]; + f2fs_bug_on(sbi, is_read_io(fio->op)); + + down_write(&io->io_rwsem); +next: + if (fio->in_list) { + spin_lock(&io->io_lock); + if (list_empty(&io->io_list)) { + spin_unlock(&io->io_lock); + goto out_fail; + } + fio = list_first_entry(&io->io_list, + struct f2fs_io_info, list); + list_del(&fio->list); + spin_unlock(&io->io_lock); + } if (fio->old_blkaddr != NEW_ADDR) verify_block_addr(sbi, fio->old_blkaddr); verify_block_addr(sbi, fio->new_blkaddr); - down_write(&io->io_rwsem); + bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page; + + /* set submitted = 1 as a return value */ + fio->submitted = 1; + + inc_page_count(sbi, WB_DATA_TYPE(bio_page)); if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || - (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags))) + (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags) || + !__same_bdev(sbi, fio->new_blkaddr, io->bio))) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { - int bio_blocks = MAX_BIO_BLOCKS(sbi); - + if ((fio->type == DATA || fio->type == NODE) && + fio->new_blkaddr & F2FS_IO_SIZE_MASK(sbi)) { + err = -EAGAIN; + dec_page_count(sbi, WB_DATA_TYPE(bio_page)); + goto out_fail; + } io->bio = __bio_alloc(sbi, fio->new_blkaddr, - bio_blocks, is_read); + BIO_MAX_PAGES, false); io->fio = *fio; } - bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page; - - if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < - PAGE_SIZE) { + if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < PAGE_SIZE) { __submit_merged_bio(io); goto alloc_new; } @@ -296,18 +447,74 @@ alloc_new: io->last_block_in_bio = fio->new_blkaddr; f2fs_trace_ios(fio, 0); + trace_f2fs_submit_page_write(fio->page, fio); + + if (fio->in_list) + goto next; +out_fail: up_write(&io->io_rwsem); - trace_f2fs_submit_page_mbio(fio->page, fio); + return err; +} + +static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, + unsigned nr_pages) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct fscrypt_ctx *ctx = NULL; + struct bio *bio; + + if (f2fs_encrypted_file(inode)) { + ctx = fscrypt_get_ctx(inode, GFP_NOFS); + if (IS_ERR(ctx)) + return ERR_CAST(ctx); + + /* wait the page to be moved by cleaning */ + f2fs_wait_on_block_writeback(sbi, blkaddr); + } + + bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); + if (!bio) { + if (ctx) + fscrypt_release_ctx(ctx); + return ERR_PTR(-ENOMEM); + } + f2fs_target_device(sbi, blkaddr, bio); + bio->bi_end_io = f2fs_read_end_io; + bio->bi_private = ctx; + bio_set_op_attrs(bio, REQ_OP_READ, 0); + + return bio; +} + +/* This can handle encryption stuffs */ +static int f2fs_submit_page_read(struct inode *inode, struct page *page, + block_t blkaddr) +{ + struct bio *bio = f2fs_grab_read_bio(inode, blkaddr, 1); + + if (IS_ERR(bio)) + return PTR_ERR(bio); + + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + bio_put(bio); + return -EFAULT; + } + __submit_bio(F2FS_I_SB(inode), bio, DATA); + return 0; } static void __set_data_blkaddr(struct dnode_of_data *dn) { struct f2fs_node *rn = F2FS_NODE(dn->node_page); __le32 *addr_array; + int base = 0; + + if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode)) + base = get_extra_isize(dn->inode); /* Get physical address of data block */ addr_array = blkaddr_in_node(rn); - addr_array[dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); + addr_array[base + dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); } /* @@ -335,14 +542,15 @@ void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + int err; if (!count) return 0; if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; - if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count))) - return -ENOSPC; + if (unlikely((err = inc_valid_block_count(sbi, dn->inode, &count)))) + return err; trace_f2fs_reserve_new_blocks(dn->inode, dn->nid, dn->ofs_in_node, count); @@ -350,8 +558,8 @@ int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) f2fs_wait_on_page_writeback(dn->node_page, NODE, true); for (; count > 0; dn->ofs_in_node++) { - block_t blkaddr = - datablock_addr(dn->node_page, dn->ofs_in_node); + block_t blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); if (blkaddr == NULL_ADDR) { dn->data_blkaddr = NEW_ADDR; __set_data_blkaddr(dn); @@ -393,7 +601,7 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) { - struct extent_info ei; + struct extent_info ei = {0,0,0}; struct inode *inode = dn->inode; if (f2fs_lookup_extent_cache(inode, index, &ei)) { @@ -410,18 +618,8 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; struct page *page; - struct extent_info ei; + struct extent_info ei = {0,0,0}; int err; - struct f2fs_io_info fio = { - .sbi = F2FS_I_SB(inode), - .type = DATA, - .op = REQ_OP_READ, - .op_flags = op_flags, - .encrypted_page = NULL, - }; - - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - return read_mapping_page(mapping, index, NULL); page = f2fs_grab_cache_page(mapping, index, for_write); if (!page) @@ -462,9 +660,7 @@ got_it: return page; } - fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; - fio.page = page; - err = f2fs_submit_page_bio(&fio); + err = f2fs_submit_page_read(inode, page, dn.data_blkaddr); if (err) goto put_err; return page; @@ -484,7 +680,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index) return page; f2fs_put_page(page, 0); - page = get_read_data_page(inode, index, READ_SYNC, false); + page = get_read_data_page(inode, index, 0, false); if (IS_ERR(page)) return page; @@ -510,7 +706,7 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct page *page; repeat: - page = get_read_data_page(inode, index, READ_SYNC, for_write); + page = get_read_data_page(inode, index, 0, for_write); if (IS_ERR(page)) return page; @@ -591,29 +787,27 @@ static int __allocate_data_block(struct dnode_of_data *dn) struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct f2fs_summary sum; struct node_info ni; - int seg = CURSEG_WARM_DATA; pgoff_t fofs; blkcnt_t count = 1; + int err; if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; - dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); if (dn->data_blkaddr == NEW_ADDR) goto alloc; - if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count))) - return -ENOSPC; + if (unlikely((err = inc_valid_block_count(sbi, dn->inode, &count)))) + return err; alloc: get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); - if (dn->ofs_in_node == 0 && dn->inode_page == dn->node_page) - seg = CURSEG_DIRECT_IO; - allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr, - &sum, seg); + &sum, CURSEG_WARM_DATA, NULL, false); set_data_blkaddr(dn); /* update i_size */ @@ -625,11 +819,21 @@ alloc: return 0; } -ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) +static inline bool __force_buffered_io(struct inode *inode, int rw) +{ + return (f2fs_encrypted_file(inode) || + (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) || + F2FS_I_SB(inode)->s_ndevs); +} + +int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); struct f2fs_map_blocks map; - ssize_t ret = 0; + int err = 0; + + if (is_inode_flag_set(inode, FI_NO_PREALLOC)) + return 0; map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos); map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from)); @@ -641,19 +845,37 @@ ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) map.m_next_pgofs = NULL; if (iocb->ki_flags & IOCB_DIRECT) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO); + err = f2fs_convert_inline_inode(inode); + if (err) + return err; + return f2fs_map_blocks(inode, &map, 1, + __force_buffered_io(inode, WRITE) ? + F2FS_GET_BLOCK_PRE_AIO : + F2FS_GET_BLOCK_PRE_DIO); } - if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; + if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA(inode)) { + err = f2fs_convert_inline_inode(inode); + if (err) + return err; } if (!f2fs_has_inline_data(inode)) return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); - return ret; + return err; +} + +static inline void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock) +{ + if (flag == F2FS_GET_BLOCK_PRE_AIO) { + if (lock) + down_read(&sbi->node_change); + else + up_read(&sbi->node_change); + } else { + if (lock) + f2fs_lock_op(sbi); + else + f2fs_unlock_op(sbi); + } } /* @@ -676,8 +898,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int err = 0, ofs = 1; unsigned int ofs_in_node, last_ofs_in_node; blkcnt_t prealloc; - struct extent_info ei; - bool allocated = false; + struct extent_info ei = {0,0,0}; block_t blkaddr; if (!maxblocks) @@ -699,7 +920,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, next_dnode: if (create) - f2fs_lock_op(sbi); + __do_map_lock(sbi, flag, true); /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -721,7 +942,7 @@ next_dnode: end_offset = ADDRS_PER_PAGE(dn.node_page, inode); next_block: - blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { if (create) { @@ -736,14 +957,12 @@ next_block: } } else { err = __allocate_data_block(&dn); - if (!err) { + if (!err) set_inode_flag(inode, FI_APPEND_WRITE); - allocated = true; - } } if (err) goto sync_out; - map->m_flags = F2FS_MAP_NEW; + map->m_flags |= F2FS_MAP_NEW; blkaddr = dn.data_blkaddr; } else { if (flag == F2FS_GET_BLOCK_BMAP) { @@ -794,7 +1013,6 @@ skip: err = reserve_new_blocks(&dn, prealloc); if (err) goto sync_out; - allocated = dn.node_changed; map->m_len += dn.ofs_in_node - ofs_in_node; if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) { @@ -812,18 +1030,17 @@ skip: f2fs_put_dnode(&dn); if (create) { - f2fs_unlock_op(sbi); - f2fs_balance_fs(sbi, allocated); + __do_map_lock(sbi, flag, false); + f2fs_balance_fs(sbi, dn.node_changed); } - allocated = false; goto next_dnode; sync_out: f2fs_put_dnode(&dn); unlock_out: if (create) { - f2fs_unlock_op(sbi); - f2fs_balance_fs(sbi, allocated); + __do_map_lock(sbi, flag, false); + f2fs_balance_fs(sbi, dn.node_changed); } out: trace_f2fs_map_blocks(inode, map, err); @@ -835,19 +1052,19 @@ static int __get_data_block(struct inode *inode, sector_t iblock, pgoff_t *next_pgofs) { struct f2fs_map_blocks map; - int ret; + int err; map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; map.m_next_pgofs = next_pgofs; - ret = f2fs_map_blocks(inode, &map, create, flag); - if (!ret) { + err = f2fs_map_blocks(inode, &map, create, flag); + if (!err) { map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags; - bh->b_size = map.m_len << inode->i_blkbits; + bh->b_size = (u64)map.m_len << inode->i_blkbits; } - return ret; + return err; } static int get_data_block(struct inode *inode, sector_t iblock, @@ -862,7 +1079,7 @@ static int get_data_block_dio(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_DIO, NULL); + F2FS_GET_BLOCK_DEFAULT, NULL); } static int get_data_block_bmap(struct inode *inode, sector_t iblock, @@ -892,7 +1109,6 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct buffer_head map_bh; sector_t start_blk, last_blk; pgoff_t next_pgofs; - loff_t isize; u64 logical = 0, phys = 0, size = 0; u32 flags = 0; int ret = 0; @@ -909,13 +1125,6 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, inode_lock(inode); - isize = i_size_read(inode); - if (start >= isize) - goto out; - - if (start + len > isize) - len = isize - start; - if (logical_to_blk(inode, len) == 0) len = blk_to_logical(inode, 1); @@ -934,13 +1143,11 @@ next: /* HOLE */ if (!buffer_mapped(&map_bh)) { start_blk = next_pgofs; - /* Go through holes util pass the EOF */ - if (blk_to_logical(inode, start_blk) < isize) + + if (blk_to_logical(inode, start_blk) < blk_to_logical(inode, + F2FS_I_SB(inode)->max_file_blocks)) goto prep_next; - /* Found a hole beyond isize means no more extents. - * Note that the premise is that filesystems don't - * punch holes beyond isize and keep size unchanged. - */ + flags |= FIEMAP_EXTENT_LAST; } @@ -978,37 +1185,6 @@ out: return ret; } -static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, - unsigned nr_pages) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct fscrypt_ctx *ctx = NULL; - struct block_device *bdev = sbi->sb->s_bdev; - struct bio *bio; - - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { - ctx = fscrypt_get_ctx(inode, GFP_NOFS); - if (IS_ERR(ctx)) - return ERR_CAST(ctx); - - /* wait the page to be moved by cleaning */ - f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); - } - - bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); - if (!bio) { - if (ctx) - fscrypt_release_ctx(ctx); - return ERR_PTR(-ENOMEM); - } - bio->bi_bdev = bdev; - bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blkaddr); - bio->bi_end_io = f2fs_read_end_io; - bio->bi_private = ctx; - - return bio; -} - /* * This function was originally taken from fs/mpage.c, and customized for f2fs. * Major change was from block_size == page_size in f2fs by default. @@ -1037,9 +1213,10 @@ static int f2fs_mpage_readpages(struct address_space *mapping, for (page_idx = 0; nr_pages; page_idx++, nr_pages--) { - prefetchw(&page->flags); if (pages) { - page = list_entry(pages->prev, struct page, lru); + page = list_last_entry(pages, struct page, lru); + + prefetchw(&page->flags); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, page->index, @@ -1073,7 +1250,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping, map.m_len = last_block - block_in_file; if (f2fs_map_blocks(inode, &map, 0, - F2FS_GET_BLOCK_READ)) + F2FS_GET_BLOCK_DEFAULT)) goto set_error_page; } got_it: @@ -1097,18 +1274,18 @@ got_it: * This page will go to BIO. Do we need to send this * BIO off first? */ - if (bio && (last_block_in_bio != block_nr - 1)) { + if (bio && (last_block_in_bio != block_nr - 1 || + !__same_bdev(F2FS_I_SB(inode), block_nr, bio))) { submit_and_realloc: __submit_bio(F2FS_I_SB(inode), bio, DATA); bio = NULL; } if (bio == NULL) { - bio = f2fs_grab_bio(inode, block_nr, nr_pages); + bio = f2fs_grab_read_bio(inode, block_nr, nr_pages); if (IS_ERR(bio)) { bio = NULL; goto set_error_page; } - bio_set_op_attrs(bio, REQ_OP_READ, 0); } if (bio_add_page(bio, page, blocksize, 0) < blocksize) @@ -1157,7 +1334,7 @@ static int f2fs_read_data_pages(struct file *file, struct list_head *pages, unsigned nr_pages) { struct inode *inode = file->f_mapping->host; - struct page *page = list_entry(pages->prev, struct page, lru); + struct page *page = list_last_entry(pages, struct page, lru); trace_f2fs_readpages(inode, page, nr_pages); @@ -1168,17 +1345,84 @@ static int f2fs_read_data_pages(struct file *file, return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages); } +static int encrypt_one_page(struct f2fs_io_info *fio) +{ + struct inode *inode = fio->page->mapping->host; + gfp_t gfp_flags = GFP_NOFS; + + if (!f2fs_encrypted_file(inode)) + return 0; + + /* wait for GCed encrypted page writeback */ + f2fs_wait_on_block_writeback(fio->sbi, fio->old_blkaddr); + +retry_encrypt: + fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, + PAGE_SIZE, 0, fio->page->index, gfp_flags); + if (!IS_ERR(fio->encrypted_page)) + return 0; + + /* flush pending IOs and wait for a while in the ENOMEM case */ + if (PTR_ERR(fio->encrypted_page) == -ENOMEM) { + f2fs_flush_merged_writes(fio->sbi); + congestion_wait(BLK_RW_ASYNC, HZ/50); + gfp_flags |= __GFP_NOFAIL; + goto retry_encrypt; + } + return PTR_ERR(fio->encrypted_page); +} + +static inline bool need_inplace_update(struct f2fs_io_info *fio) +{ + struct inode *inode = fio->page->mapping->host; + + if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode)) + return false; + if (is_cold_data(fio->page)) + return false; + if (IS_ATOMIC_WRITTEN_PAGE(fio->page)) + return false; + + return need_inplace_update_policy(inode, fio); +} + +static inline bool valid_ipu_blkaddr(struct f2fs_io_info *fio) +{ + if (fio->old_blkaddr == NEW_ADDR) + return false; + if (fio->old_blkaddr == NULL_ADDR) + return false; + return true; +} + int do_write_data_page(struct f2fs_io_info *fio) { struct page *page = fio->page; struct inode *inode = page->mapping->host; struct dnode_of_data dn; + struct extent_info ei = {0,0,0}; + bool ipu_force = false; int err = 0; set_new_dnode(&dn, inode, NULL, NULL, 0); + if (need_inplace_update(fio) && + f2fs_lookup_extent_cache(inode, page->index, &ei)) { + fio->old_blkaddr = ei.blk + page->index - ei.fofs; + + if (valid_ipu_blkaddr(fio)) { + ipu_force = true; + fio->need_lock = LOCK_DONE; + goto got_it; + } + } + + /* Deadlock due to between page->lock and f2fs_lock_op */ + if (fio->need_lock == LOCK_REQ && !f2fs_trylock_op(fio->sbi)) + return -EAGAIN; + err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); if (err) - return err; + goto out; fio->old_blkaddr = dn.data_blkaddr; @@ -1187,57 +1431,57 @@ int do_write_data_page(struct f2fs_io_info *fio) ClearPageUptodate(page); goto out_writepage; } - - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { - gfp_t gfp_flags = GFP_NOFS; - - /* wait for GCed encrypted page writeback */ - f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode), - fio->old_blkaddr); -retry_encrypt: - fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, - gfp_flags); - if (IS_ERR(fio->encrypted_page)) { - err = PTR_ERR(fio->encrypted_page); - if (err == -ENOMEM) { - /* flush pending ios and wait for a while */ - f2fs_flush_merged_bios(F2FS_I_SB(inode)); - congestion_wait(BLK_RW_ASYNC, HZ/50); - gfp_flags |= __GFP_NOFAIL; - err = 0; - goto retry_encrypt; - } - goto out_writepage; - } - } - - set_page_writeback(page); - +got_it: /* * If current allocation needs SSR, * it had better in-place writes for updated data. */ - if (unlikely(fio->old_blkaddr != NEW_ADDR && - !is_cold_data(page) && - !IS_ATOMIC_WRITTEN_PAGE(page) && - need_inplace_update(inode))) { - rewrite_data_page(fio); + if (ipu_force || (valid_ipu_blkaddr(fio) && need_inplace_update(fio))) { + err = encrypt_one_page(fio); + if (err) + goto out_writepage; + + set_page_writeback(page); + f2fs_put_dnode(&dn); + if (fio->need_lock == LOCK_REQ) + f2fs_unlock_op(fio->sbi); + err = rewrite_data_page(fio); + trace_f2fs_do_write_data_page(fio->page, IPU); set_inode_flag(inode, FI_UPDATE_WRITE); - trace_f2fs_do_write_data_page(page, IPU); - } else { - write_data_page(&dn, fio); - trace_f2fs_do_write_data_page(page, OPU); - set_inode_flag(inode, FI_APPEND_WRITE); - if (page->index == 0) - set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); + return err; } + + if (fio->need_lock == LOCK_RETRY) { + if (!f2fs_trylock_op(fio->sbi)) { + err = -EAGAIN; + goto out_writepage; + } + fio->need_lock = LOCK_REQ; + } + + err = encrypt_one_page(fio); + if (err) + goto out_writepage; + + set_page_writeback(page); + + /* LFS mode write path */ + write_data_page(&dn, fio); + trace_f2fs_do_write_data_page(page, OPU); + set_inode_flag(inode, FI_APPEND_WRITE); + if (page->index == 0) + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); out_writepage: f2fs_put_dnode(&dn); +out: + if (fio->need_lock == LOCK_REQ) + f2fs_unlock_op(fio->sbi); return err; } -static int f2fs_write_data_page(struct page *page, - struct writeback_control *wbc) +static int __write_data_page(struct page *page, bool *submitted, + struct writeback_control *wbc, + enum iostat_type io_type) { struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -1252,13 +1496,20 @@ static int f2fs_write_data_page(struct page *page, .sbi = sbi, .type = DATA, .op = REQ_OP_WRITE, - .op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0, + .op_flags = wbc_to_write_flags(wbc), + .old_blkaddr = NULL_ADDR, .page = page, .encrypted_page = NULL, + .submitted = false, + .need_lock = LOCK_RETRY, + .io_type = io_type, }; trace_f2fs_writepage(page, DATA); + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto redirty_out; + if (page->index < end_index) goto write; @@ -1272,8 +1523,6 @@ static int f2fs_write_data_page(struct page *page, zero_user_segment(page, offset, PAGE_SIZE); write: - if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) - goto redirty_out; if (f2fs_is_drop_cache(inode)) goto out; /* we should not write 0'th page having journal header */ @@ -1290,6 +1539,7 @@ write: /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { + fio.need_lock = LOCK_DONE; err = do_write_data_page(&fio); goto done; } @@ -1298,52 +1548,78 @@ write: need_balance_fs = true; else if (has_not_enough_free_secs(sbi, 0, 0)) goto redirty_out; + else + set_inode_flag(inode, FI_HOT_DATA); err = -EAGAIN; - f2fs_lock_op(sbi); - if (f2fs_has_inline_data(inode)) + if (f2fs_has_inline_data(inode)) { err = f2fs_write_inline_data(inode, page); - if (err == -EAGAIN) + if (!err) + goto out; + } + + if (err == -EAGAIN) { err = do_write_data_page(&fio); + if (err == -EAGAIN) { + fio.need_lock = LOCK_REQ; + err = do_write_data_page(&fio); + } + } if (F2FS_I(inode)->last_disk_size < psize) F2FS_I(inode)->last_disk_size = psize; - f2fs_unlock_op(sbi); + done: if (err && err != -ENOENT) goto redirty_out; - clear_cold_data(page); out: inode_dec_dirty_pages(inode); if (err) ClearPageUptodate(page); if (wbc->for_reclaim) { - f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, DATA, WRITE); + f2fs_submit_merged_write_cond(sbi, inode, 0, page->index, DATA); + clear_inode_flag(inode, FI_HOT_DATA); remove_dirty_inode(inode); + submitted = NULL; } unlock_page(page); - f2fs_balance_fs(sbi, need_balance_fs); + if (!S_ISDIR(inode->i_mode)) + f2fs_balance_fs(sbi, need_balance_fs); - if (unlikely(f2fs_cp_error(sbi))) - f2fs_submit_merged_bio(sbi, DATA, WRITE); + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_submit_merged_write(sbi, DATA); + submitted = NULL; + } + + if (submitted) + *submitted = fio.submitted; return 0; redirty_out: redirty_page_for_writepage(wbc, page); + if (!err) + return AOP_WRITEPAGE_ACTIVATE; unlock_page(page); return err; } +static int f2fs_write_data_page(struct page *page, + struct writeback_control *wbc) +{ + return __write_data_page(page, NULL, wbc, FS_DATA_IO); +} + /* * This function was copied from write_cche_pages from mm/page-writeback.c. * The major change is making write step of cold data page separately from * warm/hot data page. */ static int f2fs_write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc) + struct writeback_control *wbc, + enum iostat_type io_type) { int ret = 0; int done = 0; @@ -1353,13 +1629,19 @@ static int f2fs_write_cache_pages(struct address_space *mapping, pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; + pgoff_t last_idx = ULONG_MAX; int cycled; int range_whole = 0; int tag; - int nwritten = 0; pagevec_init(&pvec, 0); + if (get_dirty_pages(mapping->host) <= + SM_I(F2FS_M_SB(mapping))->min_hot_blocks) + set_inode_flag(mapping->host, FI_HOT_DATA); + else + clear_inode_flag(mapping->host, FI_HOT_DATA); + if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; @@ -1393,6 +1675,7 @@ retry: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + bool submitted = false; if (page->index > end) { done = 1; @@ -1400,7 +1683,7 @@ retry: } done_index = page->index; - +retry_write: lock_page(page); if (unlikely(page->mapping != mapping)) { @@ -1426,17 +1709,37 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - ret = mapping->a_ops->writepage(page, wbc); + ret = __write_data_page(page, &submitted, wbc, io_type); if (unlikely(ret)) { + /* + * keep nr_to_write, since vfs uses this to + * get # of written pages. + */ + if (ret == AOP_WRITEPAGE_ACTIVATE) { + unlock_page(page); + ret = 0; + continue; + } else if (ret == -EAGAIN) { + ret = 0; + if (wbc->sync_mode == WB_SYNC_ALL) { + cond_resched(); + congestion_wait(BLK_RW_ASYNC, + HZ/50); + goto retry_write; + } + continue; + } done_index = page->index + 1; done = 1; break; - } else { - nwritten++; + } else if (submitted) { + last_idx = page->index; } - if (--wbc->nr_to_write <= 0 && - wbc->sync_mode == WB_SYNC_NONE) { + /* give a priority to WB_SYNC threads */ + if ((atomic_read(&F2FS_M_SB(mapping)->wb_sync_req) || + --wbc->nr_to_write <= 0) && + wbc->sync_mode == WB_SYNC_NONE) { done = 1; break; } @@ -1454,15 +1757,16 @@ continue_unlock: if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; - if (nwritten) - f2fs_submit_merged_bio_cond(F2FS_M_SB(mapping), mapping->host, - NULL, 0, DATA, WRITE); + if (last_idx != ULONG_MAX) + f2fs_submit_merged_write_cond(F2FS_M_SB(mapping), mapping->host, + 0, last_idx, DATA); return ret; } -static int f2fs_write_data_pages(struct address_space *mapping, - struct writeback_control *wbc) +int __f2fs_write_data_pages(struct address_space *mapping, + struct writeback_control *wbc, + enum iostat_type io_type) { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -1477,6 +1781,10 @@ static int f2fs_write_data_pages(struct address_space *mapping, if (!get_dirty_pages(inode) && wbc->sync_mode == WB_SYNC_NONE) return 0; + /* during POR, we don't need to trigger writepage at all. */ + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto skip_write; + if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE && get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) && available_free_memory(sbi, DIRTY_DENTS)) @@ -1486,15 +1794,20 @@ static int f2fs_write_data_pages(struct address_space *mapping, if (is_inode_flag_set(inode, FI_DO_DEFRAG)) goto skip_write; - /* during POR, we don't need to trigger writepage at all. */ - if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) - goto skip_write; - trace_f2fs_writepages(mapping->host, wbc, DATA); + /* to avoid spliting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */ + if (wbc->sync_mode == WB_SYNC_ALL) + atomic_inc(&sbi->wb_sync_req); + else if (atomic_read(&sbi->wb_sync_req)) + goto skip_write; + blk_start_plug(&plug); - ret = f2fs_write_cache_pages(mapping, wbc); + ret = f2fs_write_cache_pages(mapping, wbc, io_type); blk_finish_plug(&plug); + + if (wbc->sync_mode == WB_SYNC_ALL) + atomic_dec(&sbi->wb_sync_req); /* * if some pages were truncated, we cannot guarantee its mapping->host * to detect pending bios. @@ -1509,14 +1822,26 @@ skip_write: return 0; } +static int f2fs_write_data_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + + return __f2fs_write_data_pages(mapping, wbc, + F2FS_I(inode)->cp_task == current ? + FS_CP_DATA_IO : FS_DATA_IO); +} + static void f2fs_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; loff_t i_size = i_size_read(inode); if (to > i_size) { + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_pagecache(inode, i_size); truncate_blocks(inode, i_size, true); + up_write(&F2FS_I(inode)->i_mmap_sem); } } @@ -1529,19 +1854,20 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, struct dnode_of_data dn; struct page *ipage; bool locked = false; - struct extent_info ei; + struct extent_info ei = {0,0,0}; int err = 0; /* * we already allocated all the blocks, so we don't need to get * the block addresses when there is no need to fill the page. */ - if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE) + if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE && + !is_inode_flag_set(inode, FI_NO_PREALLOC)) return 0; if (f2fs_has_inline_data(inode) || (pos & PAGE_MASK) >= i_size_read(inode)) { - f2fs_lock_op(sbi); + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); locked = true; } restart: @@ -1555,7 +1881,7 @@ restart: set_new_dnode(&dn, inode, ipage, ipage, 0); if (f2fs_has_inline_data(inode)) { - if (pos + len <= MAX_INLINE_DATA) { + if (pos + len <= MAX_INLINE_DATA(inode)) { read_inline_data(page, ipage); set_inode_flag(inode, FI_DATA_EXIST); if (inode->i_nlink) @@ -1577,7 +1903,8 @@ restart: err = get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err || dn.data_blkaddr == NULL_ADDR) { f2fs_put_dnode(&dn); - f2fs_lock_op(sbi); + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, + true); locked = true; goto restart; } @@ -1591,7 +1918,7 @@ out: f2fs_put_dnode(&dn); unlock_out: if (locked) - f2fs_unlock_op(sbi); + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); return err; } @@ -1630,7 +1957,12 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, goto fail; } repeat: - page = grab_cache_page_write_begin(mapping, index, flags); + /* + * Do not use grab_cache_page_write_begin() to avoid deadlock due to + * wait_for_stable_page. Will wait that below with our IO control. + */ + page = pagecache_get_page(mapping, index, + FGP_LOCK | FGP_WRITE | FGP_CREAT, GFP_NOFS); if (!page) { err = -ENOMEM; goto fail; @@ -1657,31 +1989,24 @@ repeat: f2fs_wait_on_page_writeback(page, DATA, false); /* wait for GCed encrypted page writeback */ - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); + if (f2fs_encrypted_file(inode)) + f2fs_wait_on_block_writeback(sbi, blkaddr); if (len == PAGE_SIZE || PageUptodate(page)) return 0; + if (!(pos & (PAGE_SIZE - 1)) && (pos + len) >= i_size_read(inode)) { + zero_user_segment(page, len, PAGE_SIZE); + return 0; + } + if (blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_SIZE); SetPageUptodate(page); } else { - struct bio *bio; - - bio = f2fs_grab_bio(inode, blkaddr, 1); - if (IS_ERR(bio)) { - err = PTR_ERR(bio); + err = f2fs_submit_page_read(inode, page, blkaddr); + if (err) goto fail; - } - bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC); - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - bio_put(bio); - err = -EFAULT; - goto fail; - } - - __submit_bio(sbi, bio, DATA); lock_page(page); if (unlikely(page->mapping != mapping)) { @@ -1717,7 +2042,7 @@ static int f2fs_write_end(struct file *file, * let generic_perform_write() try to copy data again through copied=0. */ if (!PageUptodate(page)) { - if (unlikely(copied != PAGE_SIZE)) + if (unlikely(copied != len)) copied = 0; else SetPageUptodate(page); @@ -1726,7 +2051,6 @@ static int f2fs_write_end(struct file *file, goto unlock_out; set_page_dirty(page); - clear_cold_data(page); if (pos + copied > i_size_read(inode)) f2fs_i_size_write(inode, pos + copied); @@ -1763,9 +2087,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (err) return err; - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - return 0; - if (test_opt(F2FS_I_SB(inode), LFS)) + if (__force_buffered_io(inode, rw)) return 0; trace_f2fs_direct_IO_enter(inode, offset, count, rw); @@ -1798,10 +2120,13 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) up_read(&F2FS_I(inode)->dio_rwsem[rw]); if (rw == WRITE) { - if (err > 0) + if (err > 0) { + f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO, + err); set_inode_flag(inode, FI_UPDATE_WRITE); - else if (err < 0) + } else if (err < 0) { f2fs_write_failed(mapping, offset + count); + } } if (trace_android_fs_dataread_start_enabled() && @@ -1827,17 +2152,19 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, return; if (PageDirty(page)) { - if (inode->i_ino == F2FS_META_INO(sbi)) + if (inode->i_ino == F2FS_META_INO(sbi)) { dec_page_count(sbi, F2FS_DIRTY_META); - else if (inode->i_ino == F2FS_NODE_INO(sbi)) + } else if (inode->i_ino == F2FS_NODE_INO(sbi)) { dec_page_count(sbi, F2FS_DIRTY_NODES); - else + } else { inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); + } } /* This is atomic written page, keep Private */ if (IS_ATOMIC_WRITTEN_PAGE(page)) - return; + return drop_inmem_page(inode, page); set_page_private(page, 0); ClearPagePrivate(page); @@ -1897,7 +2224,7 @@ static int f2fs_set_data_page_dirty(struct page *page) if (!PageUptodate(page)) SetPageUptodate(page); - if (f2fs_is_atomic_file(inode)) { + if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) { if (!IS_ATOMIC_WRITTEN_PAGE(page)) { register_inmem_page(inode, page); return 1; @@ -1944,8 +2271,12 @@ int f2fs_migrate_page(struct address_space *mapping, BUG_ON(PageWriteback(page)); /* migrating an atomic written page is safe with the inmem_lock hold */ - if (atomic_written && !mutex_trylock(&fi->inmem_lock)) - return -EAGAIN; + if (atomic_written) { + if (mode != MIGRATE_SYNC) + return -EBUSY; + if (!mutex_trylock(&fi->inmem_lock)) + return -EAGAIN; + } /* * A reference is expected if PagePrivate set when move mapping, diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 687998e9557c..87f449845f5f 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -50,7 +50,27 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); - si->wb_bios = atomic_read(&sbi->nr_wb_bios); + si->aw_cnt = atomic_read(&sbi->aw_cnt); + si->vw_cnt = atomic_read(&sbi->vw_cnt); + si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); + si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt); + si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); + si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA); + if (SM_I(sbi) && SM_I(sbi)->fcc_info) { + si->nr_flushed = + atomic_read(&SM_I(sbi)->fcc_info->issued_flush); + si->nr_flushing = + atomic_read(&SM_I(sbi)->fcc_info->issing_flush); + } + if (SM_I(sbi) && SM_I(sbi)->dcc_info) { + si->nr_discarded = + atomic_read(&SM_I(sbi)->dcc_info->issued_discard); + si->nr_discarding = + atomic_read(&SM_I(sbi)->dcc_info->issing_discard); + si->nr_discard_cmd = + atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt); + si->undiscard_blks = SM_I(sbi)->dcc_info->undiscard_blks; + } si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); @@ -61,6 +81,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->inline_xattr = atomic_read(&sbi->inline_xattr); si->inline_inode = atomic_read(&sbi->inline_inode); si->inline_dir = atomic_read(&sbi->inline_dir); + si->append = sbi->im[APPEND_INO].ino_num; + si->update = sbi->im[UPDATE_INO].ino_num; si->orphans = sbi->im[ORPHAN_INO].ino_num; si->utilization = utilization(sbi); @@ -74,7 +96,9 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->dirty_nats = NM_I(sbi)->dirty_nat_cnt; si->sits = MAIN_SEGS(sbi); si->dirty_sits = SIT_I(sbi)->dirty_sentries; - si->fnids = NM_I(sbi)->fcnt; + si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID_LIST]; + si->avail_nids = NM_I(sbi)->available_nids; + si->alloc_nids = NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]; si->bg_gc = sbi->bg_gc; si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) @@ -87,8 +111,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_NODE; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); si->curseg[i] = curseg->segno; - si->cursec[i] = curseg->segno / sbi->segs_per_sec; - si->curzone[i] = si->cursec[i] / sbi->secs_per_zone; + si->cursec[i] = GET_SEC_FROM_SEG(sbi, curseg->segno); + si->curzone[i] = GET_ZONE_FROM_SEC(sbi, si->cursec[i]); } for (i = 0; i < 2; i++) { @@ -112,10 +136,10 @@ static void update_sit_info(struct f2fs_sb_info *sbi) bimodal = 0; total_vblocks = 0; - blks_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg; + blks_per_sec = BLKS_PER_SEC(sbi); hblks_per_sec = blks_per_sec / 2; for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { - vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); + vblocks = get_valid_blocks(sbi, segno, true); dist = abs(vblocks - hblks_per_sec); bimodal += dist * dist; @@ -144,7 +168,11 @@ static void update_mem_info(struct f2fs_sb_info *sbi) if (si->base_mem) goto get_cache; - si->base_mem = sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize; + /* build stat */ + si->base_mem = sizeof(struct f2fs_stat_info); + + /* build superblock */ + si->base_mem += sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize; si->base_mem += 2 * sizeof(struct f2fs_inode_info); si->base_mem += sizeof(*sbi->ckpt); si->base_mem += sizeof(struct percpu_counter) * NR_COUNT_TYPE; @@ -181,6 +209,10 @@ static void update_mem_info(struct f2fs_sb_info *sbi) /* build nm */ si->base_mem += sizeof(struct f2fs_nm_info); si->base_mem += __bitmap_size(sbi, NAT_BITMAP); + si->base_mem += (NM_I(sbi)->nat_bits_blocks << F2FS_BLKSIZE_BITS); + si->base_mem += NM_I(sbi)->nat_blocks * NAT_ENTRY_BITMAP_SIZE; + si->base_mem += NM_I(sbi)->nat_blocks / 8; + si->base_mem += NM_I(sbi)->nat_blocks * sizeof(unsigned short); get_cache: si->cache_mem = 0; @@ -190,11 +222,18 @@ get_cache: si->cache_mem += sizeof(struct f2fs_gc_kthread); /* build merge flush thread */ - if (SM_I(sbi)->cmd_control_info) + if (SM_I(sbi)->fcc_info) si->cache_mem += sizeof(struct flush_cmd_control); + if (SM_I(sbi)->dcc_info) { + si->cache_mem += sizeof(struct discard_cmd_control); + si->cache_mem += sizeof(struct discard_cmd) * + atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt); + } /* free nids */ - si->cache_mem += NM_I(sbi)->fcnt * sizeof(struct free_nid); + si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID_LIST] + + NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]) * + sizeof(struct free_nid); si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry); si->cache_mem += NM_I(sbi)->dirty_nat_cnt * sizeof(struct nat_entry_set); @@ -250,8 +289,8 @@ static int stat_show(struct seq_file *s, void *v) si->inline_inode); seq_printf(s, " - Inline_dentry Inode: %u\n", si->inline_dir); - seq_printf(s, " - Orphan Inode: %u\n", - si->orphans); + seq_printf(s, " - Orphan/Append/Update Inode: %u, %u, %u\n", + si->orphans, si->append, si->update); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", si->main_area_segs, si->main_area_sections, si->main_area_zones); @@ -310,8 +349,16 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - inmem: %4d, wb_bios: %4d\n", - si->inmem_pages, si->wb_bios); + seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: (%4d %4d), " + "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n", + si->nr_wb_cp_data, si->nr_wb_data, + si->nr_flushing, si->nr_flushed, + si->nr_discarding, si->nr_discarded, + si->nr_discard_cmd, si->undiscard_blks); + seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d), " + "volatile IO: %4d (Max. %4d)\n", + si->inmem_pages, si->aw_cnt, si->max_aw_cnt, + si->vw_cnt, si->max_vw_cnt); seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", @@ -324,8 +371,8 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_imeta); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); - seq_printf(s, " - free_nids: %9d\n", - si->fnids); + seq_printf(s, " - free_nids: %9d/%9d\n - alloc_nids: %9d\n", + si->free_nids, si->avail_nids, si->alloc_nids); seq_puts(s, "\nDistribution of User Blocks:"); seq_puts(s, " [ valid | invalid | free ]\n"); seq_puts(s, " ["); @@ -410,6 +457,11 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->inline_dir, 0); atomic_set(&sbi->inplace_count, 0); + atomic_set(&sbi->aw_cnt, 0); + atomic_set(&sbi->vw_cnt, 0); + atomic_set(&sbi->max_aw_cnt, 0); + atomic_set(&sbi->max_vw_cnt, 0); + mutex_lock(&f2fs_stat_mutex); list_add_tail(&si->stat_list, &f2fs_stat_list); mutex_unlock(&f2fs_stat_mutex); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 11f3717ce481..c0c933ad43c8 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -94,7 +94,7 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, dentry_blk = (struct f2fs_dentry_block *)kmap(dentry_page); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(NULL, &d, dentry_blk); de = find_target_dentry(fname, namehash, max_slots, &d); if (de) *res_page = dentry_page; @@ -111,8 +111,6 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, struct f2fs_dir_entry *de; unsigned long bit_pos = 0; int max_len = 0; - struct fscrypt_str de_name = FSTR_INIT(NULL, 0); - struct fscrypt_str *name = &fname->disk_name; if (max_slots) *max_slots = 0; @@ -130,29 +128,11 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, continue; } - if (de->hash_code != namehash) - goto not_match; - - de_name.name = d->filename[bit_pos]; - de_name.len = le16_to_cpu(de->name_len); - -#ifdef CONFIG_F2FS_FS_ENCRYPTION - if (unlikely(!name->name)) { - if (fname->usr_fname->name[0] == '_') { - if (de_name.len > 32 && - !memcmp(de_name.name + ((de_name.len - 17) & ~15), - fname->crypto_buf.name + 8, 16)) - goto found; - goto not_match; - } - name->name = fname->crypto_buf.name; - name->len = fname->crypto_buf.len; - } -#endif - if (de_name.len == name->len && - !memcmp(de_name.name, name->name, name->len)) + if (de->hash_code == namehash && + fscrypt_match_name(fname, d->filename[bit_pos], + le16_to_cpu(de->name_len))) goto found; -not_match: + if (max_slots && max_len > *max_slots) *max_slots = max_len; max_len = 0; @@ -212,13 +192,9 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, f2fs_put_page(dentry_page, 0); } - /* This is to increase the speed of f2fs_create */ - if (!de && room) { - F2FS_I(dir)->task = current; - if (F2FS_I(dir)->chash != namehash) { - F2FS_I(dir)->chash = namehash; - F2FS_I(dir)->clevel = level; - } + if (!de && room && F2FS_I(dir)->chash != namehash) { + F2FS_I(dir)->chash = namehash; + F2FS_I(dir)->clevel = level; } return de; @@ -259,6 +235,9 @@ struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, break; } out: + /* This is to increase the speed of f2fs_create */ + if (!de) + F2FS_I(dir)->task = current; return de; } @@ -277,7 +256,10 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, err = fscrypt_setup_filename(dir, child, 1, &fname); if (err) { - *res_page = ERR_PTR(err); + if (err == -ENOENT) + *res_page = NULL; + else + *res_page = ERR_PTR(err); return NULL; } @@ -322,7 +304,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, set_page_dirty(page); dir->i_mtime = dir->i_ctime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); f2fs_put_page(page, 1); } @@ -339,24 +321,6 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage) set_page_dirty(ipage); } -int update_dent_inode(struct inode *inode, struct inode *to, - const struct qstr *name) -{ - struct page *page; - - if (file_enc_name(to)) - return 0; - - page = get_node_page(F2FS_I_SB(inode), inode->i_ino); - if (IS_ERR(page)) - return PTR_ERR(page); - - init_dent_inode(name, page); - f2fs_put_page(page, 1); - - return 0; -} - void do_make_empty_dir(struct inode *inode, struct inode *parent, struct f2fs_dentry_ptr *d) { @@ -386,7 +350,7 @@ static int make_empty_dir(struct inode *inode, dentry_blk = kmap_atomic(dentry_page); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(NULL, &d, dentry_blk); do_make_empty_dir(inode, parent, &d); kunmap_atomic(dentry_blk); @@ -440,15 +404,19 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, set_cold_node(inode, page); } - if (new_name) + if (new_name) { init_dent_inode(new_name, page); + if (f2fs_encrypted_inode(dir)) + file_set_enc_name(inode); + } /* * This file should be checkpointed during fsync. * We lost i_pino from now on. */ if (is_inode_flag_set(inode, FI_INC_LINK)) { - file_lost_pino(inode); + if (!S_ISDIR(inode->i_mode)) + file_lost_pino(inode); /* * If link the tmpfile to alias through linkat path, * we should remove this inode from orphan list. @@ -475,7 +443,7 @@ void update_parent_metadata(struct inode *dir, struct inode *inode, clear_inode_flag(inode, FI_NEW_INODE); } dir->i_mtime = dir->i_ctime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); if (F2FS_I(dir)->i_current_depth != current_depth) f2fs_i_depth_write(dir, current_depth); @@ -554,8 +522,10 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, start: #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) + if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) { + f2fs_show_injection_info(FAULT_DIR_DEPTH); return -ENOSPC; + } #endif if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) return -ENOSPC; @@ -599,11 +569,9 @@ add_dentry: err = PTR_ERR(page); goto fail; } - if (f2fs_encrypted_inode(dir)) - file_set_enc_name(inode); } - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(NULL, &d, dentry_blk); f2fs_update_dentry(ino, mode, &d, new_name, dentry_hash, bit_pos); set_page_dirty(dentry_page); @@ -737,6 +705,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, struct f2fs_dentry_block *dentry_blk; unsigned int bit_pos; int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); + struct address_space *mapping = page_mapping(page); + unsigned long flags; int i; f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); @@ -750,7 +720,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, dentry_blk = page_address(page); bit_pos = dentry - dentry_blk->dentry; for (i = 0; i < slots; i++) - clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); + __clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); /* Let's check and deallocate this dentry page */ bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, @@ -760,17 +730,23 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, set_page_dirty(page); dir->i_ctime = dir->i_mtime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); if (inode) f2fs_drop_nlink(dir, inode); if (bit_pos == NR_DENTRY_IN_BLOCK && !truncate_hole(dir, page->index, page->index + 1)) { + spin_lock_irqsave(&mapping->tree_lock, flags); + radix_tree_tag_clear(&mapping->page_tree, page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + clear_page_dirty_for_io(page); ClearPagePrivate(page); ClearPageUptodate(page); inode_dec_dirty_pages(dir); + remove_dirty_inode(dir); } f2fs_put_page(page, 1); } @@ -813,7 +789,7 @@ bool f2fs_empty_dir(struct inode *dir) return true; } -bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, +int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, unsigned int start_pos, struct fscrypt_str *fstr) { unsigned char d_type = DT_UNKNOWN; @@ -848,7 +824,7 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, (u32)de->hash_code, 0, &de_name, fstr); if (err) - return true; + return err; de_name = *fstr; fstr->len = save_len; @@ -856,12 +832,12 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, if (!dir_emit(ctx, de_name.name, de_name.len, le32_to_cpu(de->ino), d_type)) - return true; + return 1; bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); ctx->pos = start_pos + bit_pos; } - return false; + return 0; } static int f2fs_readdir(struct file *file, struct dir_context *ctx) @@ -900,17 +876,21 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) dentry_page = get_lock_data_page(inode, n, false); if (IS_ERR(dentry_page)) { err = PTR_ERR(dentry_page); - if (err == -ENOENT) + if (err == -ENOENT) { + err = 0; continue; - else + } else { goto out; + } } dentry_blk = kmap(dentry_page); - make_dentry_ptr(inode, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(inode, &d, dentry_blk); - if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr)) { + err = f2fs_fill_dentries(ctx, &d, + n * NR_DENTRY_IN_BLOCK, &fstr); + if (err) { kunmap(dentry_page); f2fs_put_page(dentry_page, 1); break; @@ -920,10 +900,9 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) kunmap(dentry_page); f2fs_put_page(dentry_page, 1); } - err = 0; out: fscrypt_fname_free_buffer(&fstr); - return err; + return err < 0 ? err : 0; } static int f2fs_dir_open(struct inode *inode, struct file *filp) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 7b32ce979fe1..ff2352a0ed15 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -18,6 +18,179 @@ #include "node.h" #include +static struct rb_entry *__lookup_rb_tree_fast(struct rb_entry *cached_re, + unsigned int ofs) +{ + if (cached_re) { + if (cached_re->ofs <= ofs && + cached_re->ofs + cached_re->len > ofs) { + return cached_re; + } + } + return NULL; +} + +static struct rb_entry *__lookup_rb_tree_slow(struct rb_root *root, + unsigned int ofs) +{ + struct rb_node *node = root->rb_node; + struct rb_entry *re; + + while (node) { + re = rb_entry(node, struct rb_entry, rb_node); + + if (ofs < re->ofs) + node = node->rb_left; + else if (ofs >= re->ofs + re->len) + node = node->rb_right; + else + return re; + } + return NULL; +} + +struct rb_entry *__lookup_rb_tree(struct rb_root *root, + struct rb_entry *cached_re, unsigned int ofs) +{ + struct rb_entry *re; + + re = __lookup_rb_tree_fast(cached_re, ofs); + if (!re) + return __lookup_rb_tree_slow(root, ofs); + + return re; +} + +struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, + struct rb_root *root, struct rb_node **parent, + unsigned int ofs) +{ + struct rb_node **p = &root->rb_node; + struct rb_entry *re; + + while (*p) { + *parent = *p; + re = rb_entry(*parent, struct rb_entry, rb_node); + + if (ofs < re->ofs) + p = &(*p)->rb_left; + else if (ofs >= re->ofs + re->len) + p = &(*p)->rb_right; + else + f2fs_bug_on(sbi, 1); + } + + return p; +} + +/* + * lookup rb entry in position of @ofs in rb-tree, + * if hit, return the entry, otherwise, return NULL + * @prev_ex: extent before ofs + * @next_ex: extent after ofs + * @insert_p: insert point for new extent at ofs + * in order to simpfy the insertion after. + * tree must stay unchanged between lookup and insertion. + */ +struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root, + struct rb_entry *cached_re, + unsigned int ofs, + struct rb_entry **prev_entry, + struct rb_entry **next_entry, + struct rb_node ***insert_p, + struct rb_node **insert_parent, + bool force) +{ + struct rb_node **pnode = &root->rb_node; + struct rb_node *parent = NULL, *tmp_node; + struct rb_entry *re = cached_re; + + *insert_p = NULL; + *insert_parent = NULL; + *prev_entry = NULL; + *next_entry = NULL; + + if (RB_EMPTY_ROOT(root)) + return NULL; + + if (re) { + if (re->ofs <= ofs && re->ofs + re->len > ofs) + goto lookup_neighbors; + } + + while (*pnode) { + parent = *pnode; + re = rb_entry(*pnode, struct rb_entry, rb_node); + + if (ofs < re->ofs) + pnode = &(*pnode)->rb_left; + else if (ofs >= re->ofs + re->len) + pnode = &(*pnode)->rb_right; + else + goto lookup_neighbors; + } + + *insert_p = pnode; + *insert_parent = parent; + + re = rb_entry(parent, struct rb_entry, rb_node); + tmp_node = parent; + if (parent && ofs > re->ofs) + tmp_node = rb_next(parent); + *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + + tmp_node = parent; + if (parent && ofs < re->ofs) + tmp_node = rb_prev(parent); + *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + return NULL; + +lookup_neighbors: + if (ofs == re->ofs || force) { + /* lookup prev node for merging backward later */ + tmp_node = rb_prev(&re->rb_node); + *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + } + if (ofs == re->ofs + re->len - 1 || force) { + /* lookup next node for merging frontward later */ + tmp_node = rb_next(&re->rb_node); + *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + } + return re; +} + +bool __check_rb_tree_consistence(struct f2fs_sb_info *sbi, + struct rb_root *root) +{ +#ifdef CONFIG_F2FS_CHECK_FS + struct rb_node *cur = rb_first(root), *next; + struct rb_entry *cur_re, *next_re; + + if (!cur) + return true; + + while (cur) { + next = rb_next(cur); + if (!next) + return true; + + cur_re = rb_entry(cur, struct rb_entry, rb_node); + next_re = rb_entry(next, struct rb_entry, rb_node); + + if (cur_re->ofs + cur_re->len > next_re->ofs) { + f2fs_msg(sbi->sb, KERN_INFO, "inconsistent rbtree, " + "cur(%u, %u) next(%u, %u)", + cur_re->ofs, cur_re->len, + next_re->ofs, next_re->len); + return false; + } + + cur = next; + } +#endif + return true; +} + static struct kmem_cache *extent_tree_slab; static struct kmem_cache *extent_node_slab; @@ -77,7 +250,7 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) struct extent_tree *et; nid_t ino = inode->i_ino; - down_write(&sbi->extent_tree_lock); + mutex_lock(&sbi->extent_tree_lock); et = radix_tree_lookup(&sbi->extent_tree_root, ino); if (!et) { et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS); @@ -94,7 +267,7 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) atomic_dec(&sbi->total_zombie_tree); list_del_init(&et->list); } - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); /* never died until evict_inode */ F2FS_I(inode)->extent_tree = et; @@ -102,36 +275,6 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) return et; } -static struct extent_node *__lookup_extent_tree(struct f2fs_sb_info *sbi, - struct extent_tree *et, unsigned int fofs) -{ - struct rb_node *node = et->root.rb_node; - struct extent_node *en = et->cached_en; - - if (en) { - struct extent_info *cei = &en->ei; - - if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) { - stat_inc_cached_node_hit(sbi); - return en; - } - } - - while (node) { - en = rb_entry(node, struct extent_node, rb_node); - - if (fofs < en->ei.fofs) { - node = node->rb_left; - } else if (fofs >= en->ei.fofs + en->ei.len) { - node = node->rb_right; - } else { - stat_inc_rbtree_node_hit(sbi); - return en; - } - } - return NULL; -} - static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_info *ei) { @@ -172,12 +315,12 @@ static void __drop_largest_extent(struct inode *inode, if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs) { largest->len = 0; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } } /* return true, if inode page is changed */ -bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) +static bool __f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_tree *et; @@ -215,6 +358,16 @@ out: return false; } +bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) +{ + bool ret = __f2fs_init_extent_tree(inode, i_ext); + + if (!F2FS_I(inode)->extent_tree) + set_inode_flag(inode, FI_NO_EXTENT); + + return ret; +} + static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, struct extent_info *ei) { @@ -237,17 +390,24 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, goto out; } - en = __lookup_extent_tree(sbi, et, pgofs); - if (en) { - *ei = en->ei; - spin_lock(&sbi->extent_lock); - if (!list_empty(&en->list)) { - list_move_tail(&en->list, &sbi->extent_list); - et->cached_en = en; - } - spin_unlock(&sbi->extent_lock); - ret = true; + en = (struct extent_node *)__lookup_rb_tree(&et->root, + (struct rb_entry *)et->cached_en, pgofs); + if (!en) + goto out; + + if (en == et->cached_en) + stat_inc_cached_node_hit(sbi); + else + stat_inc_rbtree_node_hit(sbi); + + *ei = en->ei; + spin_lock(&sbi->extent_lock); + if (!list_empty(&en->list)) { + list_move_tail(&en->list, &sbi->extent_list); + et->cached_en = en; } + spin_unlock(&sbi->extent_lock); + ret = true; out: stat_inc_total_hit(sbi); read_unlock(&et->lock); @@ -256,87 +416,6 @@ out: return ret; } - -/* - * lookup extent at @fofs, if hit, return the extent - * if not, return NULL and - * @prev_ex: extent before fofs - * @next_ex: extent after fofs - * @insert_p: insert point for new extent at fofs - * in order to simpfy the insertion after. - * tree must stay unchanged between lookup and insertion. - */ -static struct extent_node *__lookup_extent_tree_ret(struct extent_tree *et, - unsigned int fofs, - struct extent_node **prev_ex, - struct extent_node **next_ex, - struct rb_node ***insert_p, - struct rb_node **insert_parent) -{ - struct rb_node **pnode = &et->root.rb_node; - struct rb_node *parent = NULL, *tmp_node; - struct extent_node *en = et->cached_en; - - *insert_p = NULL; - *insert_parent = NULL; - *prev_ex = NULL; - *next_ex = NULL; - - if (RB_EMPTY_ROOT(&et->root)) - return NULL; - - if (en) { - struct extent_info *cei = &en->ei; - - if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) - goto lookup_neighbors; - } - - while (*pnode) { - parent = *pnode; - en = rb_entry(*pnode, struct extent_node, rb_node); - - if (fofs < en->ei.fofs) - pnode = &(*pnode)->rb_left; - else if (fofs >= en->ei.fofs + en->ei.len) - pnode = &(*pnode)->rb_right; - else - goto lookup_neighbors; - } - - *insert_p = pnode; - *insert_parent = parent; - - en = rb_entry(parent, struct extent_node, rb_node); - tmp_node = parent; - if (parent && fofs > en->ei.fofs) - tmp_node = rb_next(parent); - *next_ex = tmp_node ? - rb_entry(tmp_node, struct extent_node, rb_node) : NULL; - - tmp_node = parent; - if (parent && fofs < en->ei.fofs) - tmp_node = rb_prev(parent); - *prev_ex = tmp_node ? - rb_entry(tmp_node, struct extent_node, rb_node) : NULL; - return NULL; - -lookup_neighbors: - if (fofs == en->ei.fofs) { - /* lookup prev node for merging backward later */ - tmp_node = rb_prev(&en->rb_node); - *prev_ex = tmp_node ? - rb_entry(tmp_node, struct extent_node, rb_node) : NULL; - } - if (fofs == en->ei.fofs + en->ei.len - 1) { - /* lookup next node for merging frontward later */ - tmp_node = rb_next(&en->rb_node); - *next_ex = tmp_node ? - rb_entry(tmp_node, struct extent_node, rb_node) : NULL; - } - return en; -} - static struct extent_node *__try_merge_extent_node(struct inode *inode, struct extent_tree *et, struct extent_info *ei, struct extent_node *prev_ex, @@ -391,17 +470,7 @@ static struct extent_node *__insert_extent_tree(struct inode *inode, goto do_insert; } - while (*p) { - parent = *p; - en = rb_entry(parent, struct extent_node, rb_node); - - if (ei->fofs < en->ei.fofs) - p = &(*p)->rb_left; - else if (ei->fofs >= en->ei.fofs + en->ei.len) - p = &(*p)->rb_right; - else - f2fs_bug_on(sbi, 1); - } + p = __lookup_rb_tree_for_insert(sbi, &et->root, &parent, ei->fofs); do_insert: en = __attach_extent_node(sbi, et, ei, parent, p); if (!en) @@ -417,7 +486,7 @@ do_insert: return en; } -static unsigned int f2fs_update_extent_tree_range(struct inode *inode, +static void f2fs_update_extent_tree_range(struct inode *inode, pgoff_t fofs, block_t blkaddr, unsigned int len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -430,7 +499,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, unsigned int pos = (unsigned int)fofs; if (!et) - return false; + return; trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len); @@ -438,7 +507,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, if (is_inode_flag_set(inode, FI_NO_EXTENT)) { write_unlock(&et->lock); - return false; + return; } prev = et->largest; @@ -451,8 +520,11 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, __drop_largest_extent(inode, fofs, len); /* 1. lookup first extent node in range [fofs, fofs + len - 1] */ - en = __lookup_extent_tree_ret(et, fofs, &prev_en, &next_en, - &insert_p, &insert_parent); + en = (struct extent_node *)__lookup_rb_tree_ret(&et->root, + (struct rb_entry *)et->cached_en, fofs, + (struct rb_entry **)&prev_en, + (struct rb_entry **)&next_en, + &insert_p, &insert_parent, false); if (!en) en = next_en; @@ -493,9 +565,8 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, if (!next_en) { struct rb_node *node = rb_next(&en->rb_node); - next_en = node ? - rb_entry(node, struct extent_node, rb_node) - : NULL; + next_en = rb_entry_safe(node, struct extent_node, + rb_node); } if (parts) @@ -536,8 +607,6 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, __free_extent_tree(sbi, et); write_unlock(&et->lock); - - return !__is_extent_same(&prev, &et->largest); } unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) @@ -553,7 +622,7 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) if (!atomic_read(&sbi->total_zombie_tree)) goto free_node; - if (!down_write_trylock(&sbi->extent_tree_lock)) + if (!mutex_trylock(&sbi->extent_tree_lock)) goto out; /* 1. remove unreferenced extent tree */ @@ -575,11 +644,11 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) goto unlock_out; cond_resched(); } - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); free_node: /* 2. remove LRU extent entries */ - if (!down_write_trylock(&sbi->extent_tree_lock)) + if (!mutex_trylock(&sbi->extent_tree_lock)) goto out; remained = nr_shrink - (node_cnt + tree_cnt); @@ -609,7 +678,7 @@ free_node: spin_unlock(&sbi->extent_lock); unlock_out: - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); out: trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt); @@ -656,10 +725,10 @@ void f2fs_destroy_extent_tree(struct inode *inode) if (inode->i_nlink && !is_bad_inode(inode) && atomic_read(&et->node_cnt)) { - down_write(&sbi->extent_tree_lock); + mutex_lock(&sbi->extent_tree_lock); list_add_tail(&et->list, &sbi->zombie_list); atomic_inc(&sbi->total_zombie_tree); - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); return; } @@ -667,12 +736,12 @@ void f2fs_destroy_extent_tree(struct inode *inode) node_cnt = f2fs_destroy_extent_node(inode); /* delete extent tree entry in radix tree */ - down_write(&sbi->extent_tree_lock); + mutex_lock(&sbi->extent_tree_lock); f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); radix_tree_delete(&sbi->extent_tree_root, inode->i_ino); kmem_cache_free(extent_tree_slab, et); atomic_dec(&sbi->total_ext_tree); - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); F2FS_I(inode)->extent_tree = NULL; @@ -719,7 +788,7 @@ void f2fs_update_extent_cache_range(struct dnode_of_data *dn, void init_extent_cache_info(struct f2fs_sb_info *sbi) { INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO); - init_rwsem(&sbi->extent_tree_lock); + mutex_init(&sbi->extent_tree_lock); INIT_LIST_HEAD(&sbi->extent_list); spin_lock_init(&sbi->extent_lock); atomic_set(&sbi->total_ext_tree, 0); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 88e111ab068b..ad16c1dd416f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -19,10 +19,16 @@ #include #include #include +#include #include #include #include -#include +#include +#ifdef CONFIG_F2FS_FS_ENCRYPTION +#include +#else +#include +#endif #include #ifdef CONFIG_F2FS_CHECK_FS @@ -46,6 +52,7 @@ enum { FAULT_BLOCK, FAULT_DIR_DEPTH, FAULT_EVICT_INODE, + FAULT_TRUNCATE, FAULT_IO, FAULT_CHECKPOINT, FAULT_MAX, @@ -58,7 +65,7 @@ struct f2fs_fault_info { }; extern char *fault_name[FAULT_MAX]; -#define IS_FAULT_SET(fi, type) (fi->inject_type & (1 << (type))) +#define IS_FAULT_SET(fi, type) ((fi)->inject_type & (1 << (type))) #endif /* @@ -83,10 +90,14 @@ extern char *fault_name[FAULT_MAX]; #define F2FS_MOUNT_FAULT_INJECTION 0x00010000 #define F2FS_MOUNT_ADAPTIVE 0x00020000 #define F2FS_MOUNT_LFS 0x00040000 +#define F2FS_MOUNT_USRQUOTA 0x00080000 +#define F2FS_MOUNT_GRPQUOTA 0x00100000 +#define F2FS_MOUNT_PRJQUOTA 0x00200000 +#define F2FS_MOUNT_QUOTA 0x00400000 -#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) -#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) -#define test_opt(sbi, option) (sbi->mount_opt.opt & F2FS_MOUNT_##option) +#define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option) +#define set_opt(sbi, option) ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option) +#define test_opt(sbi, option) ((sbi)->mount_opt.opt & F2FS_MOUNT_##option) #define ver_after(a, b) (typecheck(unsigned long long, a) && \ typecheck(unsigned long long, b) && \ @@ -102,15 +113,19 @@ struct f2fs_mount_info { unsigned int opt; }; -#define F2FS_FEATURE_ENCRYPT 0x0001 -#define F2FS_FEATURE_HMSMR 0x0002 +#define F2FS_FEATURE_ENCRYPT 0x0001 +#define F2FS_FEATURE_BLKZONED 0x0002 +#define F2FS_FEATURE_ATOMIC_WRITE 0x0004 +#define F2FS_FEATURE_EXTRA_ATTR 0x0008 +#define F2FS_FEATURE_PRJQUOTA 0x0010 +#define F2FS_FEATURE_INODE_CHKSUM 0x0020 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) #define F2FS_SET_FEATURE(sb, mask) \ - F2FS_SB(sb)->raw_super->feature |= cpu_to_le32(mask) + (F2FS_SB(sb)->raw_super->feature |= cpu_to_le32(mask)) #define F2FS_CLEAR_FEATURE(sb, mask) \ - F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask) + (F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask)) /* * For checkpoint manager @@ -120,19 +135,22 @@ enum { SIT_BITMAP }; -enum { - CP_UMOUNT, - CP_FASTBOOT, - CP_SYNC, - CP_RECOVERY, - CP_DISCARD, -}; +#define CP_UMOUNT 0x00000001 +#define CP_FASTBOOT 0x00000002 +#define CP_SYNC 0x00000004 +#define CP_RECOVERY 0x00000008 +#define CP_DISCARD 0x00000010 +#define CP_TRIMMED 0x00000020 -#define DEF_BATCHED_TRIM_SECTIONS 2 +#define DEF_BATCHED_TRIM_SECTIONS 2048 #define BATCHED_TRIM_SEGMENTS(sbi) \ - (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec) + (GET_SEG_FROM_SEC(sbi, SM_I(sbi)->trim_sections)) #define BATCHED_TRIM_BLOCKS(sbi) \ (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) +#define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi) +#define DISCARD_ISSUE_RATE 8 +#define DEF_MIN_DISCARD_ISSUE_TIME 50 /* 50 ms, if exists */ +#define DEF_MAX_DISCARD_ISSUE_TIME 60000 /* 60 s, if no candidates */ #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ @@ -174,18 +192,73 @@ struct inode_entry { struct inode *inode; /* vfs inode pointer */ }; -/* for the list of blockaddresses to be discarded */ +/* for the bitmap indicate blocks to be discarded */ struct discard_entry { struct list_head list; /* list head */ - block_t blkaddr; /* block address to be discarded */ - int len; /* # of consecutive blocks of the discard */ + block_t start_blkaddr; /* start blockaddr of current segment */ + unsigned char discard_map[SIT_VBLOCK_MAP_SIZE]; /* segment discard bitmap */ }; -struct bio_entry { - struct list_head list; - struct bio *bio; - struct completion event; - int error; +/* default discard granularity of inner discard thread, unit: block count */ +#define DEFAULT_DISCARD_GRANULARITY 16 + +/* max discard pend list number */ +#define MAX_PLIST_NUM 512 +#define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? \ + (MAX_PLIST_NUM - 1) : (blk_num - 1)) + +#define P_ACTIVE 0x01 +#define P_TRIM 0x02 +#define plist_issue(tag) (((tag) & P_ACTIVE) || ((tag) & P_TRIM)) + +enum { + D_PREP, + D_SUBMIT, + D_DONE, +}; + +struct discard_info { + block_t lstart; /* logical start address */ + block_t len; /* length */ + block_t start; /* actual start address in dev */ +}; + +struct discard_cmd { + struct rb_node rb_node; /* rb node located in rb-tree */ + union { + struct { + block_t lstart; /* logical start address */ + block_t len; /* length */ + block_t start; /* actual start address in dev */ + }; + struct discard_info di; /* discard info */ + + }; + struct list_head list; /* command list */ + struct completion wait; /* compleation */ + struct block_device *bdev; /* bdev */ + unsigned short ref; /* reference count */ + unsigned char state; /* state */ + int error; /* bio error */ +}; + +struct discard_cmd_control { + struct task_struct *f2fs_issue_discard; /* discard thread */ + struct list_head entry_list; /* 4KB discard entry list */ + struct list_head pend_list[MAX_PLIST_NUM];/* store pending entries */ + unsigned char pend_list_tag[MAX_PLIST_NUM];/* tag for pending entries */ + struct list_head wait_list; /* store on-flushing entries */ + wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ + unsigned int discard_wake; /* to wake up discard thread */ + struct mutex cmd_lock; + unsigned int nr_discards; /* # of discards in the list */ + unsigned int max_discards; /* max. discards to be issued */ + unsigned int discard_granularity; /* discard granularity */ + unsigned int undiscard_blks; /* # of undiscard blocks */ + atomic_t issued_discard; /* # of issued discard */ + atomic_t issing_discard; /* # of issing discard */ + atomic_t discard_cmd_cnt; /* # of cached cmd count */ + struct rb_root root; /* root of discard rb-tree */ }; /* for the list of fsync inodes, used only during recovery */ @@ -196,13 +269,13 @@ struct fsync_inode_entry { block_t last_dentry; /* block address locating the last dentry */ }; -#define nats_in_cursum(jnl) (le16_to_cpu(jnl->n_nats)) -#define sits_in_cursum(jnl) (le16_to_cpu(jnl->n_sits)) +#define nats_in_cursum(jnl) (le16_to_cpu((jnl)->n_nats)) +#define sits_in_cursum(jnl) (le16_to_cpu((jnl)->n_sits)) -#define nat_in_journal(jnl, i) (jnl->nat_j.entries[i].ne) -#define nid_in_journal(jnl, i) (jnl->nat_j.entries[i].nid) -#define sit_in_journal(jnl, i) (jnl->sit_j.entries[i].se) -#define segno_in_journal(jnl, i) (jnl->sit_j.entries[i].segno) +#define nat_in_journal(jnl, i) ((jnl)->nat_j.entries[i].ne) +#define nid_in_journal(jnl, i) ((jnl)->nat_j.entries[i].nid) +#define sit_in_journal(jnl, i) ((jnl)->sit_j.entries[i].se) +#define segno_in_journal(jnl, i) ((jnl)->sit_j.entries[i].segno) #define MAX_NAT_JENTRIES(jnl) (NAT_JOURNAL_ENTRIES - nats_in_cursum(jnl)) #define MAX_SIT_JENTRIES(jnl) (SIT_JOURNAL_ENTRIES - sits_in_cursum(jnl)) @@ -210,6 +283,7 @@ struct fsync_inode_entry { static inline int update_nats_in_cursum(struct f2fs_journal *journal, int i) { int before = nats_in_cursum(journal); + journal->n_nats = cpu_to_le16(before + i); return before; } @@ -217,6 +291,7 @@ static inline int update_nats_in_cursum(struct f2fs_journal *journal, int i) static inline int update_sits_in_cursum(struct f2fs_journal *journal, int i) { int before = sits_in_cursum(journal); + journal->n_sits = cpu_to_le16(before + i); return before; } @@ -242,11 +317,17 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, #define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3) #define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4) #define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5) -#define F2FS_IOC_GARBAGE_COLLECT _IO(F2FS_IOCTL_MAGIC, 6) +#define F2FS_IOC_GARBAGE_COLLECT _IOW(F2FS_IOCTL_MAGIC, 6, __u32) #define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7) -#define F2FS_IOC_DEFRAGMENT _IO(F2FS_IOCTL_MAGIC, 8) +#define F2FS_IOC_DEFRAGMENT _IOWR(F2FS_IOCTL_MAGIC, 8, \ + struct f2fs_defragment) #define F2FS_IOC_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \ struct f2fs_move_range) +#define F2FS_IOC_FLUSH_DEVICE _IOW(F2FS_IOCTL_MAGIC, 10, \ + struct f2fs_flush_device) +#define F2FS_IOC_GARBAGE_COLLECT_RANGE _IOW(F2FS_IOCTL_MAGIC, 11, \ + struct f2fs_gc_range) +#define F2FS_IOC_GET_FEATURES _IOR(F2FS_IOCTL_MAGIC, 12, __u32) #define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY #define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY @@ -271,6 +352,12 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, #define F2FS_IOC32_GETVERSION FS_IOC32_GETVERSION #endif +struct f2fs_gc_range { + u32 sync; + u64 start; + u64 len; +}; + struct f2fs_defragment { u64 start; u64 len; @@ -283,36 +370,68 @@ struct f2fs_move_range { u64 len; /* size to move */ }; +struct f2fs_flush_device { + u32 dev_num; /* device number to flush */ + u32 segments; /* # of segments to flush */ +}; + +/* for inline stuff */ +#define DEF_INLINE_RESERVED_SIZE 1 +static inline int get_extra_isize(struct inode *inode); +#define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ + (CUR_ADDRS_PER_INODE(inode) - \ + DEF_INLINE_RESERVED_SIZE - \ + F2FS_INLINE_XATTR_ADDRS)) + +/* for inline dir */ +#define NR_INLINE_DENTRY(inode) (MAX_INLINE_DATA(inode) * BITS_PER_BYTE / \ + ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ + BITS_PER_BYTE + 1)) +#define INLINE_DENTRY_BITMAP_SIZE(inode) ((NR_INLINE_DENTRY(inode) + \ + BITS_PER_BYTE - 1) / BITS_PER_BYTE) +#define INLINE_RESERVED_SIZE(inode) (MAX_INLINE_DATA(inode) - \ + ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ + NR_INLINE_DENTRY(inode) + \ + INLINE_DENTRY_BITMAP_SIZE(inode))) + /* * For INODE and NODE manager */ /* for directory operations */ struct f2fs_dentry_ptr { struct inode *inode; - const void *bitmap; + void *bitmap; struct f2fs_dir_entry *dentry; __u8 (*filename)[F2FS_SLOT_LEN]; int max; + int nr_bitmap; }; -static inline void make_dentry_ptr(struct inode *inode, - struct f2fs_dentry_ptr *d, void *src, int type) +static inline void make_dentry_ptr_block(struct inode *inode, + struct f2fs_dentry_ptr *d, struct f2fs_dentry_block *t) { d->inode = inode; + d->max = NR_DENTRY_IN_BLOCK; + d->nr_bitmap = SIZE_OF_DENTRY_BITMAP; + d->bitmap = &t->dentry_bitmap; + d->dentry = t->dentry; + d->filename = t->filename; +} - if (type == 1) { - struct f2fs_dentry_block *t = (struct f2fs_dentry_block *)src; - d->max = NR_DENTRY_IN_BLOCK; - d->bitmap = &t->dentry_bitmap; - d->dentry = t->dentry; - d->filename = t->filename; - } else { - struct f2fs_inline_dentry *t = (struct f2fs_inline_dentry *)src; - d->max = NR_INLINE_DENTRY; - d->bitmap = &t->dentry_bitmap; - d->dentry = t->dentry; - d->filename = t->filename; - } +static inline void make_dentry_ptr_inline(struct inode *inode, + struct f2fs_dentry_ptr *d, void *t) +{ + int entry_cnt = NR_INLINE_DENTRY(inode); + int bitmap_size = INLINE_DENTRY_BITMAP_SIZE(inode); + int reserved_size = INLINE_RESERVED_SIZE(inode); + + d->inode = inode; + d->max = entry_cnt; + d->nr_bitmap = bitmap_size; + d->bitmap = t; + d->dentry = t + bitmap_size + reserved_size; + d->filename = t + bitmap_size + reserved_size + + SIZE_OF_DIR_ENTRY * entry_cnt; } /* @@ -344,16 +463,30 @@ enum { /* number of extent info in extent cache we try to shrink */ #define EXTENT_CACHE_SHRINK_NUMBER 128 +struct rb_entry { + struct rb_node rb_node; /* rb node located in rb-tree */ + unsigned int ofs; /* start offset of the entry */ + unsigned int len; /* length of the entry */ +}; + struct extent_info { unsigned int fofs; /* start offset in a file */ - u32 blk; /* start block address of the extent */ unsigned int len; /* length of the extent */ + u32 blk; /* start block address of the extent */ }; struct extent_node { - struct rb_node rb_node; /* rb node located in rb-tree */ + struct rb_node rb_node; + union { + struct { + unsigned int fofs; + unsigned int len; + u32 blk; + }; + struct extent_info ei; /* extent info */ + + }; struct list_head list; /* node in global extent list of sbi */ - struct extent_info ei; /* extent info */ struct extent_tree *et; /* extent tree pointer */ }; @@ -387,12 +520,13 @@ struct f2fs_map_blocks { }; /* for flag in get_data_block */ -#define F2FS_GET_BLOCK_READ 0 -#define F2FS_GET_BLOCK_DIO 1 -#define F2FS_GET_BLOCK_FIEMAP 2 -#define F2FS_GET_BLOCK_BMAP 3 -#define F2FS_GET_BLOCK_PRE_DIO 4 -#define F2FS_GET_BLOCK_PRE_AIO 5 +enum { + F2FS_GET_BLOCK_DEFAULT, + F2FS_GET_BLOCK_FIEMAP, + F2FS_GET_BLOCK_BMAP, + F2FS_GET_BLOCK_PRE_DIO, + F2FS_GET_BLOCK_PRE_AIO, +}; /* * i_advise uses FADVISE_XXX_BIT. We can add additional hints later. @@ -401,6 +535,7 @@ struct f2fs_map_blocks { #define FADVISE_LOST_PINO_BIT 0x02 #define FADVISE_ENCRYPT_BIT 0x04 #define FADVISE_ENC_NAME_BIT 0x08 +#define FADVISE_KEEP_SIZE_BIT 0x10 #define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) #define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) @@ -413,6 +548,8 @@ struct f2fs_map_blocks { #define file_clear_encrypt(inode) clear_file(inode, FADVISE_ENCRYPT_BIT) #define file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT) #define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT) +#define file_keep_isize(inode) is_file(inode, FADVISE_KEEP_SIZE_BIT) +#define file_set_keep_isize(inode) set_file(inode, FADVISE_KEEP_SIZE_BIT) #define DEF_DIR_LEVEL 0 @@ -432,16 +569,28 @@ struct f2fs_inode_info { f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ struct task_struct *task; /* lookup and create consistency */ + struct task_struct *cp_task; /* separate cp/wb IO stats*/ nid_t i_xattr_nid; /* node id that contains xattrs */ - unsigned long long xattr_ver; /* cp version of xattr modification */ loff_t last_disk_size; /* lastly written file size */ +#ifdef CONFIG_QUOTA + struct dquot *i_dquot[MAXQUOTAS]; + + /* quota space reservation, managed internally by quota code */ + qsize_t i_reserved_quota; +#endif struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ struct list_head inmem_pages; /* inmemory pages managed by f2fs */ + struct task_struct *inmem_task; /* store inmemory task */ struct mutex inmem_lock; /* lock for inmemory pages */ struct extent_tree *extent_tree; /* cached extent_tree entry */ struct rw_semaphore dio_rwsem[2];/* avoid racing between dio and gc */ + struct rw_semaphore i_mmap_sem; + struct rw_semaphore i_xattr_sem; /* avoid racing between reading and changing EAs */ + + int i_extra_isize; /* size of extra space located in i_addr */ + kprojid_t i_projid; /* id for project quota */ }; static inline void get_extent_info(struct extent_info *ext, @@ -468,11 +617,22 @@ static inline void set_extent_info(struct extent_info *ei, unsigned int fofs, ei->len = len; } -static inline bool __is_extent_same(struct extent_info *ei1, - struct extent_info *ei2) +static inline bool __is_discard_mergeable(struct discard_info *back, + struct discard_info *front) { - return (ei1->fofs == ei2->fofs && ei1->blk == ei2->blk && - ei1->len == ei2->len); + return back->lstart + back->len == front->lstart; +} + +static inline bool __is_discard_back_mergeable(struct discard_info *cur, + struct discard_info *back) +{ + return __is_discard_mergeable(back, cur); +} + +static inline bool __is_discard_front_mergeable(struct discard_info *cur, + struct discard_info *front) +{ + return __is_discard_mergeable(cur, front); } static inline bool __is_extent_mergeable(struct extent_info *back, @@ -494,20 +654,26 @@ static inline bool __is_front_mergeable(struct extent_info *cur, return __is_extent_mergeable(cur, front); } -extern void f2fs_mark_inode_dirty_sync(struct inode *); +extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync); static inline void __try_update_largest_extent(struct inode *inode, struct extent_tree *et, struct extent_node *en) { if (en->ei.len > et->largest.len) { et->largest = en->ei; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } } +enum nid_list { + FREE_NID_LIST, + ALLOC_NID_LIST, + MAX_NID_LIST, +}; + struct f2fs_nm_info { block_t nat_blkaddr; /* base disk address of NAT */ nid_t max_nid; /* maximum possible node ids */ - nid_t available_nids; /* maximum available node ids */ + nid_t available_nids; /* # of available node ids */ nid_t next_scan_nid; /* the next nid to be scanned */ unsigned int ram_thresh; /* control the memory footprint */ unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */ @@ -520,16 +686,28 @@ struct f2fs_nm_info { struct list_head nat_entries; /* cached nat entry list (clean) */ unsigned int nat_cnt; /* the # of cached nat entries */ unsigned int dirty_nat_cnt; /* total num of nat entries in set */ + unsigned int nat_blocks; /* # of nat blocks */ /* free node ids management */ struct radix_tree_root free_nid_root;/* root of the free_nid cache */ - struct list_head free_nid_list; /* a list for free nids */ - spinlock_t free_nid_list_lock; /* protect free nid list */ - unsigned int fcnt; /* the number of free node id */ + struct list_head nid_list[MAX_NID_LIST];/* lists for free nids */ + unsigned int nid_cnt[MAX_NID_LIST]; /* the number of free node id */ + spinlock_t nid_list_lock; /* protect nid lists ops */ struct mutex build_lock; /* lock for build free nids */ + unsigned char (*free_nid_bitmap)[NAT_ENTRY_BITMAP_SIZE]; + unsigned char *nat_block_bitmap; + unsigned short *free_nid_count; /* free nid count of NAT block */ /* for checkpoint */ char *nat_bitmap; /* NAT bitmap pointer */ + + unsigned int nat_bits_blocks; /* # of nat bits blocks */ + unsigned char *nat_bits; /* NAT bits blocks */ + unsigned char *full_nat_bits; /* full NAT pages */ + unsigned char *empty_nat_bits; /* empty NAT pages */ +#ifdef CONFIG_F2FS_CHECK_FS + char *nat_bitmap_mir; /* NAT bitmap mirror */ +#endif int bitmap_size; /* bitmap size */ }; @@ -586,7 +764,6 @@ enum { CURSEG_WARM_NODE, /* direct node blocks of normal files */ CURSEG_COLD_NODE, /* indirect node blocks */ NO_CHECK_TYPE, - CURSEG_DIRECT_IO, /* to use for the direct IO path */ }; struct flush_cmd { @@ -598,7 +775,8 @@ struct flush_cmd { struct flush_cmd_control { struct task_struct *f2fs_issue_flush; /* flush thread */ wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */ - atomic_t submit_flush; /* # of issued flushes */ + atomic_t issued_flush; /* # of issued flushes */ + atomic_t issing_flush; /* # of issing flushes */ struct llist_head issue_list; /* list for command issue */ struct llist_node *dispatch_list; /* list for command dispatch */ }; @@ -621,12 +799,6 @@ struct f2fs_sm_info { /* a threshold to reclaim prefree segments */ unsigned int rec_prefree_segments; - /* for small discard management */ - struct list_head discard_list; /* 4KB discard list */ - struct list_head wait_list; /* linked with issued discard bio */ - int nr_discards; /* # of discards in the list */ - int max_discards; /* max. discards to be issued */ - /* for batched trimming */ unsigned int trim_sections; /* # of sections to trim */ @@ -635,10 +807,13 @@ struct f2fs_sm_info { unsigned int ipu_policy; /* in-place-update policy */ unsigned int min_ipu_util; /* in-place-update threshold */ unsigned int min_fsync_blocks; /* threshold for fsync */ + unsigned int min_hot_blocks; /* threshold for hot block allocation */ /* for flush command control */ - struct flush_cmd_control *cmd_control_info; + struct flush_cmd_control *fcc_info; + /* for discard command control */ + struct discard_cmd_control *dcc_info; }; /* @@ -650,6 +825,7 @@ struct f2fs_sm_info { * f2fs monitors the number of several block types such as on-writeback, * dirty dentry blocks, dirty node blocks, and dirty meta blocks. */ +#define WB_DATA_TYPE(p) (__is_cp_guaranteed(p) ? F2FS_WB_CP_DATA : F2FS_WB_DATA) enum count_type { F2FS_DIRTY_DENTS, F2FS_DIRTY_DATA, @@ -657,6 +833,8 @@ enum count_type { F2FS_DIRTY_META, F2FS_INMEM_PAGES, F2FS_DIRTY_IMETA, + F2FS_WB_CP_DATA, + F2FS_WB_DATA, NR_COUNT_TYPE, }; @@ -680,29 +858,82 @@ enum page_type { META_FLUSH, INMEM, /* the below types are used by tracepoints only. */ INMEM_DROP, + INMEM_INVALIDATE, INMEM_REVOKE, IPU, OPU, }; +enum temp_type { + HOT = 0, /* must be zero for meta bio */ + WARM, + COLD, + NR_TEMP_TYPE, +}; + +enum need_lock_type { + LOCK_REQ = 0, + LOCK_DONE, + LOCK_RETRY, +}; + +enum iostat_type { + APP_DIRECT_IO, /* app direct IOs */ + APP_BUFFERED_IO, /* app buffered IOs */ + APP_WRITE_IO, /* app write IOs */ + APP_MAPPED_IO, /* app mapped IOs */ + FS_DATA_IO, /* data IOs from kworker/fsync/reclaimer */ + FS_NODE_IO, /* node IOs from kworker/fsync/reclaimer */ + FS_META_IO, /* meta IOs from kworker/reclaimer */ + FS_GC_DATA_IO, /* data IOs from forground gc */ + FS_GC_NODE_IO, /* node IOs from forground gc */ + FS_CP_DATA_IO, /* data IOs from checkpoint */ + FS_CP_NODE_IO, /* node IOs from checkpoint */ + FS_CP_META_IO, /* meta IOs from checkpoint */ + FS_DISCARD, /* discard */ + NR_IO_TYPE, +}; + struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ + enum temp_type temp; /* contains HOT/WARM/COLD */ int op; /* contains REQ_OP_ */ - int op_flags; /* rq_flag_bits */ + int op_flags; /* req_flag_bits */ block_t new_blkaddr; /* new block address to be written */ block_t old_blkaddr; /* old block address before Cow */ struct page *page; /* page to be written */ struct page *encrypted_page; /* encrypted page */ + struct list_head list; /* serialize IOs */ + bool submitted; /* indicate IO submission */ + int need_lock; /* indicate we need to lock cp_rwsem */ + bool in_list; /* indicate fio is in io_list */ + enum iostat_type io_type; /* io type */ }; -#define is_read_io(rw) (rw == READ) +#define is_read_io(rw) ((rw) == READ) struct f2fs_bio_info { struct f2fs_sb_info *sbi; /* f2fs superblock */ struct bio *bio; /* bios to merge */ sector_t last_block_in_bio; /* last block number */ struct f2fs_io_info fio; /* store buffered io info. */ struct rw_semaphore io_rwsem; /* blocking op for bio */ + spinlock_t io_lock; /* serialize DATA/NODE IOs */ + struct list_head io_list; /* track fios */ +}; + +#define FDEV(i) (sbi->devs[i]) +#define RDEV(i) (raw_super->devs[i]) +struct f2fs_dev_info { + struct block_device *bdev; + char path[MAX_PATH_LEN]; + unsigned int total_segments; + block_t start_blk; + block_t end_blk; +#ifdef CONFIG_BLK_DEV_ZONED + unsigned int nr_blkz; /* Total number of zones */ + u8 *blkz_type; /* Array of zones type */ +#endif }; enum inode_type { @@ -736,10 +967,6 @@ enum { MAX_TIME, }; -#ifdef CONFIG_F2FS_FS_ENCRYPTION -#define F2FS_KEY_DESC_PREFIX "f2fs:" -#define F2FS_KEY_DESC_PREFIX_SIZE 5 -#endif struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ @@ -747,10 +974,11 @@ struct f2fs_sb_info { int valid_super_block; /* valid super block no */ unsigned long s_flag; /* flags for sbi */ -#ifdef CONFIG_F2FS_FS_ENCRYPTION - u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE]; - u8 key_prefix_size; +#ifdef CONFIG_BLK_DEV_ZONED + unsigned int blocks_per_blkz; /* F2FS blocks per zone */ + unsigned int log_blocks_per_blkz; /* log2 F2FS blocks per zone */ #endif + /* for node-related operations */ struct f2fs_nm_info *nm_info; /* node manager */ struct inode *node_inode; /* cache node blocks */ @@ -759,9 +987,11 @@ struct f2fs_sb_info { struct f2fs_sm_info *sm_info; /* segment manager */ /* for bio operations */ - struct f2fs_bio_info read_io; /* for read bios */ - struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */ - struct mutex wio_mutex[NODE + 1]; /* bio ordering for NODE/DATA */ + struct f2fs_bio_info *write_io[NR_PAGE_TYPE]; /* for write bios */ + struct mutex wio_mutex[NR_PAGE_TYPE - 1][NR_TEMP_TYPE]; + /* bio ordering for NODE/DATA */ + int write_io_size_bits; /* Write IO size bits */ + mempool_t *write_io_dummy; /* Dummy pages */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ @@ -771,6 +1001,7 @@ struct f2fs_sb_info { struct mutex cp_mutex; /* checkpoint procedure lock */ struct rw_semaphore cp_rwsem; /* blocking FS operations */ struct rw_semaphore node_write; /* locking node writes */ + struct rw_semaphore node_change; /* locking node change */ wait_queue_head_t cp_wait; unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ long interval_time[MAX_TIME]; /* to store thresholds */ @@ -786,7 +1017,7 @@ struct f2fs_sb_info { /* for extent tree cache */ struct radix_tree_root extent_tree_root;/* cache extent cache entries */ - struct rw_semaphore extent_tree_lock; /* locking extent radix tree */ + struct mutex extent_tree_lock; /* locking extent radix tree */ struct list_head extent_list; /* lru list for shrinker */ spinlock_t extent_lock; /* locking extent lru list */ atomic_t total_ext_tree; /* extent tree count */ @@ -816,14 +1047,18 @@ struct f2fs_sb_info { block_t total_valid_block_count; /* # of valid blocks */ block_t discard_blks; /* discard command candidats */ block_t last_valid_block_count; /* for recovery */ + block_t reserved_blocks; /* configurable reserved blocks */ + u32 s_next_generation; /* for NFS support */ - atomic_t nr_wb_bios; /* # of writeback bios */ /* # of pages, see count_type */ atomic_t nr_pages[NR_COUNT_TYPE]; /* # of allocated blocks */ struct percpu_counter alloc_valid_block_count; + /* writeback control */ + atomic_t wb_sync_req; /* count # of WB_SYNC threads */ + /* valid inode count */ struct percpu_counter total_valid_inode_count; @@ -856,18 +1091,28 @@ struct f2fs_sb_info { atomic_t inline_xattr; /* # of inline_xattr inodes */ atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ + atomic_t aw_cnt; /* # of atomic writes */ + atomic_t vw_cnt; /* # of volatile writes */ + atomic_t max_aw_cnt; /* max # of atomic writes */ + atomic_t max_vw_cnt; /* max # of volatile writes */ int bg_gc; /* background gc calls */ unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ #endif - unsigned int last_victim[2]; /* last victim segment # */ spinlock_t stat_lock; /* lock for stat operations */ + /* For app/fs IO statistics */ + spinlock_t iostat_lock; + unsigned long long write_iostat[NR_IO_TYPE]; + bool iostat_enable; + /* For sysfs suppport */ struct kobject s_kobj; struct completion s_kobj_unregister; /* For shrinker support */ struct list_head s_list; + int s_ndevs; /* number of devices */ + struct f2fs_dev_info *devs; /* for device list */ struct mutex umount_mutex; unsigned int shrinker_run_no; @@ -878,13 +1123,26 @@ struct f2fs_sb_info { /* Reference to checksum algorithm driver via cryptoapi */ struct crypto_shash *s_chksum_driver; + /* Precomputed FS UUID checksum for seeding other checksums */ + __u32 s_chksum_seed; + /* For fault injection */ #ifdef CONFIG_F2FS_FAULT_INJECTION struct f2fs_fault_info fault_info; #endif + +#ifdef CONFIG_QUOTA + /* Names of quota files with journalled quota */ + char *s_qf_names[MAXQUOTAS]; + int s_jquota_fmt; /* Format of quota to use */ +#endif }; #ifdef CONFIG_F2FS_FAULT_INJECTION +#define f2fs_show_injection_info(type) \ + printk("%sF2FS-fs : inject %s in %s of %pF\n", \ + KERN_INFO, fault_name[type], \ + __func__, __builtin_return_address(0)) static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) { struct f2fs_fault_info *ffi = &sbi->fault_info; @@ -898,10 +1156,6 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) atomic_inc(&ffi->inject_ops); if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) { atomic_set(&ffi->inject_ops, 0); - printk("%sF2FS-fs : inject %s in %pF\n", - KERN_INFO, - fault_name[type], - __builtin_return_address(0)); return true; } return false; @@ -912,8 +1166,8 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) * and the return value is in kbytes. s is of struct f2fs_sb_info. */ #define BD_PART_WRITTEN(s) \ -(((u64)part_stat_read(s->sb->s_bdev->bd_part, sectors[1]) - \ - s->sectors_written_start) >> 1) +(((u64)part_stat_read((s)->sb->s_bdev->bd_part, sectors[1]) - \ + (s)->sectors_written_start) >> 1) static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) { @@ -969,6 +1223,27 @@ static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc, return f2fs_crc32(sbi, buf, buf_size) == blk_crc; } +static inline u32 f2fs_chksum(struct f2fs_sb_info *sbi, u32 crc, + const void *address, unsigned int length) +{ + struct { + struct shash_desc shash; + char ctx[4]; + } desc; + int err; + + BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver) != sizeof(desc.ctx)); + + desc.shash.tfm = sbi->s_chksum_driver; + desc.shash.flags = 0; + *(u32 *)desc.ctx = crc; + + err = crypto_shash_update(&desc.shash, address, length); + BUG_ON(err); + + return *(u32 *)desc.ctx; +} + static inline struct f2fs_inode_info *F2FS_I(struct inode *inode) { return container_of(inode, struct f2fs_inode_info, vfs_inode); @@ -1069,6 +1344,12 @@ static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp) return le64_to_cpu(cp->checkpoint_ver); } +static inline __u64 cur_cp_crc(struct f2fs_checkpoint *cp) +{ + size_t crc_offset = le32_to_cpu(cp->checksum_offset); + return le32_to_cpu(*((__le32 *)((unsigned char *)cp + crc_offset))); +} + static inline bool __is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) { unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); @@ -1092,9 +1373,11 @@ static inline void __set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) static inline void set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) { - spin_lock(&sbi->cp_lock); + unsigned long flags; + + spin_lock_irqsave(&sbi->cp_lock, flags); __set_ckpt_flags(F2FS_CKPT(sbi), f); - spin_unlock(&sbi->cp_lock); + spin_unlock_irqrestore(&sbi->cp_lock, flags); } static inline void __clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) @@ -1108,16 +1391,34 @@ static inline void __clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) { - spin_lock(&sbi->cp_lock); + unsigned long flags; + + spin_lock_irqsave(&sbi->cp_lock, flags); __clear_ckpt_flags(F2FS_CKPT(sbi), f); - spin_unlock(&sbi->cp_lock); + spin_unlock_irqrestore(&sbi->cp_lock, flags); } -static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) +static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock) { - struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); + unsigned long flags; - return blk_queue_discard(q); + set_sbi_flag(sbi, SBI_NEED_FSCK); + + if (lock) + spin_lock_irqsave(&sbi->cp_lock, flags); + __clear_ckpt_flags(F2FS_CKPT(sbi), CP_NAT_BITS_FLAG); + kfree(NM_I(sbi)->nat_bits); + NM_I(sbi)->nat_bits = NULL; + if (lock) + spin_unlock_irqrestore(&sbi->cp_lock, flags); +} + +static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi, + struct cp_control *cpc) +{ + bool set = is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG); + + return (cpc) ? (cpc->reason & CP_UMOUNT) && set : set; } static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) @@ -1125,6 +1426,11 @@ static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) down_read(&sbi->cp_rwsem); } +static inline int f2fs_trylock_op(struct f2fs_sb_info *sbi) +{ + return down_read_trylock(&sbi->cp_rwsem); +} + static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi) { up_read(&sbi->cp_rwsem); @@ -1153,7 +1459,7 @@ static inline int __get_cp_reason(struct f2fs_sb_info *sbi) static inline bool __remain_node_summaries(int reason) { - return (reason == CP_UMOUNT || reason == CP_FASTBOOT); + return (reason & (CP_UMOUNT | CP_FASTBOOT)); } static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi) @@ -1174,17 +1480,14 @@ static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) return 0; } -#define F2FS_DEFAULT_ALLOCATED_BLOCKS 1 - /* * Check whether the inode has blocks or not */ static inline int F2FS_HAS_BLOCKS(struct inode *inode) { - if (F2FS_I(inode)->i_xattr_nid) - return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS + 1; - else - return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS; + block_t xattr_block = F2FS_I(inode)->i_xattr_nid ? 1 : 0; + + return (inode->i_blocks >> F2FS_LOG_SECTORS_PER_BLOCK) > xattr_block; } static inline bool f2fs_has_xattr_block(unsigned int ofs) @@ -1192,15 +1495,24 @@ static inline bool f2fs_has_xattr_block(unsigned int ofs) return ofs == XATTR_NODE_OFFSET; } -static inline void f2fs_i_blocks_write(struct inode *, blkcnt_t, bool); -static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, +static inline void f2fs_i_blocks_write(struct inode *, block_t, bool, bool); +static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, blkcnt_t *count) { - blkcnt_t diff; + blkcnt_t diff = 0, release = 0; + block_t avail_user_block_count; + int ret; + + ret = dquot_reserve_block(inode, *count); + if (ret) + return ret; #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_BLOCK)) - return false; + if (time_to_inject(sbi, FAULT_BLOCK)) { + f2fs_show_injection_info(FAULT_BLOCK); + release = *count; + goto enospc; + } #endif /* * let's increase this in prior to actual block count change in order @@ -1210,39 +1522,50 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); sbi->total_valid_block_count += (block_t)(*count); - if (unlikely(sbi->total_valid_block_count > sbi->user_block_count)) { - diff = sbi->total_valid_block_count - sbi->user_block_count; + avail_user_block_count = sbi->user_block_count - sbi->reserved_blocks; + if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) { + diff = sbi->total_valid_block_count - avail_user_block_count; *count -= diff; - sbi->total_valid_block_count = sbi->user_block_count; + release = diff; + sbi->total_valid_block_count = avail_user_block_count; if (!*count) { spin_unlock(&sbi->stat_lock); percpu_counter_sub(&sbi->alloc_valid_block_count, diff); - return false; + goto enospc; } } spin_unlock(&sbi->stat_lock); - f2fs_i_blocks_write(inode, *count, true); - return true; + if (release) + dquot_release_reservation_block(inode, release); + f2fs_i_blocks_write(inode, *count, true, true); + return 0; + +enospc: + dquot_release_reservation_block(inode, release); + return -ENOSPC; } static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, - blkcnt_t count) + block_t count) { + blkcnt_t sectors = count << F2FS_LOG_SECTORS_PER_BLOCK; + spin_lock(&sbi->stat_lock); f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count); - f2fs_bug_on(sbi, inode->i_blocks < count); + f2fs_bug_on(sbi, inode->i_blocks < sectors); sbi->total_valid_block_count -= (block_t)count; spin_unlock(&sbi->stat_lock); - f2fs_i_blocks_write(inode, count, false); + f2fs_i_blocks_write(inode, count, false, true); } static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) { atomic_inc(&sbi->nr_pages[count_type]); - if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES) + if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES || + count_type == F2FS_WB_CP_DATA || count_type == F2FS_WB_DATA) return; set_sbi_flag(sbi, SBI_IS_DIRTY); @@ -1363,51 +1686,70 @@ static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); } -static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, - struct inode *inode) +static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, + struct inode *inode, bool is_inode) { block_t valid_block_count; unsigned int valid_node_count; + bool quota = inode && !is_inode; + + if (quota) { + int ret = dquot_reserve_block(inode, 1); + if (ret) + return ret; + } spin_lock(&sbi->stat_lock); valid_block_count = sbi->total_valid_block_count + 1; - if (unlikely(valid_block_count > sbi->user_block_count)) { + if (unlikely(valid_block_count + sbi->reserved_blocks > + sbi->user_block_count)) { spin_unlock(&sbi->stat_lock); - return false; + goto enospc; } valid_node_count = sbi->total_valid_node_count + 1; if (unlikely(valid_node_count > sbi->total_node_count)) { spin_unlock(&sbi->stat_lock); - return false; + goto enospc; } - if (inode) - f2fs_i_blocks_write(inode, 1, true); - sbi->total_valid_node_count++; sbi->total_valid_block_count++; spin_unlock(&sbi->stat_lock); + if (inode) { + if (is_inode) + f2fs_mark_inode_dirty_sync(inode, true); + else + f2fs_i_blocks_write(inode, 1, true, true); + } + percpu_counter_inc(&sbi->alloc_valid_block_count); - return true; + return 0; + +enospc: + if (quota) + dquot_release_reservation_block(inode, 1); + return -ENOSPC; } static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, - struct inode *inode) + struct inode *inode, bool is_inode) { spin_lock(&sbi->stat_lock); f2fs_bug_on(sbi, !sbi->total_valid_block_count); f2fs_bug_on(sbi, !sbi->total_valid_node_count); - f2fs_bug_on(sbi, !inode->i_blocks); + f2fs_bug_on(sbi, !is_inode && !inode->i_blocks); - f2fs_i_blocks_write(inode, 1, false); sbi->total_valid_node_count--; sbi->total_valid_block_count--; spin_unlock(&sbi->stat_lock); + + if (!is_inode) + f2fs_i_blocks_write(inode, 1, false, true); } static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi) @@ -1435,11 +1777,14 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, { #ifdef CONFIG_F2FS_FAULT_INJECTION struct page *page = find_lock_page(mapping, index); + if (page) return page; - if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) + if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) { + f2fs_show_injection_info(FAULT_PAGE_ALLOC); return NULL; + } #endif if (!for_write) return grab_cache_page(mapping, index); @@ -1518,22 +1863,42 @@ static inline void f2fs_radix_tree_insert(struct radix_tree_root *root, static inline bool IS_INODE(struct page *page) { struct f2fs_node *p = F2FS_NODE(page); + return RAW_IS_INODE(p); } +static inline int offset_in_addr(struct f2fs_inode *i) +{ + return (i->i_inline & F2FS_EXTRA_ATTR) ? + (le16_to_cpu(i->i_extra_isize) / sizeof(__le32)) : 0; +} + static inline __le32 *blkaddr_in_node(struct f2fs_node *node) { return RAW_IS_INODE(node) ? node->i.i_addr : node->dn.addr; } -static inline block_t datablock_addr(struct page *node_page, - unsigned int offset) +static inline int f2fs_has_extra_attr(struct inode *inode); +static inline block_t datablock_addr(struct inode *inode, + struct page *node_page, unsigned int offset) { struct f2fs_node *raw_node; __le32 *addr_array; + int base = 0; + bool is_inode = IS_INODE(node_page); + raw_node = F2FS_NODE(node_page); + + /* from GC path only */ + if (!inode) { + if (is_inode) + base = offset_in_addr(&raw_node->i); + } else if (f2fs_has_extra_attr(inode) && is_inode) { + base = get_extra_isize(inode); + } + addr_array = blkaddr_in_node(raw_node); - return le32_to_cpu(addr_array[offset]); + return le32_to_cpu(addr_array[base + offset]); } static inline int f2fs_test_bit(unsigned int nr, char *addr) @@ -1596,6 +1961,20 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) *addr ^= mask; } +#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) +#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) +#define F2FS_FL_INHERITED (FS_PROJINHERIT_FL) + +static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & F2FS_REG_FLMASK; + else + return flags & F2FS_OTHER_FLMASK; +} + /* used for f2fs_inode_info->flags */ enum { FI_NEW_INODE, /* indicate newly allocated inode */ @@ -1614,6 +1993,7 @@ enum { FI_UPDATE_WRITE, /* inode has in-place-update data */ FI_NEED_IPU, /* used for ipu per file */ FI_ATOMIC_FILE, /* indicate atomic file */ + FI_ATOMIC_COMMIT, /* indicate the state of atomical committing */ FI_VOLATILE_FILE, /* indicate volatile file */ FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */ FI_DROP_CACHE, /* drop dirty page cache */ @@ -1621,6 +2001,10 @@ enum { FI_INLINE_DOTS, /* indicate inline dot dentries */ FI_DO_DEFRAG, /* indicate defragment is running */ FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ + FI_NO_PREALLOC, /* indicate skipped preallocated blocks */ + FI_HOT_DATA, /* indicate file is hot */ + FI_EXTRA_ATTR, /* indicate file has extra attribute */ + FI_PROJ_INHERIT, /* indicate file inherits projectid */ }; static inline void __mark_inode_dirty_flag(struct inode *inode, @@ -1634,7 +2018,7 @@ static inline void __mark_inode_dirty_flag(struct inode *inode, return; case FI_DATA_EXIST: case FI_INLINE_DOTS: - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } } @@ -1661,7 +2045,7 @@ static inline void set_acl_inode(struct inode *inode, umode_t mode) { F2FS_I(inode)->i_acl_mode = mode; set_inode_flag(inode, FI_ACL_MODE); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, false); } static inline void f2fs_i_links_write(struct inode *inode, bool inc) @@ -1670,18 +2054,26 @@ static inline void f2fs_i_links_write(struct inode *inode, bool inc) inc_nlink(inode); else drop_nlink(inode); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void f2fs_i_blocks_write(struct inode *inode, - blkcnt_t diff, bool add) + block_t diff, bool add, bool claim) { bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER); - inode->i_blocks = add ? inode->i_blocks + diff : - inode->i_blocks - diff; - f2fs_mark_inode_dirty_sync(inode); + /* add = 1, claim = 1 should be dquot_reserve_block in pair */ + if (add) { + if (claim) + dquot_claim_block(inode, diff); + else + dquot_alloc_block_nofail(inode, diff); + } else { + dquot_free_block(inode, diff); + } + + f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); } @@ -1695,34 +2087,27 @@ static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) return; i_size_write(inode, i_size); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); } -static inline bool f2fs_skip_inode_update(struct inode *inode) -{ - if (!is_inode_flag_set(inode, FI_AUTO_RECOVER)) - return false; - return F2FS_I(inode)->last_disk_size == i_size_read(inode); -} - static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth) { F2FS_I(inode)->i_current_depth = depth; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void f2fs_i_xnid_write(struct inode *inode, nid_t xnid) { F2FS_I(inode)->i_xattr_nid = xnid; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void f2fs_i_pino_write(struct inode *inode, nid_t pino) { F2FS_I(inode)->i_pino = pino; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) @@ -1739,6 +2124,8 @@ static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) set_bit(FI_DATA_EXIST, &fi->flags); if (ri->i_inline & F2FS_INLINE_DOTS) set_bit(FI_INLINE_DOTS, &fi->flags); + if (ri->i_inline & F2FS_EXTRA_ATTR) + set_bit(FI_EXTRA_ATTR, &fi->flags); } static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) @@ -1755,6 +2142,13 @@ static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) ri->i_inline |= F2FS_DATA_EXIST; if (is_inode_flag_set(inode, FI_INLINE_DOTS)) ri->i_inline |= F2FS_INLINE_DOTS; + if (is_inode_flag_set(inode, FI_EXTRA_ATTR)) + ri->i_inline |= F2FS_EXTRA_ATTR; +} + +static inline int f2fs_has_extra_attr(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_EXTRA_ATTR); } static inline int f2fs_has_inline_xattr(struct inode *inode) @@ -1765,13 +2159,14 @@ static inline int f2fs_has_inline_xattr(struct inode *inode) static inline unsigned int addrs_per_inode(struct inode *inode) { if (f2fs_has_inline_xattr(inode)) - return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS; - return DEF_ADDRS_PER_INODE; + return CUR_ADDRS_PER_INODE(inode) - F2FS_INLINE_XATTR_ADDRS; + return CUR_ADDRS_PER_INODE(inode); } static inline void *inline_xattr_addr(struct page *page) { struct f2fs_inode *ri = F2FS_INODE(page); + return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS]); } @@ -1789,12 +2184,6 @@ static inline int f2fs_has_inline_data(struct inode *inode) return is_inode_flag_set(inode, FI_INLINE_DATA); } -static inline void f2fs_clear_inline_inode(struct inode *inode) -{ - clear_inode_flag(inode, FI_INLINE_DATA); - clear_inode_flag(inode, FI_DATA_EXIST); -} - static inline int f2fs_exist_data(struct inode *inode) { return is_inode_flag_set(inode, FI_DATA_EXIST); @@ -1810,6 +2199,11 @@ static inline bool f2fs_is_atomic_file(struct inode *inode) return is_inode_flag_set(inode, FI_ATOMIC_FILE); } +static inline bool f2fs_is_commit_atomic_write(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_ATOMIC_COMMIT); +} + static inline bool f2fs_is_volatile_file(struct inode *inode) { return is_inode_flag_set(inode, FI_VOLATILE_FILE); @@ -1825,10 +2219,12 @@ static inline bool f2fs_is_drop_cache(struct inode *inode) return is_inode_flag_set(inode, FI_DROP_CACHE); } -static inline void *inline_data_addr(struct page *page) +static inline void *inline_data_addr(struct inode *inode, struct page *page) { struct f2fs_inode *ri = F2FS_INODE(page); - return (void *)&(ri->i_addr[1]); + int extra_size = get_extra_isize(inode); + + return (void *)&(ri->i_addr[extra_size + DEF_INLINE_RESERVED_SIZE]); } static inline int f2fs_has_inline_dentry(struct inode *inode) @@ -1850,13 +2246,31 @@ static inline int is_file(struct inode *inode, int type) static inline void set_file(struct inode *inode, int type) { F2FS_I(inode)->i_advise |= type; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void clear_file(struct inode *inode, int type) { F2FS_I(inode)->i_advise &= ~type; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); +} + +static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) +{ + if (dsync) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + bool ret; + + spin_lock(&sbi->inode_lock[DIRTY_META]); + ret = list_empty(&F2FS_I(inode)->gdirty_list); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return ret; + } + if (!is_inode_flag_set(inode, FI_AUTO_RECOVER) || + file_keep_isize(inode) || + i_size_read(inode) & PAGE_MASK) + return false; + return F2FS_I(inode)->last_disk_size == i_size_read(inode); } static inline int f2fs_readonly(struct super_block *sb) @@ -1893,13 +2307,15 @@ static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_KMALLOC)) + if (time_to_inject(sbi, FAULT_KMALLOC)) { + f2fs_show_injection_info(FAULT_KMALLOC); return NULL; + } #endif return kmalloc(size, flags); } -static inline void *f2fs_kvmalloc(size_t size, gfp_t flags) +static inline void *kvmalloc(size_t size, gfp_t flags) { void *ret; @@ -1909,7 +2325,7 @@ static inline void *f2fs_kvmalloc(size_t size, gfp_t flags) return ret; } -static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) +static inline void *kvzalloc(size_t size, gfp_t flags) { void *ret; @@ -1919,42 +2335,89 @@ static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) return ret; } +static inline int wbc_to_write_flags(struct writeback_control *wbc) +{ + if (wbc->sync_mode == WB_SYNC_ALL) + return REQ_SYNC; + else if (wbc->for_kupdate || wbc->for_background) + return 0; + + return 0; +} + +static inline int get_extra_isize(struct inode *inode) +{ + return F2FS_I(inode)->i_extra_isize / sizeof(__le32); +} + #define get_inode_mode(i) \ ((is_inode_flag_set(i, FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) -/* get offset of first page in next direct node */ -#define PGOFS_OF_NEXT_DNODE(pgofs, inode) \ - ((pgofs < ADDRS_PER_INODE(inode)) ? ADDRS_PER_INODE(inode) : \ - (pgofs - ADDRS_PER_INODE(inode) + ADDRS_PER_BLOCK) / \ - ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode)) +#define F2FS_TOTAL_EXTRA_ATTR_SIZE \ + (offsetof(struct f2fs_inode, i_extra_end) - \ + offsetof(struct f2fs_inode, i_extra_isize)) \ + +#define F2FS_OLD_ATTRIBUTE_SIZE (offsetof(struct f2fs_inode, i_addr)) +#define F2FS_FITS_IN_INODE(f2fs_inode, extra_isize, field) \ + ((offsetof(typeof(*f2fs_inode), field) + \ + sizeof((f2fs_inode)->field)) \ + <= (F2FS_OLD_ATTRIBUTE_SIZE + extra_isize)) \ + +static inline void f2fs_reset_iostat(struct f2fs_sb_info *sbi) +{ + int i; + + spin_lock(&sbi->iostat_lock); + for (i = 0; i < NR_IO_TYPE; i++) + sbi->write_iostat[i] = 0; + spin_unlock(&sbi->iostat_lock); +} + +static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, + enum iostat_type type, unsigned long long io_bytes) +{ + if (!sbi->iostat_enable) + return; + spin_lock(&sbi->iostat_lock); + sbi->write_iostat[type] += io_bytes; + + if (type == APP_WRITE_IO || type == APP_DIRECT_IO) + sbi->write_iostat[APP_BUFFERED_IO] = + sbi->write_iostat[APP_WRITE_IO] - + sbi->write_iostat[APP_DIRECT_IO]; + spin_unlock(&sbi->iostat_lock); +} /* * file.c */ -int f2fs_sync_file(struct file *, loff_t, loff_t, int); -void truncate_data_blocks(struct dnode_of_data *); -int truncate_blocks(struct inode *, u64, bool); -int f2fs_truncate(struct inode *); -int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); -int f2fs_setattr(struct dentry *, struct iattr *); -int truncate_hole(struct inode *, pgoff_t, pgoff_t); -int truncate_data_blocks_range(struct dnode_of_data *, int); -long f2fs_ioctl(struct file *, unsigned int, unsigned long); -long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long); +int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); +void truncate_data_blocks(struct dnode_of_data *dn); +int truncate_blocks(struct inode *inode, u64 from, bool lock); +int f2fs_truncate(struct inode *inode); +int f2fs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); +int f2fs_setattr(struct dentry *dentry, struct iattr *attr); +int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end); +int truncate_data_blocks_range(struct dnode_of_data *dn, int count); +long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); +long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); /* * inode.c */ -void f2fs_set_inode_flags(struct inode *); -struct inode *f2fs_iget(struct super_block *, unsigned long); -struct inode *f2fs_iget_retry(struct super_block *, unsigned long); -int try_to_free_nats(struct f2fs_sb_info *, int); -int update_inode(struct inode *, struct page *); -int update_inode_page(struct inode *); -int f2fs_write_inode(struct inode *, struct writeback_control *); -void f2fs_evict_inode(struct inode *); -void handle_failed_inode(struct inode *); +void f2fs_set_inode_flags(struct inode *inode); +bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page); +void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page); +struct inode *f2fs_iget(struct super_block *sb, unsigned long ino); +struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino); +int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink); +int update_inode(struct inode *inode, struct page *node_page); +int update_inode_page(struct inode *inode); +int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc); +void f2fs_evict_inode(struct inode *inode); +void handle_failed_inode(struct inode *inode); /* * namei.c @@ -1964,40 +2427,45 @@ struct dentry *f2fs_get_parent(struct dentry *child); /* * dir.c */ -void set_de_type(struct f2fs_dir_entry *, umode_t); -unsigned char get_de_type(struct f2fs_dir_entry *); -struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *, - f2fs_hash_t, int *, struct f2fs_dentry_ptr *); -bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, - unsigned int, struct fscrypt_str *); -void do_make_empty_dir(struct inode *, struct inode *, - struct f2fs_dentry_ptr *); -struct page *init_inode_metadata(struct inode *, struct inode *, - const struct qstr *, const struct qstr *, struct page *); -void update_parent_metadata(struct inode *, struct inode *, unsigned int); -int room_for_filename(const void *, int, int); -void f2fs_drop_nlink(struct inode *, struct inode *); -struct f2fs_dir_entry *__f2fs_find_entry(struct inode *, struct fscrypt_name *, - struct page **); -struct f2fs_dir_entry *f2fs_find_entry(struct inode *, const struct qstr *, - struct page **); -struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **); -ino_t f2fs_inode_by_name(struct inode *, const struct qstr *, struct page **); -void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, - struct page *, struct inode *); -int update_dent_inode(struct inode *, struct inode *, const struct qstr *); -void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *, - const struct qstr *, f2fs_hash_t , unsigned int); -int f2fs_add_regular_entry(struct inode *, const struct qstr *, - const struct qstr *, struct inode *, nid_t, umode_t); -int __f2fs_do_add_link(struct inode *, struct fscrypt_name*, struct inode *, - nid_t, umode_t); -int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *, nid_t, - umode_t); -void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *, - struct inode *); -int f2fs_do_tmpfile(struct inode *, struct inode *); -bool f2fs_empty_dir(struct inode *); +void set_de_type(struct f2fs_dir_entry *de, umode_t mode); +unsigned char get_de_type(struct f2fs_dir_entry *de); +struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, + f2fs_hash_t namehash, int *max_slots, + struct f2fs_dentry_ptr *d); +int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, + unsigned int start_pos, struct fscrypt_str *fstr); +void do_make_empty_dir(struct inode *inode, struct inode *parent, + struct f2fs_dentry_ptr *d); +struct page *init_inode_metadata(struct inode *inode, struct inode *dir, + const struct qstr *new_name, + const struct qstr *orig_name, struct page *dpage); +void update_parent_metadata(struct inode *dir, struct inode *inode, + unsigned int current_depth); +int room_for_filename(const void *bitmap, int slots, int max_slots); +void f2fs_drop_nlink(struct inode *dir, struct inode *inode); +struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, + struct fscrypt_name *fname, struct page **res_page); +struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, + const struct qstr *child, struct page **res_page); +struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p); +ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, + struct page **page); +void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, + struct page *page, struct inode *inode); +void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, + const struct qstr *name, f2fs_hash_t name_hash, + unsigned int bit_pos); +int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, + const struct qstr *orig_name, + struct inode *inode, nid_t ino, umode_t mode); +int __f2fs_do_add_link(struct inode *dir, struct fscrypt_name *fname, + struct inode *inode, nid_t ino, umode_t mode); +int __f2fs_add_link(struct inode *dir, const struct qstr *name, + struct inode *inode, nid_t ino, umode_t mode); +void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, + struct inode *dir, struct inode *inode); +int f2fs_do_tmpfile(struct inode *inode, struct inode *dir); +bool f2fs_empty_dir(struct inode *dir); static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) { @@ -2008,12 +2476,14 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) /* * super.c */ -int f2fs_inode_dirtied(struct inode *); -void f2fs_inode_synced(struct inode *); -int f2fs_commit_super(struct f2fs_sb_info *, bool); -int f2fs_sync_fs(struct super_block *, int); +int f2fs_inode_dirtied(struct inode *inode, bool sync); +void f2fs_inode_synced(struct inode *inode); +void f2fs_enable_quota_files(struct f2fs_sb_info *sbi); +void f2fs_quota_off_umount(struct super_block *sb); +int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); +int f2fs_sync_fs(struct super_block *sb, int sync); extern __printf(3, 4) -void f2fs_msg(struct super_block *, const char *, const char *, ...); +void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...); int sanity_check_ckpt(struct f2fs_sb_info *sbi); /* @@ -2028,160 +2498,190 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, struct dnode_of_data; struct node_info; -bool available_free_memory(struct f2fs_sb_info *, int); -int need_dentry_mark(struct f2fs_sb_info *, nid_t); -bool is_checkpointed_node(struct f2fs_sb_info *, nid_t); -bool need_inode_block_update(struct f2fs_sb_info *, nid_t); -void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); -pgoff_t get_next_page_offset(struct dnode_of_data *, pgoff_t); -int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); -int truncate_inode_blocks(struct inode *, pgoff_t); -int truncate_xattr_node(struct inode *, struct page *); -int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t); -int remove_inode_page(struct inode *); -struct page *new_inode_page(struct inode *); -struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); -void ra_node_page(struct f2fs_sb_info *, nid_t); -struct page *get_node_page(struct f2fs_sb_info *, pgoff_t); -struct page *get_node_page_ra(struct page *, int); -void move_node_page(struct page *, int); -int fsync_node_pages(struct f2fs_sb_info *, struct inode *, - struct writeback_control *, bool); -int sync_node_pages(struct f2fs_sb_info *, struct writeback_control *); -void build_free_nids(struct f2fs_sb_info *); -bool alloc_nid(struct f2fs_sb_info *, nid_t *); -void alloc_nid_done(struct f2fs_sb_info *, nid_t); -void alloc_nid_failed(struct f2fs_sb_info *, nid_t); -int try_to_free_nids(struct f2fs_sb_info *, int); -void recover_inline_xattr(struct inode *, struct page *); -void recover_xattr_data(struct inode *, struct page *, block_t); -int recover_inode_page(struct f2fs_sb_info *, struct page *); -int restore_node_summary(struct f2fs_sb_info *, unsigned int, - struct f2fs_summary_block *); -void flush_nat_entries(struct f2fs_sb_info *); -int build_node_manager(struct f2fs_sb_info *); -void destroy_node_manager(struct f2fs_sb_info *); +bool available_free_memory(struct f2fs_sb_info *sbi, int type); +int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid); +bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid); +bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino); +void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni); +pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs); +int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode); +int truncate_inode_blocks(struct inode *inode, pgoff_t from); +int truncate_xattr_node(struct inode *inode, struct page *page); +int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino); +int remove_inode_page(struct inode *inode); +struct page *new_inode_page(struct inode *inode); +struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs); +void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); +struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid); +struct page *get_node_page_ra(struct page *parent, int start); +void move_node_page(struct page *node_page, int gc_type); +int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, + struct writeback_control *wbc, bool atomic); +int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, + bool do_balance, enum iostat_type io_type); +void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount); +bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid); +void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); +void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid); +int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink); +void recover_inline_xattr(struct inode *inode, struct page *page); +int recover_xattr_data(struct inode *inode, struct page *page, + block_t blkaddr); +int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); +int restore_node_summary(struct f2fs_sb_info *sbi, + unsigned int segno, struct f2fs_summary_block *sum); +void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); +int build_node_manager(struct f2fs_sb_info *sbi); +void destroy_node_manager(struct f2fs_sb_info *sbi); int __init create_node_manager_caches(void); void destroy_node_manager_caches(void); /* * segment.c */ -void register_inmem_page(struct inode *, struct page *); -void drop_inmem_pages(struct inode *); -int commit_inmem_pages(struct inode *); -void f2fs_balance_fs(struct f2fs_sb_info *, bool); -void f2fs_balance_fs_bg(struct f2fs_sb_info *); -int f2fs_issue_flush(struct f2fs_sb_info *); -int create_flush_cmd_control(struct f2fs_sb_info *); -void destroy_flush_cmd_control(struct f2fs_sb_info *); -void invalidate_blocks(struct f2fs_sb_info *, block_t); -bool is_checkpointed_data(struct f2fs_sb_info *, block_t); -void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); -void f2fs_wait_all_discard_bio(struct f2fs_sb_info *); -void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *); -void release_discard_addrs(struct f2fs_sb_info *); -int npages_for_summary_flush(struct f2fs_sb_info *, bool); -void allocate_new_segments(struct f2fs_sb_info *); -int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *); -struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); -void update_meta_page(struct f2fs_sb_info *, void *, block_t); -void write_meta_page(struct f2fs_sb_info *, struct page *); -void write_node_page(unsigned int, struct f2fs_io_info *); -void write_data_page(struct dnode_of_data *, struct f2fs_io_info *); -void rewrite_data_page(struct f2fs_io_info *); -void __f2fs_replace_block(struct f2fs_sb_info *, struct f2fs_summary *, - block_t, block_t, bool, bool); -void f2fs_replace_block(struct f2fs_sb_info *, struct dnode_of_data *, - block_t, block_t, unsigned char, bool, bool); -void allocate_data_block(struct f2fs_sb_info *, struct page *, - block_t, block_t *, struct f2fs_summary *, int); -void f2fs_wait_on_page_writeback(struct page *, enum page_type, bool); -void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *, block_t); -void write_data_summaries(struct f2fs_sb_info *, block_t); -void write_node_summaries(struct f2fs_sb_info *, block_t); -int lookup_journal_in_cursum(struct f2fs_journal *, int, unsigned int, int); -void flush_sit_entries(struct f2fs_sb_info *, struct cp_control *); -int build_segment_manager(struct f2fs_sb_info *); -void destroy_segment_manager(struct f2fs_sb_info *); +bool need_SSR(struct f2fs_sb_info *sbi); +void register_inmem_page(struct inode *inode, struct page *page); +void drop_inmem_pages(struct inode *inode); +void drop_inmem_page(struct inode *inode, struct page *page); +int commit_inmem_pages(struct inode *inode); +void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need); +void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi); +int f2fs_issue_flush(struct f2fs_sb_info *sbi); +int create_flush_cmd_control(struct f2fs_sb_info *sbi); +void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); +void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); +bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); +void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new); +void stop_discard_thread(struct f2fs_sb_info *sbi); +void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); +void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); +void release_discard_addrs(struct f2fs_sb_info *sbi); +int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); +void allocate_new_segments(struct f2fs_sb_info *sbi); +int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range); +bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc); +struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno); +void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr); +void write_meta_page(struct f2fs_sb_info *sbi, struct page *page, + enum iostat_type io_type); +void write_node_page(unsigned int nid, struct f2fs_io_info *fio); +void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio); +int rewrite_data_page(struct f2fs_io_info *fio); +void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, + block_t old_blkaddr, block_t new_blkaddr, + bool recover_curseg, bool recover_newaddr); +void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, + block_t old_addr, block_t new_addr, + unsigned char version, bool recover_curseg, + bool recover_newaddr); +void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, + block_t old_blkaddr, block_t *new_blkaddr, + struct f2fs_summary *sum, int type, + struct f2fs_io_info *fio, bool add_list); +void f2fs_wait_on_page_writeback(struct page *page, + enum page_type type, bool ordered); +void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr); +void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk); +void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk); +int lookup_journal_in_cursum(struct f2fs_journal *journal, int type, + unsigned int val, int alloc); +void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); +int build_segment_manager(struct f2fs_sb_info *sbi); +void destroy_segment_manager(struct f2fs_sb_info *sbi); int __init create_segment_manager_caches(void); void destroy_segment_manager_caches(void); /* * checkpoint.c */ -void f2fs_stop_checkpoint(struct f2fs_sb_info *, bool); -struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); -struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); -struct page *get_tmp_page(struct f2fs_sb_info *, pgoff_t); -bool is_valid_blkaddr(struct f2fs_sb_info *, block_t, int); -int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int, bool); -void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t); -long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); -void add_ino_entry(struct f2fs_sb_info *, nid_t, int type); -void remove_ino_entry(struct f2fs_sb_info *, nid_t, int type); -void release_ino_entry(struct f2fs_sb_info *, bool); -bool exist_written_data(struct f2fs_sb_info *, nid_t, int); -int f2fs_sync_inode_meta(struct f2fs_sb_info *); -int acquire_orphan_inode(struct f2fs_sb_info *); -void release_orphan_inode(struct f2fs_sb_info *); -void add_orphan_inode(struct inode *); -void remove_orphan_inode(struct f2fs_sb_info *, nid_t); -int recover_orphan_inodes(struct f2fs_sb_info *); -int get_valid_checkpoint(struct f2fs_sb_info *); -void update_dirty_page(struct inode *, struct page *); -void remove_dirty_inode(struct inode *); -int sync_dirty_inodes(struct f2fs_sb_info *, enum inode_type); -int write_checkpoint(struct f2fs_sb_info *, struct cp_control *); -void init_ino_entry_info(struct f2fs_sb_info *); +void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io); +struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); +struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); +struct page *get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index); +bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); +int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, + int type, bool sync); +void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index); +long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, + long nr_to_write, enum iostat_type io_type); +void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); +void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); +void release_ino_entry(struct f2fs_sb_info *sbi, bool all); +bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode); +int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi); +int acquire_orphan_inode(struct f2fs_sb_info *sbi); +void release_orphan_inode(struct f2fs_sb_info *sbi); +void add_orphan_inode(struct inode *inode); +void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino); +int recover_orphan_inodes(struct f2fs_sb_info *sbi); +int get_valid_checkpoint(struct f2fs_sb_info *sbi); +void update_dirty_page(struct inode *inode, struct page *page); +void remove_dirty_inode(struct inode *inode); +int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type); +int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc); +void init_ino_entry_info(struct f2fs_sb_info *sbi); int __init create_checkpoint_caches(void); void destroy_checkpoint_caches(void); /* * data.c */ -void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int); -void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *, - struct page *, nid_t, enum page_type, int); -void f2fs_flush_merged_bios(struct f2fs_sb_info *); -int f2fs_submit_page_bio(struct f2fs_io_info *); -void f2fs_submit_page_mbio(struct f2fs_io_info *); -void set_data_blkaddr(struct dnode_of_data *); -void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t); -int reserve_new_blocks(struct dnode_of_data *, blkcnt_t); -int reserve_new_block(struct dnode_of_data *); -int f2fs_get_block(struct dnode_of_data *, pgoff_t); -ssize_t f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *); -int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); -struct page *get_read_data_page(struct inode *, pgoff_t, int, bool); -struct page *find_data_page(struct inode *, pgoff_t); -struct page *get_lock_data_page(struct inode *, pgoff_t, bool); -struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); -int do_write_data_page(struct f2fs_io_info *); -int f2fs_map_blocks(struct inode *, struct f2fs_map_blocks *, int, int); -int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64); -void f2fs_set_page_dirty_nobuffers(struct page *); -void f2fs_invalidate_page(struct page *, unsigned int, unsigned int); -int f2fs_release_page(struct page *, gfp_t); +void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type); +void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, + struct inode *inode, nid_t ino, pgoff_t idx, + enum page_type type); +void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi); +int f2fs_submit_page_bio(struct f2fs_io_info *fio); +int f2fs_submit_page_write(struct f2fs_io_info *fio); +struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, + block_t blk_addr, struct bio *bio); +int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr); +void set_data_blkaddr(struct dnode_of_data *dn); +void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr); +int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count); +int reserve_new_block(struct dnode_of_data *dn); +int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index); +int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from); +int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); +struct page *get_read_data_page(struct inode *inode, pgoff_t index, + int op_flags, bool for_write); +struct page *find_data_page(struct inode *inode, pgoff_t index); +struct page *get_lock_data_page(struct inode *inode, pgoff_t index, + bool for_write); +struct page *get_new_data_page(struct inode *inode, + struct page *ipage, pgoff_t index, bool new_i_size); +int do_write_data_page(struct f2fs_io_info *fio); +int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, + int create, int flag); +int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len); +void f2fs_set_page_dirty_nobuffers(struct page *page); +int __f2fs_write_data_pages(struct address_space *mapping, + struct writeback_control *wbc, + enum iostat_type io_type); +void f2fs_invalidate_page(struct page *page, unsigned int offset, + unsigned int length); +int f2fs_release_page(struct page *page, gfp_t wait); #ifdef CONFIG_MIGRATION -int f2fs_migrate_page(struct address_space *, struct page *, struct page *, - enum migrate_mode); +int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, + struct page *page, enum migrate_mode mode); #endif /* * gc.c */ -int start_gc_thread(struct f2fs_sb_info *); -void stop_gc_thread(struct f2fs_sb_info *); -block_t start_bidx_of_node(unsigned int, struct inode *); -int f2fs_gc(struct f2fs_sb_info *, bool); -void build_gc_manager(struct f2fs_sb_info *); +int start_gc_thread(struct f2fs_sb_info *sbi); +void stop_gc_thread(struct f2fs_sb_info *sbi); +block_t start_bidx_of_node(unsigned int node_ofs, struct inode *inode); +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, + unsigned int segno); +void build_gc_manager(struct f2fs_sb_info *sbi); /* * recovery.c */ -int recover_fsync_data(struct f2fs_sb_info *, bool); -bool space_for_roll_forward(struct f2fs_sb_info *); +int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only); +bool space_for_roll_forward(struct f2fs_sb_info *sbi); /* * debug.c @@ -2198,10 +2698,15 @@ struct f2fs_stat_info { int ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; int inmem_pages; unsigned int ndirty_dirs, ndirty_files, ndirty_all; - int nats, dirty_nats, sits, dirty_sits, fnids; + int nats, dirty_nats, sits, dirty_sits; + int free_nids, avail_nids, alloc_nids; int total_count, utilization; - int bg_gc, wb_bios; - int inline_xattr, inline_inode, inline_dir, orphans; + int bg_gc, nr_wb_cp_data, nr_wb_data; + int nr_flushing, nr_flushed, nr_discarding, nr_discarded; + int nr_discard_cmd; + unsigned int undiscard_blks; + int inline_xattr, inline_inode, inline_dir, append, update, orphans; + int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; @@ -2273,11 +2778,33 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) ((sbi)->block_count[(curseg)->alloc_type]++) #define stat_inc_inplace_blocks(sbi) \ (atomic_inc(&(sbi)->inplace_count)) +#define stat_inc_atomic_write(inode) \ + (atomic_inc(&F2FS_I_SB(inode)->aw_cnt)) +#define stat_dec_atomic_write(inode) \ + (atomic_dec(&F2FS_I_SB(inode)->aw_cnt)) +#define stat_update_max_atomic_write(inode) \ + do { \ + int cur = atomic_read(&F2FS_I_SB(inode)->aw_cnt); \ + int max = atomic_read(&F2FS_I_SB(inode)->max_aw_cnt); \ + if (cur > max) \ + atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ + } while (0) +#define stat_inc_volatile_write(inode) \ + (atomic_inc(&F2FS_I_SB(inode)->vw_cnt)) +#define stat_dec_volatile_write(inode) \ + (atomic_dec(&F2FS_I_SB(inode)->vw_cnt)) +#define stat_update_max_volatile_write(inode) \ + do { \ + int cur = atomic_read(&F2FS_I_SB(inode)->vw_cnt); \ + int max = atomic_read(&F2FS_I_SB(inode)->max_vw_cnt); \ + if (cur > max) \ + atomic_set(&F2FS_I_SB(inode)->max_vw_cnt, cur); \ + } while (0) #define stat_inc_seg_count(sbi, type, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ - (si)->tot_segs++; \ - if (type == SUM_TYPE_DATA) { \ + si->tot_segs++; \ + if ((type) == SUM_TYPE_DATA) { \ si->data_segs++; \ si->bg_data_segs += (gc_type == BG_GC) ? 1 : 0; \ } else { \ @@ -2287,14 +2814,14 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) } while (0) #define stat_inc_tot_blk_count(si, blks) \ - (si->tot_blks += (blks)) + ((si)->tot_blks += (blks)) #define stat_inc_data_blk_count(sbi, blks, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ stat_inc_tot_blk_count(si, blks); \ si->data_blks += (blks); \ - si->bg_data_blks += (gc_type == BG_GC) ? (blks) : 0; \ + si->bg_data_blks += ((gc_type) == BG_GC) ? (blks) : 0; \ } while (0) #define stat_inc_node_blk_count(sbi, blks, gc_type) \ @@ -2302,37 +2829,43 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) struct f2fs_stat_info *si = F2FS_STAT(sbi); \ stat_inc_tot_blk_count(si, blks); \ si->node_blks += (blks); \ - si->bg_node_blks += (gc_type == BG_GC) ? (blks) : 0; \ + si->bg_node_blks += ((gc_type) == BG_GC) ? (blks) : 0; \ } while (0) -int f2fs_build_stats(struct f2fs_sb_info *); -void f2fs_destroy_stats(struct f2fs_sb_info *); +int f2fs_build_stats(struct f2fs_sb_info *sbi); +void f2fs_destroy_stats(struct f2fs_sb_info *sbi); int __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); #else -#define stat_inc_cp_count(si) -#define stat_inc_bg_cp_count(si) -#define stat_inc_call_count(si) -#define stat_inc_bggc_count(si) -#define stat_inc_dirty_inode(sbi, type) -#define stat_dec_dirty_inode(sbi, type) -#define stat_inc_total_hit(sb) -#define stat_inc_rbtree_node_hit(sb) -#define stat_inc_largest_node_hit(sbi) -#define stat_inc_cached_node_hit(sbi) -#define stat_inc_inline_xattr(inode) -#define stat_dec_inline_xattr(inode) -#define stat_inc_inline_inode(inode) -#define stat_dec_inline_inode(inode) -#define stat_inc_inline_dir(inode) -#define stat_dec_inline_dir(inode) -#define stat_inc_seg_type(sbi, curseg) -#define stat_inc_block_count(sbi, curseg) -#define stat_inc_inplace_blocks(sbi) -#define stat_inc_seg_count(sbi, type, gc_type) -#define stat_inc_tot_blk_count(si, blks) -#define stat_inc_data_blk_count(sbi, blks, gc_type) -#define stat_inc_node_blk_count(sbi, blks, gc_type) +#define stat_inc_cp_count(si) do { } while (0) +#define stat_inc_bg_cp_count(si) do { } while (0) +#define stat_inc_call_count(si) do { } while (0) +#define stat_inc_bggc_count(si) do { } while (0) +#define stat_inc_dirty_inode(sbi, type) do { } while (0) +#define stat_dec_dirty_inode(sbi, type) do { } while (0) +#define stat_inc_total_hit(sb) do { } while (0) +#define stat_inc_rbtree_node_hit(sb) do { } while (0) +#define stat_inc_largest_node_hit(sbi) do { } while (0) +#define stat_inc_cached_node_hit(sbi) do { } while (0) +#define stat_inc_inline_xattr(inode) do { } while (0) +#define stat_dec_inline_xattr(inode) do { } while (0) +#define stat_inc_inline_inode(inode) do { } while (0) +#define stat_dec_inline_inode(inode) do { } while (0) +#define stat_inc_inline_dir(inode) do { } while (0) +#define stat_dec_inline_dir(inode) do { } while (0) +#define stat_inc_atomic_write(inode) do { } while (0) +#define stat_dec_atomic_write(inode) do { } while (0) +#define stat_update_max_atomic_write(inode) do { } while (0) +#define stat_inc_volatile_write(inode) do { } while (0) +#define stat_dec_volatile_write(inode) do { } while (0) +#define stat_update_max_volatile_write(inode) do { } while (0) +#define stat_inc_seg_type(sbi, curseg) do { } while (0) +#define stat_inc_block_count(sbi, curseg) do { } while (0) +#define stat_inc_inplace_blocks(sbi) do { } while (0) +#define stat_inc_seg_count(sbi, type, gc_type) do { } while (0) +#define stat_inc_tot_blk_count(si, blks) do { } while (0) +#define stat_inc_data_blk_count(sbi, blks, gc_type) do { } while (0) +#define stat_inc_node_blk_count(sbi, blks, gc_type) do { } while (0) static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } @@ -2355,52 +2888,78 @@ extern struct kmem_cache *inode_entry_slab; /* * inline.c */ -bool f2fs_may_inline_data(struct inode *); -bool f2fs_may_inline_dentry(struct inode *); -void read_inline_data(struct page *, struct page *); -bool truncate_inline_inode(struct page *, u64); -int f2fs_read_inline_data(struct inode *, struct page *); -int f2fs_convert_inline_page(struct dnode_of_data *, struct page *); -int f2fs_convert_inline_inode(struct inode *); -int f2fs_write_inline_data(struct inode *, struct page *); -bool recover_inline_data(struct inode *, struct page *); -struct f2fs_dir_entry *find_in_inline_dir(struct inode *, - struct fscrypt_name *, struct page **); -int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *); -int f2fs_add_inline_entry(struct inode *, const struct qstr *, - const struct qstr *, struct inode *, nid_t, umode_t); -void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *, - struct inode *, struct inode *); -bool f2fs_empty_inline_dir(struct inode *); -int f2fs_read_inline_dir(struct file *, struct dir_context *, - struct fscrypt_str *); -int f2fs_inline_data_fiemap(struct inode *, - struct fiemap_extent_info *, __u64, __u64); +bool f2fs_may_inline_data(struct inode *inode); +bool f2fs_may_inline_dentry(struct inode *inode); +void read_inline_data(struct page *page, struct page *ipage); +void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from); +int f2fs_read_inline_data(struct inode *inode, struct page *page); +int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page); +int f2fs_convert_inline_inode(struct inode *inode); +int f2fs_write_inline_data(struct inode *inode, struct page *page); +bool recover_inline_data(struct inode *inode, struct page *npage); +struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, + struct fscrypt_name *fname, struct page **res_page); +int make_empty_inline_dir(struct inode *inode, struct inode *parent, + struct page *ipage); +int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, + const struct qstr *orig_name, + struct inode *inode, nid_t ino, umode_t mode); +void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, + struct inode *dir, struct inode *inode); +bool f2fs_empty_inline_dir(struct inode *dir); +int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, + struct fscrypt_str *fstr); +int f2fs_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); /* * shrinker.c */ -unsigned long f2fs_shrink_count(struct shrinker *, struct shrink_control *); -unsigned long f2fs_shrink_scan(struct shrinker *, struct shrink_control *); -void f2fs_join_shrinker(struct f2fs_sb_info *); -void f2fs_leave_shrinker(struct f2fs_sb_info *); +unsigned long f2fs_shrink_count(struct shrinker *shrink, + struct shrink_control *sc); +unsigned long f2fs_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc); +void f2fs_join_shrinker(struct f2fs_sb_info *sbi); +void f2fs_leave_shrinker(struct f2fs_sb_info *sbi); /* * extent_cache.c */ -unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *, int); -bool f2fs_init_extent_tree(struct inode *, struct f2fs_extent *); -void f2fs_drop_extent_tree(struct inode *); -unsigned int f2fs_destroy_extent_node(struct inode *); -void f2fs_destroy_extent_tree(struct inode *); -bool f2fs_lookup_extent_cache(struct inode *, pgoff_t, struct extent_info *); -void f2fs_update_extent_cache(struct dnode_of_data *); +struct rb_entry *__lookup_rb_tree(struct rb_root *root, + struct rb_entry *cached_re, unsigned int ofs); +struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, + struct rb_root *root, struct rb_node **parent, + unsigned int ofs); +struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root, + struct rb_entry *cached_re, unsigned int ofs, + struct rb_entry **prev_entry, struct rb_entry **next_entry, + struct rb_node ***insert_p, struct rb_node **insert_parent, + bool force); +bool __check_rb_tree_consistence(struct f2fs_sb_info *sbi, + struct rb_root *root); +unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); +bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext); +void f2fs_drop_extent_tree(struct inode *inode); +unsigned int f2fs_destroy_extent_node(struct inode *inode); +void f2fs_destroy_extent_tree(struct inode *inode); +bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei); +void f2fs_update_extent_cache(struct dnode_of_data *dn); void f2fs_update_extent_cache_range(struct dnode_of_data *dn, - pgoff_t, block_t, unsigned int); -void init_extent_cache_info(struct f2fs_sb_info *); + pgoff_t fofs, block_t blkaddr, unsigned int len); +void init_extent_cache_info(struct f2fs_sb_info *sbi); int __init create_extent_cache(void); void destroy_extent_cache(void); +/* + * sysfs.c + */ +int __init f2fs_init_sysfs(void); +void f2fs_exit_sysfs(void); +int f2fs_register_sysfs(struct f2fs_sb_info *sbi); +void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi); + /* * crypto support */ @@ -2409,6 +2968,11 @@ static inline bool f2fs_encrypted_inode(struct inode *inode) return file_is_encrypt(inode); } +static inline bool f2fs_encrypted_file(struct inode *inode) +{ + return f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode); +} + static inline void f2fs_set_encrypted_inode(struct inode *inode) { #ifdef CONFIG_F2FS_FS_ENCRYPTION @@ -2426,9 +2990,45 @@ static inline int f2fs_sb_has_crypto(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT); } -static inline int f2fs_sb_mounted_hmsmr(struct super_block *sb) +static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) { - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_HMSMR); + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED); +} + +static inline int f2fs_sb_has_extra_attr(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_EXTRA_ATTR); +} + +static inline int f2fs_sb_has_project_quota(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_PRJQUOTA); +} + +static inline int f2fs_sb_has_inode_chksum(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_INODE_CHKSUM); +} + +#ifdef CONFIG_BLK_DEV_ZONED +static inline int get_blkz_type(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkaddr) +{ + unsigned int zno = blkaddr >> sbi->log_blocks_per_blkz; + int i; + + for (i = 0; i < sbi->s_ndevs; i++) + if (FDEV(i).bdev == bdev) + return FDEV(i).blkz_type[zno]; + return -EINVAL; +} +#endif + +static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) +{ + struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); + + return blk_queue_discard(q) || f2fs_sb_mounted_blkzoned(sbi->sb); } static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt) @@ -2457,28 +3057,4 @@ static inline bool f2fs_may_encrypt(struct inode *inode) #endif } -#ifndef CONFIG_F2FS_FS_ENCRYPTION -#define fscrypt_set_d_op(i) -#define fscrypt_get_ctx fscrypt_notsupp_get_ctx -#define fscrypt_release_ctx fscrypt_notsupp_release_ctx -#define fscrypt_encrypt_page fscrypt_notsupp_encrypt_page -#define fscrypt_decrypt_page fscrypt_notsupp_decrypt_page -#define fscrypt_decrypt_bio_pages fscrypt_notsupp_decrypt_bio_pages -#define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page -#define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page -#define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range -#define fscrypt_process_policy fscrypt_notsupp_process_policy -#define fscrypt_get_policy fscrypt_notsupp_get_policy -#define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context -#define fscrypt_inherit_context fscrypt_notsupp_inherit_context -#define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info -#define fscrypt_put_encryption_info fscrypt_notsupp_put_encryption_info -#define fscrypt_setup_filename fscrypt_notsupp_setup_filename -#define fscrypt_free_filename fscrypt_notsupp_free_filename -#define fscrypt_fname_encrypted_size fscrypt_notsupp_fname_encrypted_size -#define fscrypt_fname_alloc_buffer fscrypt_notsupp_fname_alloc_buffer -#define fscrypt_fname_free_buffer fscrypt_notsupp_fname_free_buffer -#define fscrypt_fname_disk_to_usr fscrypt_notsupp_fname_disk_to_usr -#define fscrypt_fname_usr_to_disk fscrypt_notsupp_fname_usr_to_disk -#endif #endif diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 801111e1f8ef..c29d723ae337 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -32,6 +33,19 @@ #include "trace.h" #include +static int f2fs_filemap_fault(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vma->vm_file); + int err; + + down_read(&F2FS_I(inode)->i_mmap_sem); + err = filemap_fault(vma, vmf); + up_read(&F2FS_I(inode)->i_mmap_sem); + + return err; +} + static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { @@ -59,13 +73,14 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, f2fs_balance_fs(sbi, dn.node_changed); file_update_time(vma->vm_file); + down_read(&F2FS_I(inode)->i_mmap_sem); lock_page(page); if (unlikely(page->mapping != inode->i_mapping || page_offset(page) > i_size_read(inode) || !PageUptodate(page))) { unlock_page(page); err = -EFAULT; - goto out; + goto out_sem; } /* @@ -85,17 +100,19 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, if (!PageUptodate(page)) SetPageUptodate(page); + f2fs_update_iostat(sbi, APP_MAPPED_IO, F2FS_BLKSIZE); + trace_f2fs_vm_page_mkwrite(page, DATA); mapped: /* fill the page */ f2fs_wait_on_page_writeback(page, DATA, false); /* wait for GCed encrypted page writeback */ - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr); + if (f2fs_encrypted_file(inode)) + f2fs_wait_on_block_writeback(sbi, dn.data_blkaddr); - /* if gced page is attached, don't write to cold segment */ - clear_cold_data(page); +out_sem: + up_read(&F2FS_I(inode)->i_mmap_sem); out: sb_end_pagefault(inode->i_sb); f2fs_update_time(sbi, REQ_TIME); @@ -103,7 +120,7 @@ out: } static const struct vm_operations_struct f2fs_file_vm_ops = { - .fault = filemap_fault, + .fault = f2fs_filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = f2fs_vm_page_mkwrite, }; @@ -118,11 +135,6 @@ static int get_parent_ino(struct inode *inode, nid_t *pino) if (!dentry) return 0; - if (update_dent_inode(inode, inode, &dentry->d_name)) { - dput(dentry); - return 0; - } - *pino = parent_ino(dentry); dput(dentry); return 1; @@ -143,8 +155,6 @@ static inline bool need_do_checkpoint(struct inode *inode) need_cp = true; else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino)) need_cp = true; - else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi))) - need_cp = true; else if (test_opt(sbi, FASTBOOT)) need_cp = true; else if (sbi->active_logs == 2) @@ -170,7 +180,6 @@ static void try_to_fix_pino(struct inode *inode) nid_t pino; down_write(&fi->i_sem); - fi->xattr_ver = 0; if (file_wrong_pino(inode) && inode->i_nlink == 1 && get_parent_ino(inode, &pino)) { f2fs_i_pino_write(inode, pino); @@ -210,7 +219,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, } /* if the inode is dirty, let's recover all the time */ - if (!datasync && !f2fs_skip_inode_update(inode)) { + if (!f2fs_skip_inode_update(inode, datasync)) { f2fs_write_inode(inode, NULL); goto go_write; } @@ -264,14 +273,24 @@ sync_nodes: } if (need_inode_block_update(sbi, ino)) { - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); f2fs_write_inode(inode, NULL); goto sync_nodes; } - ret = wait_on_node_pages_writeback(sbi, ino); - if (ret) - goto out; + /* + * If it's atomic_write, it's just fine to keep write ordering. So + * here we don't need to wait for node write completion, since we use + * node chain which serializes node blocks. If one of node writes are + * reordered, we can see simply broken chain, resulting in stopping + * roll-forward recovery. It means we'll recover all or none node blocks + * given fsync mark. + */ + if (!atomic) { + ret = wait_on_node_pages_writeback(sbi, ino); + if (ret) + goto out; + } /* once recovery info is written, don't need to tack this */ remove_ino_entry(sbi, ino, APPEND_INO); @@ -279,7 +298,8 @@ sync_nodes: flush_out: remove_ino_entry(sbi, ino, UPDATE_INO); clear_inode_flag(inode, FI_UPDATE_WRITE); - ret = f2fs_issue_flush(sbi); + if (!atomic) + ret = f2fs_issue_flush(sbi); f2fs_update_time(sbi, REQ_TIME); out: trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); @@ -376,7 +396,8 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) dn.ofs_in_node++, pgofs++, data_ofs = (loff_t)pgofs << PAGE_SHIFT) { block_t blkaddr; - blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + blkaddr = datablock_addr(dn.inode, + dn.node_page, dn.ofs_in_node); if (__found_offset(blkaddr, dirty, pgofs, whence)) { f2fs_put_dnode(&dn); @@ -424,14 +445,6 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) struct inode *inode = file_inode(file); int err; - if (f2fs_encrypted_inode(inode)) { - err = fscrypt_get_encryption_info(inode); - if (err) - return 0; - if (!f2fs_encrypted_inode(inode)) - return -ENOKEY; - } - /* we don't need to use inline_data strictly */ err = f2fs_convert_inline_inode(inode); if (err) @@ -444,11 +457,10 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) static int f2fs_file_open(struct inode *inode, struct file *filp) { - int ret = generic_file_open(inode, filp); struct dentry *dir; - if (!ret && f2fs_encrypted_inode(inode)) { - ret = fscrypt_get_encryption_info(inode); + if (f2fs_encrypted_inode(inode)) { + int ret = fscrypt_get_encryption_info(inode); if (ret) return -EACCES; if (!fscrypt_has_encryption_key(inode)) @@ -461,7 +473,7 @@ static int f2fs_file_open(struct inode *inode, struct file *filp) return -EPERM; } dput(dir); - return ret; + return dquot_file_open(inode, filp); } int truncate_data_blocks_range(struct dnode_of_data *dn, int count) @@ -470,9 +482,13 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) struct f2fs_node *raw_node; int nr_free = 0, ofs = dn->ofs_in_node, len = count; __le32 *addr; + int base = 0; + + if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode)) + base = get_extra_isize(dn->inode); raw_node = F2FS_NODE(dn->node_page); - addr = blkaddr_in_node(raw_node) + ofs; + addr = blkaddr_in_node(raw_node) + base + ofs; for (; count > 0; count--, addr++, dn->ofs_in_node++) { block_t blkaddr = le32_to_cpu(*addr); @@ -532,12 +548,14 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, page = get_lock_data_page(inode, index, true); if (IS_ERR(page)) - return 0; + return PTR_ERR(page) == -ENOENT ? 0 : PTR_ERR(page); truncate_out: f2fs_wait_on_page_writeback(page, DATA, true); zero_user(page, offset, PAGE_SIZE - offset); - if (!cache_only || !f2fs_encrypted_inode(inode) || - !S_ISREG(inode->i_mode)) + + /* An encrypted inode should have a key and truncate the last page. */ + f2fs_bug_on(F2FS_I_SB(inode), cache_only && f2fs_encrypted_inode(inode)); + if (!cache_only) set_page_dirty(page); f2fs_put_page(page, 1); return 0; @@ -570,8 +588,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) } if (f2fs_has_inline_data(inode)) { - if (truncate_inline_inode(ipage, from)) - set_page_dirty(ipage); + truncate_inline_inode(inode, ipage, from); f2fs_put_page(ipage, 1); truncate_page = true; goto out; @@ -620,6 +637,12 @@ int f2fs_truncate(struct inode *inode) trace_f2fs_truncate(inode); +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(F2FS_I_SB(inode), FAULT_TRUNCATE)) { + f2fs_show_injection_info(FAULT_TRUNCATE); + return -EIO; + } +#endif /* we should check inline_data size */ if (!f2fs_may_inline_data(inode)) { err = f2fs_convert_inline_inode(inode); @@ -632,16 +655,15 @@ int f2fs_truncate(struct inode *inode) return err; inode->i_mtime = inode->i_ctime = current_time(inode); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, false); return 0; } int f2fs_getattr(struct vfsmount *mnt, - struct dentry *dentry, struct kstat *stat) + struct dentry *dentry, struct kstat *stat) { struct inode *inode = d_inode(dentry); generic_fillattr(inode, stat); - stat->blocks <<= 3; return 0; } @@ -679,28 +701,50 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int err; + bool size_changed = false; err = setattr_prepare(dentry, attr); if (err) return err; - if (attr->ia_valid & ATTR_SIZE) { - if (f2fs_encrypted_inode(inode) && - fscrypt_get_encryption_info(inode)) - return -EACCES; + if (is_quota_modification(inode, attr)) { + err = dquot_initialize(inode); + if (err) + return err; + } + if ((attr->ia_valid & ATTR_UID && + !uid_eq(attr->ia_uid, inode->i_uid)) || + (attr->ia_valid & ATTR_GID && + !gid_eq(attr->ia_gid, inode->i_gid))) { + err = dquot_transfer(inode, attr); + if (err) + return err; + } - if (attr->ia_size <= i_size_read(inode)) { - truncate_setsize(inode, attr->ia_size); - err = f2fs_truncate(inode); + if (attr->ia_valid & ATTR_SIZE) { + if (f2fs_encrypted_inode(inode)) { + err = fscrypt_get_encryption_info(inode); + if (err) + return err; + if (!fscrypt_has_encryption_key(inode)) + return -ENOKEY; + } + + if (attr->ia_size <= i_size_read(inode)) { + down_write(&F2FS_I(inode)->i_mmap_sem); + truncate_setsize(inode, attr->ia_size); + err = f2fs_truncate(inode); + up_write(&F2FS_I(inode)->i_mmap_sem); if (err) return err; - f2fs_balance_fs(F2FS_I_SB(inode), true); } else { /* * do not trim all blocks after i_size if target size is * larger than i_size. */ + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_setsize(inode, attr->ia_size); + up_write(&F2FS_I(inode)->i_mmap_sem); /* should convert inline inode here */ if (!f2fs_may_inline_data(inode)) { @@ -710,6 +754,8 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } inode->i_mtime = inode->i_ctime = current_time(inode); } + + size_changed = true; } __setattr_copy(inode, attr); @@ -722,7 +768,12 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } } - f2fs_mark_inode_dirty_sync(inode); + /* file size may changed here */ + f2fs_mark_inode_dirty_sync(inode, size_changed); + + /* inode change will produce dirty node pages flushed by checkpoint */ + f2fs_balance_fs(F2FS_I_SB(inode), true); + return err; } @@ -836,12 +887,14 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) blk_start = (loff_t)pg_start << PAGE_SHIFT; blk_end = (loff_t)pg_end << PAGE_SHIFT; + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_inode_pages_range(mapping, blk_start, blk_end - 1); f2fs_lock_op(sbi); ret = truncate_hole(inode, pg_start, pg_end); f2fs_unlock_op(sbi); + up_write(&F2FS_I(inode)->i_mmap_sem); } } @@ -872,7 +925,8 @@ next_dnode: done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, inode) - dn.ofs_in_node, len); for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) { - *blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + *blkaddr = datablock_addr(dn.inode, + dn.node_page, dn.ofs_in_node); if (!is_checkpointed_data(sbi, *blkaddr)) { if (test_opt(sbi, LFS)) { @@ -948,15 +1002,15 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, ADDRS_PER_PAGE(dn.node_page, dst_inode) - dn.ofs_in_node, len - i); do { - dn.data_blkaddr = datablock_addr(dn.node_page, - dn.ofs_in_node); + dn.data_blkaddr = datablock_addr(dn.inode, + dn.node_page, dn.ofs_in_node); truncate_data_blocks_range(&dn, 1); if (do_replace[i]) { f2fs_i_blocks_write(src_inode, - 1, false); + 1, false, false); f2fs_i_blocks_write(dst_inode, - 1, true); + 1, true, false); f2fs_replace_block(sbi, &dn, dn.data_blkaddr, blkaddr[i], ni.version, true, false); @@ -1008,11 +1062,11 @@ static int __exchange_data_block(struct inode *src_inode, while (len) { olen = min((pgoff_t)4 * ADDRS_PER_BLOCK, len); - src_blkaddr = f2fs_kvzalloc(sizeof(block_t) * olen, GFP_KERNEL); + src_blkaddr = kvzalloc(sizeof(block_t) * olen, GFP_KERNEL); if (!src_blkaddr) return -ENOMEM; - do_replace = f2fs_kvzalloc(sizeof(int) * olen, GFP_KERNEL); + do_replace = kvzalloc(sizeof(int) * olen, GFP_KERNEL); if (!do_replace) { kvfree(src_blkaddr); return -ENOMEM; @@ -1080,16 +1134,17 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) pg_start = offset >> PAGE_SHIFT; pg_end = (offset + len) >> PAGE_SHIFT; + down_write(&F2FS_I(inode)->i_mmap_sem); /* write out all dirty pages from offset */ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); if (ret) - return ret; + goto out; truncate_pagecache(inode, offset); ret = f2fs_do_collapse(inode, pg_start, pg_end); if (ret) - return ret; + goto out; /* write out all moved pages, if possible */ filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); @@ -1102,6 +1157,8 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (!ret) f2fs_i_size_write(inode, new_size); +out: + up_write(&F2FS_I(inode)->i_mmap_sem); return ret; } @@ -1115,7 +1172,8 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, int ret; for (; index < end; index++, dn->ofs_in_node++) { - if (datablock_addr(dn->node_page, dn->ofs_in_node) == NULL_ADDR) + if (datablock_addr(dn->inode, dn->node_page, + dn->ofs_in_node) == NULL_ADDR) count++; } @@ -1126,8 +1184,8 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, dn->ofs_in_node = ofs_in_node; for (index = start; index < end; index++, dn->ofs_in_node++) { - dn->data_blkaddr = - datablock_addr(dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); /* * reserve_new_blocks will not guarantee entire block * allocation. @@ -1166,9 +1224,10 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, if (ret) return ret; + down_write(&F2FS_I(inode)->i_mmap_sem); ret = filemap_write_and_wait_range(mapping, offset, offset + len - 1); if (ret) - return ret; + goto out_sem; truncate_pagecache_range(inode, offset, offset + len - 1); @@ -1182,17 +1241,15 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = fill_zero(inode, pg_start, off_start, off_end - off_start); if (ret) - return ret; + goto out_sem; - if (offset + len > new_size) - new_size = offset + len; new_size = max_t(loff_t, new_size, offset + len); } else { if (off_start) { ret = fill_zero(inode, pg_start++, off_start, PAGE_SIZE - off_start); if (ret) - return ret; + goto out_sem; new_size = max_t(loff_t, new_size, (loff_t)pg_start << PAGE_SHIFT); @@ -1218,6 +1275,9 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = f2fs_do_zero_range(&dn, index, end); f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); + + f2fs_balance_fs(sbi, dn.node_changed); + if (ret) goto out; @@ -1238,6 +1298,8 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, out: if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) f2fs_i_size_write(inode, new_size); +out_sem: + up_write(&F2FS_I(inode)->i_mmap_sem); return ret; } @@ -1250,8 +1312,9 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) int ret = 0; new_size = i_size_read(inode) + len; - if (new_size > inode->i_sb->s_maxbytes) - return -EFBIG; + ret = inode_newsize_ok(inode, new_size); + if (ret) + return ret; if (offset >= i_size_read(inode)) return -EINVAL; @@ -1266,14 +1329,15 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi, true); + down_write(&F2FS_I(inode)->i_mmap_sem); ret = truncate_blocks(inode, i_size_read(inode), true); if (ret) - return ret; + goto out; /* write out all dirty pages from offset */ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); if (ret) - return ret; + goto out; truncate_pagecache(inode, offset); @@ -1302,6 +1366,8 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) if (!ret) f2fs_i_size_write(inode, new_size); +out: + up_write(&F2FS_I(inode)->i_mmap_sem); return ret; } @@ -1313,15 +1379,15 @@ static int expand_inode_data(struct inode *inode, loff_t offset, pgoff_t pg_end; loff_t new_size = i_size_read(inode); loff_t off_end; - int ret; + int err; - ret = inode_newsize_ok(inode, (len + offset)); - if (ret) - return ret; + err = inode_newsize_ok(inode, (len + offset)); + if (err) + return err; - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; + err = f2fs_convert_inline_inode(inode); + if (err) + return err; f2fs_balance_fs(sbi, true); @@ -1333,12 +1399,12 @@ static int expand_inode_data(struct inode *inode, loff_t offset, if (off_end) map.m_len++; - ret = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); - if (ret) { + err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + if (err) { pgoff_t last_off; if (!map.m_len) - return ret; + return err; last_off = map.m_lblk + map.m_len - 1; @@ -1352,7 +1418,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset, if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) f2fs_i_size_write(inode, new_size); - return ret; + return err; } static long f2fs_fallocate(struct file *file, int mode, @@ -1393,7 +1459,9 @@ static long f2fs_fallocate(struct file *file, int mode, if (!ret) { inode->i_mtime = inode->i_ctime = current_time(inode); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, false); + if (mode & FALLOC_FL_KEEP_SIZE) + file_set_keep_isize(inode); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } @@ -1419,6 +1487,7 @@ static int f2fs_release_file(struct inode *inode, struct file *filp) drop_inmem_pages(inode); if (f2fs_is_volatile_file(inode)) { clear_inode_flag(inode, FI_VOLATILE_FILE); + stat_dec_volatile_write(inode); set_inode_flag(inode, FI_DROP_CACHE); filemap_fdatawrite(inode->i_mapping); clear_inode_flag(inode, FI_DROP_CACHE); @@ -1426,17 +1495,20 @@ static int f2fs_release_file(struct inode *inode, struct file *filp) return 0; } -#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) -#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) - -static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) +static int f2fs_file_flush(struct file *file, fl_owner_t id) { - if (S_ISDIR(mode)) - return flags; - else if (S_ISREG(mode)) - return flags & F2FS_REG_FLMASK; - else - return flags & F2FS_OTHER_FLMASK; + struct inode *inode = file_inode(file); + + /* + * If the process doing a transaction is crashed, we should do + * roll-back. Otherwise, other reader/write can see corrupted database + * until all the writers close its file. Since this should be done + * before dropping file lock, it needs to do in ->flush. + */ + if (f2fs_is_atomic_file(inode) && + F2FS_I(inode)->inmem_task == current) + drop_inmem_pages(inode); + return 0; } static int f2fs_ioc_getflags(struct file *filp, unsigned long arg) @@ -1465,28 +1537,34 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) if (ret) return ret; - flags = f2fs_mask_flags(inode->i_mode, flags); - inode_lock(inode); + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) { + ret = -EPERM; + goto unlock_out; + } + + flags = f2fs_mask_flags(inode->i_mode, flags); + oldflags = fi->i_flags; if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { if (!capable(CAP_LINUX_IMMUTABLE)) { - inode_unlock(inode); ret = -EPERM; - goto out; + goto unlock_out; } } flags = flags & FS_FL_USER_MODIFIABLE; flags |= oldflags & ~FS_FL_USER_MODIFIABLE; fi->i_flags = flags; - inode_unlock(inode); inode->i_ctime = current_time(inode); f2fs_set_inode_flags(inode); -out: + f2fs_mark_inode_dirty_sync(inode, false); +unlock_out: + inode_unlock(inode); mnt_drop_write_file(filp); return ret; } @@ -1506,6 +1584,9 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + ret = mnt_want_write_file(filp); if (ret) return ret; @@ -1520,17 +1601,26 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) goto out; set_inode_flag(inode, FI_ATOMIC_FILE); + set_inode_flag(inode, FI_HOT_DATA); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); if (!get_dirty_pages(inode)) - goto out; + goto inc_stat; f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); - if (ret) + if (ret) { clear_inode_flag(inode, FI_ATOMIC_FILE); + clear_inode_flag(inode, FI_HOT_DATA); + goto out; + } + +inc_stat: + F2FS_I(inode)->inmem_task = current; + stat_inc_atomic_write(inode); + stat_update_max_atomic_write(inode); out: inode_unlock(inode); mnt_drop_write_file(filp); @@ -1555,15 +1645,19 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) goto err_out; if (f2fs_is_atomic_file(inode)) { - clear_inode_flag(inode, FI_ATOMIC_FILE); ret = commit_inmem_pages(inode); - if (ret) { - set_inode_flag(inode, FI_ATOMIC_FILE); + if (ret) goto err_out; - } - } - ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); + if (!ret) { + clear_inode_flag(inode, FI_ATOMIC_FILE); + clear_inode_flag(inode, FI_HOT_DATA); + stat_dec_atomic_write(inode); + } + } else { + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); + } err_out: inode_unlock(inode); mnt_drop_write_file(filp); @@ -1578,6 +1672,9 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + ret = mnt_want_write_file(filp); if (ret) return ret; @@ -1591,6 +1688,9 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) if (ret) goto out; + stat_inc_volatile_write(inode); + stat_update_max_volatile_write(inode); + set_inode_flag(inode, FI_VOLATILE_FILE); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); out: @@ -1646,6 +1746,7 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) drop_inmem_pages(inode); if (f2fs_is_volatile_file(inode)) { clear_inode_flag(inode, FI_VOLATILE_FILE); + stat_dec_volatile_write(inode); ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); } @@ -1691,7 +1792,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) f2fs_stop_checkpoint(sbi, false); break; case F2FS_GOING_DOWN_METAFLUSH: - sync_meta_pages(sbi, META, LONG_MAX); + sync_meta_pages(sbi, META, LONG_MAX, FS_META_IO); f2fs_stop_checkpoint(sbi, false); break; default: @@ -1752,31 +1853,16 @@ static bool uuid_is_nonzero(__u8 u[16]) static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) { - struct fscrypt_policy policy; struct inode *inode = file_inode(filp); - if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg, - sizeof(policy))) - return -EFAULT; - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - return fscrypt_process_policy(filp, &policy); + return fscrypt_ioctl_set_policy(filp, (const void __user *)arg); } static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) { - struct fscrypt_policy policy; - struct inode *inode = file_inode(filp); - int err; - - err = fscrypt_get_policy(inode, &policy); - if (err) - return err; - - if (copy_to_user((struct fscrypt_policy __user *)arg, &policy, sizeof(policy))) - return -EFAULT; - return 0; + return fscrypt_ioctl_get_policy(filp, (void __user *)arg); } static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) @@ -1842,7 +1928,51 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) mutex_lock(&sbi->gc_mutex); } - ret = f2fs_gc(sbi, sync); + ret = f2fs_gc(sbi, sync, true, NULL_SEGNO); +out: + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_gc_range range; + u64 end; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&range, (struct f2fs_gc_range __user *)arg, + sizeof(range))) + return -EFAULT; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + end = range.start + range.len; + if (range.start < MAIN_BLKADDR(sbi) || end >= MAX_BLKADDR(sbi)) + return -EINVAL; +do_more: + if (!range.sync) { + if (!mutex_trylock(&sbi->gc_mutex)) { + ret = -EBUSY; + goto out; + } + } else { + mutex_lock(&sbi->gc_mutex); + } + + ret = f2fs_gc(sbi, range.sync, true, GET_SEGNO(sbi, range.start)); + range.start += sbi->blocks_per_seg; + if (range.start <= end) + goto do_more; out: mnt_drop_write_file(filp); return ret; @@ -1876,17 +2006,16 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, { struct inode *inode = file_inode(filp); struct f2fs_map_blocks map = { .m_next_pgofs = NULL }; - struct extent_info ei; + struct extent_info ei = {0,0,0}; pgoff_t pg_start, pg_end; unsigned int blk_per_seg = sbi->blocks_per_seg; unsigned int total = 0, sec_num; - unsigned int pages_per_sec = sbi->segs_per_sec * blk_per_seg; block_t blk_end = 0; bool fragmented = false; int err; /* if in-place-update policy is enabled, don't waste time here */ - if (need_inplace_update(inode)) + if (need_inplace_update_policy(inode, NULL)) return -EINVAL; pg_start = range->start >> PAGE_SHIFT; @@ -1920,7 +2049,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, */ while (map.m_lblk < pg_end) { map.m_len = pg_end - map.m_lblk; - err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ); + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT); if (err) goto out; @@ -1944,7 +2073,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, map.m_lblk = pg_start; map.m_len = pg_end - pg_start; - sec_num = (map.m_len + pages_per_sec - 1) / pages_per_sec; + sec_num = (map.m_len + BLKS_PER_SEC(sbi) - 1) / BLKS_PER_SEC(sbi); /* * make sure there are enough free section for LFS allocation, this can @@ -1962,7 +2091,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, do_map: map.m_len = pg_end - map.m_lblk; - err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ); + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT); if (err) goto clear_out; @@ -2021,42 +2150,40 @@ static int f2fs_ioc_defragment(struct file *filp, unsigned long arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!S_ISREG(inode->i_mode)) + if (!S_ISREG(inode->i_mode) || f2fs_is_atomic_file(inode)) + return -EINVAL; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + if (copy_from_user(&range, (struct f2fs_defragment __user *)arg, + sizeof(range))) + return -EFAULT; + + /* verify alignment of offset & size */ + if (range.start & (F2FS_BLKSIZE - 1) || range.len & (F2FS_BLKSIZE - 1)) + return -EINVAL; + + if (unlikely((range.start + range.len) >> PAGE_SHIFT > + sbi->max_file_blocks)) return -EINVAL; err = mnt_want_write_file(filp); if (err) return err; - if (f2fs_readonly(sbi->sb)) { - err = -EROFS; - goto out; - } - - if (copy_from_user(&range, (struct f2fs_defragment __user *)arg, - sizeof(range))) { - err = -EFAULT; - goto out; - } - - /* verify alignment of offset & size */ - if (range.start & (F2FS_BLKSIZE - 1) || - range.len & (F2FS_BLKSIZE - 1)) { - err = -EINVAL; - goto out; - } - err = f2fs_defragment_range(sbi, filp, &range); + mnt_drop_write_file(filp); + f2fs_update_time(sbi, REQ_TIME); if (err < 0) - goto out; + return err; if (copy_to_user((struct f2fs_defragment __user *)arg, &range, sizeof(range))) - err = -EFAULT; -out: - mnt_drop_write_file(filp); - return err; + return -EFAULT; + + return 0; } static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, @@ -2190,6 +2317,8 @@ static int f2fs_ioc_move_range(struct file *filp, unsigned long arg) range.pos_out, range.len); mnt_drop_write_file(filp); + if (err) + goto err_out; if (copy_to_user((struct f2fs_move_range __user *)arg, &range, sizeof(range))) @@ -2199,6 +2328,79 @@ err_out: return err; } +static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct sit_info *sm = SIT_I(sbi); + unsigned int start_segno = 0, end_segno = 0; + unsigned int dev_start_segno = 0, dev_end_segno = 0; + struct f2fs_flush_device range; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + if (copy_from_user(&range, (struct f2fs_flush_device __user *)arg, + sizeof(range))) + return -EFAULT; + + if (sbi->s_ndevs <= 1 || sbi->s_ndevs - 1 <= range.dev_num || + sbi->segs_per_sec != 1) { + f2fs_msg(sbi->sb, KERN_WARNING, + "Can't flush %u in %d for segs_per_sec %u != 1\n", + range.dev_num, sbi->s_ndevs, + sbi->segs_per_sec); + return -EINVAL; + } + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + if (range.dev_num != 0) + dev_start_segno = GET_SEGNO(sbi, FDEV(range.dev_num).start_blk); + dev_end_segno = GET_SEGNO(sbi, FDEV(range.dev_num).end_blk); + + start_segno = sm->last_victim[FLUSH_DEVICE]; + if (start_segno < dev_start_segno || start_segno >= dev_end_segno) + start_segno = dev_start_segno; + end_segno = min(start_segno + range.segments, dev_end_segno); + + while (start_segno < end_segno) { + if (!mutex_trylock(&sbi->gc_mutex)) { + ret = -EBUSY; + goto out; + } + sm->last_victim[GC_CB] = end_segno + 1; + sm->last_victim[GC_GREEDY] = end_segno + 1; + sm->last_victim[ALLOC_NEXT] = end_segno + 1; + ret = f2fs_gc(sbi, true, true, start_segno); + if (ret == -EAGAIN) + ret = 0; + else if (ret < 0) + break; + start_segno++; + } +out: + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_ioc_get_features(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + u32 sb_feature = le32_to_cpu(F2FS_I_SB(inode)->raw_super->feature); + + /* Must validate to set it with SQLite behavior in Android. */ + sb_feature |= F2FS_FEATURE_ATOMIC_WRITE; + + return put_user(sb_feature, (u32 __user *)arg); +} + long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { @@ -2230,12 +2432,18 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_get_encryption_pwsalt(filp, arg); case F2FS_IOC_GARBAGE_COLLECT: return f2fs_ioc_gc(filp, arg); + case F2FS_IOC_GARBAGE_COLLECT_RANGE: + return f2fs_ioc_gc_range(filp, arg); case F2FS_IOC_WRITE_CHECKPOINT: return f2fs_ioc_write_checkpoint(filp, arg); case F2FS_IOC_DEFRAGMENT: return f2fs_ioc_defragment(filp, arg); case F2FS_IOC_MOVE_RANGE: return f2fs_ioc_move_range(filp, arg); + case F2FS_IOC_FLUSH_DEVICE: + return f2fs_ioc_flush_device(filp, arg); + case F2FS_IOC_GET_FEATURES: + return f2fs_ioc_get_features(filp, arg); default: return -ENOTTY; } @@ -2248,20 +2456,26 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct blk_plug plug; ssize_t ret; - if (f2fs_encrypted_inode(inode) && - !fscrypt_has_encryption_key(inode) && - fscrypt_get_encryption_info(inode)) - return -EACCES; - inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret > 0) { - ret = f2fs_preallocate_blocks(iocb, from); - if (!ret) { - blk_start_plug(&plug); - ret = __generic_file_write_iter(iocb, from); - blk_finish_plug(&plug); + int err; + + if (iov_iter_fault_in_readable(from, iov_iter_count(from))) + set_inode_flag(inode, FI_NO_PREALLOC); + + err = f2fs_preallocate_blocks(iocb, from); + if (err) { + inode_unlock(inode); + return err; } + blk_start_plug(&plug); + ret = __generic_file_write_iter(iocb, from); + blk_finish_plug(&plug); + clear_inode_flag(inode, FI_NO_PREALLOC); + + if (ret > 0) + f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret); } inode_unlock(inode); @@ -2293,10 +2507,12 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_GET_ENCRYPTION_PWSALT: case F2FS_IOC_GET_ENCRYPTION_POLICY: case F2FS_IOC_GARBAGE_COLLECT: + case F2FS_IOC_GARBAGE_COLLECT_RANGE: case F2FS_IOC_WRITE_CHECKPOINT: case F2FS_IOC_DEFRAGMENT: - break; case F2FS_IOC_MOVE_RANGE: + case F2FS_IOC_FLUSH_DEVICE: + case F2FS_IOC_GET_FEATURES: break; default: return -ENOIOCTLCMD; @@ -2312,6 +2528,7 @@ const struct file_operations f2fs_file_operations = { .open = f2fs_file_open, .release = f2fs_release_file, .mmap = f2fs_file_mmap, + .flush = f2fs_file_flush, .fsync = f2fs_sync_file, .fallocate = f2fs_fallocate, .unlocked_ioctl = f2fs_ioctl, diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 34a69e7ed90b..bfe6a8ccc3a0 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -28,17 +28,23 @@ static int gc_thread_func(void *data) struct f2fs_sb_info *sbi = data; struct f2fs_gc_kthread *gc_th = sbi->gc_thread; wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head; - long wait_ms; + unsigned int wait_ms; wait_ms = gc_th->min_sleep_time; + set_freezable(); do { + wait_event_interruptible_timeout(*wq, + kthread_should_stop() || freezing(current) || + gc_th->gc_wake, + msecs_to_jiffies(wait_ms)); + + /* give it a try one time */ + if (gc_th->gc_wake) + gc_th->gc_wake = 0; + if (try_to_freeze()) continue; - else - wait_event_interruptible_timeout(*wq, - kthread_should_stop(), - msecs_to_jiffies(wait_ms)); if (kthread_should_stop()) break; @@ -48,10 +54,15 @@ static int gc_thread_func(void *data) } #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_CHECKPOINT)) + if (time_to_inject(sbi, FAULT_CHECKPOINT)) { + f2fs_show_injection_info(FAULT_CHECKPOINT); f2fs_stop_checkpoint(sbi, false); + } #endif + if (!sb_start_write_trylock(sbi->sb)) + continue; + /* * [GC triggering condition] * 0. GC is not conducted currently. @@ -66,23 +77,28 @@ static int gc_thread_func(void *data) * So, I'd like to wait some time to collect dirty segments. */ if (!mutex_trylock(&sbi->gc_mutex)) - continue; + goto next; + + if (gc_th->gc_urgent) { + wait_ms = gc_th->urgent_sleep_time; + goto do_gc; + } if (!is_idle(sbi)) { increase_sleep_time(gc_th, &wait_ms); mutex_unlock(&sbi->gc_mutex); - continue; + goto next; } if (has_enough_invalid_blocks(sbi)) decrease_sleep_time(gc_th, &wait_ms); else increase_sleep_time(gc_th, &wait_ms); - +do_gc: stat_inc_bggc_count(sbi); /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC))) + if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true, NULL_SEGNO)) wait_ms = gc_th->no_gc_sleep_time; trace_f2fs_background_gc(sbi->sb, wait_ms, @@ -90,6 +106,8 @@ static int gc_thread_func(void *data) /* balancing f2fs's metadata periodically */ f2fs_balance_fs_bg(sbi); +next: + sb_end_write(sbi->sb); } while (!kthread_should_stop()); return 0; @@ -107,11 +125,14 @@ int start_gc_thread(struct f2fs_sb_info *sbi) goto out; } + gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME; gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME; gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME; gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME; gc_th->gc_idle = 0; + gc_th->gc_urgent = 0; + gc_th->gc_wake= 0; sbi->gc_thread = gc_th; init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); @@ -170,7 +191,11 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, if (gc_type != FG_GC && p->max_search > sbi->max_victim_search) p->max_search = sbi->max_victim_search; - p->offset = sbi->last_victim[p->gc_mode]; + /* let's select beginning hot/small space first */ + if (type == CURSEG_HOT_DATA || IS_NODESEG(type)) + p->offset = 0; + else + p->offset = SIT_I(sbi)->last_victim[p->gc_mode]; } static unsigned int get_max_cost(struct f2fs_sb_info *sbi, @@ -180,7 +205,7 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi, if (p->alloc_mode == SSR) return sbi->blocks_per_seg; if (p->gc_mode == GC_GREEDY) - return sbi->blocks_per_seg * p->ofs_unit; + return 2 * sbi->blocks_per_seg * p->ofs_unit; else if (p->gc_mode == GC_CB) return UINT_MAX; else /* No other gc_mode */ @@ -205,7 +230,7 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) continue; clear_bit(secno, dirty_i->victim_secmap); - return secno * sbi->segs_per_sec; + return GET_SEG_FROM_SEC(sbi, secno); } return NULL_SEGNO; } @@ -213,8 +238,8 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); - unsigned int secno = GET_SECNO(sbi, segno); - unsigned int start = secno * sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start = GET_SEG_FROM_SEC(sbi, secno); unsigned long long mtime = 0; unsigned int vblocks; unsigned char age = 0; @@ -223,7 +248,7 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) for (i = 0; i < sbi->segs_per_sec; i++) mtime += get_seg_entry(sbi, start + i)->mtime; - vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); + vblocks = get_valid_blocks(sbi, segno, true); mtime = div_u64(mtime, sbi->segs_per_sec); vblocks = div_u64(vblocks, sbi->segs_per_sec); @@ -242,6 +267,16 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) return UINT_MAX - ((100 * (100 - u) * age) / (100 + u)); } +static unsigned int get_greedy_cost(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + unsigned int valid_blocks = + get_valid_blocks(sbi, segno, true); + + return IS_DATASEG(get_seg_entry(sbi, segno)->type) ? + valid_blocks * 2 : valid_blocks; +} + static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno, struct victim_sel_policy *p) { @@ -250,7 +285,7 @@ static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, /* alloc_mode == LFS */ if (p->gc_mode == GC_GREEDY) - return get_valid_blocks(sbi, segno, sbi->segs_per_sec); + return get_greedy_cost(sbi, segno); else return get_cb_cost(sbi, segno); } @@ -279,6 +314,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, unsigned int *result, int gc_type, int type, char alloc_mode) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + struct sit_info *sm = SIT_I(sbi); struct victim_sel_policy p; unsigned int secno, last_victim; unsigned int last_segment = MAIN_SEGS(sbi); @@ -292,10 +328,18 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, p.min_segno = NULL_SEGNO; p.min_cost = get_max_cost(sbi, &p); + if (*result != NULL_SEGNO) { + if (IS_DATASEG(get_seg_entry(sbi, *result)->type) && + get_valid_blocks(sbi, *result, false) && + !sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result))) + p.min_segno = *result; + goto out; + } + if (p.max_search == 0) goto out; - last_victim = sbi->last_victim[p.gc_mode]; + last_victim = sm->last_victim[p.gc_mode]; if (p.alloc_mode == LFS && gc_type == FG_GC) { p.min_segno = check_bg_victims(sbi); if (p.min_segno != NULL_SEGNO) @@ -308,9 +352,10 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, segno = find_next_bit(p.dirty_segmap, last_segment, p.offset); if (segno >= last_segment) { - if (sbi->last_victim[p.gc_mode]) { - last_segment = sbi->last_victim[p.gc_mode]; - sbi->last_victim[p.gc_mode] = 0; + if (sm->last_victim[p.gc_mode]) { + last_segment = + sm->last_victim[p.gc_mode]; + sm->last_victim[p.gc_mode] = 0; p.offset = 0; continue; } @@ -327,7 +372,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, nsearched++; } - secno = GET_SECNO(sbi, segno); + secno = GET_SEC_FROM_SEG(sbi, segno); if (sec_usage_check(sbi, secno)) goto next; @@ -345,17 +390,18 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, } next: if (nsearched >= p.max_search) { - if (!sbi->last_victim[p.gc_mode] && segno <= last_victim) - sbi->last_victim[p.gc_mode] = last_victim + 1; + if (!sm->last_victim[p.gc_mode] && segno <= last_victim) + sm->last_victim[p.gc_mode] = last_victim + 1; else - sbi->last_victim[p.gc_mode] = segno + 1; + sm->last_victim[p.gc_mode] = segno + 1; + sm->last_victim[p.gc_mode] %= MAIN_SEGS(sbi); break; } } if (p.min_segno != NULL_SEGNO) { got_it: if (p.alloc_mode == LFS) { - secno = GET_SECNO(sbi, p.min_segno); + secno = GET_SEC_FROM_SEG(sbi, p.min_segno); if (gc_type == FG_GC) sbi->cur_victim_sec = secno; else @@ -538,12 +584,14 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, get_node_info(sbi, nid, dni); if (sum->version != dni->version) { - f2fs_put_page(node_page, 1); - return false; + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: valid data with mismatched node version.", + __func__); + set_sbi_flag(sbi, SBI_NEED_FSCK); } *nofs = ofs_of_node(node_page); - source_blkaddr = datablock_addr(node_page, ofs_in_node); + source_blkaddr = datablock_addr(NULL, node_page, ofs_in_node); f2fs_put_page(node_page, 1); if (source_blkaddr != blkaddr) @@ -551,14 +599,21 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return true; } -static void move_encrypted_block(struct inode *inode, block_t bidx) +/* + * Move data block via META_MAPPING while keeping locked data page. + * This can be used to move blocks, aka LBAs, directly on disk. + */ +static void move_data_block(struct inode *inode, block_t bidx, + unsigned int segno, int off) { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, + .temp = COLD, .op = REQ_OP_READ, - .op_flags = READ_SYNC, + .op_flags = 0, .encrypted_page = NULL, + .in_list = false, }; struct dnode_of_data dn; struct f2fs_summary sum; @@ -572,6 +627,12 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) if (!page) return; + if (!check_valid_map(F2FS_I_SB(inode), segno, off)) + goto out; + + if (f2fs_is_atomic_file(inode)) + goto out; + set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, bidx, LOOKUP_NODE); if (err) @@ -596,7 +657,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, - &sum, CURSEG_COLD_DATA); + &sum, CURSEG_COLD_DATA, NULL, false); fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi), newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS); @@ -632,9 +693,11 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) f2fs_wait_on_page_writeback(dn.node_page, NODE, true); fio.op = REQ_OP_WRITE; - fio.op_flags = WRITE_SYNC; + fio.op_flags = REQ_SYNC; fio.new_blkaddr = newaddr; - f2fs_submit_page_mbio(&fio); + f2fs_submit_page_write(&fio); + + f2fs_update_iostat(fio.sbi, FS_GC_DATA_IO, F2FS_BLKSIZE); f2fs_update_data_blkaddr(&dn, newaddr); set_inode_flag(inode, FI_APPEND_WRITE); @@ -652,7 +715,8 @@ out: f2fs_put_page(page, 1); } -static void move_data_page(struct inode *inode, block_t bidx, int gc_type) +static void move_data_page(struct inode *inode, block_t bidx, int gc_type, + unsigned int segno, int off) { struct page *page; @@ -660,6 +724,12 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) if (IS_ERR(page)) return; + if (!check_valid_map(F2FS_I_SB(inode), segno, off)) + goto out; + + if (f2fs_is_atomic_file(inode)) + goto out; + if (gc_type == BG_GC) { if (PageWriteback(page)) goto out; @@ -669,10 +739,14 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, + .temp = COLD, .op = REQ_OP_WRITE, - .op_flags = WRITE_SYNC, + .op_flags = REQ_SYNC, + .old_blkaddr = NULL_ADDR, .page = page, .encrypted_page = NULL, + .need_lock = LOCK_REQ, + .io_type = FS_GC_DATA_IO, }; bool is_dirty = PageDirty(page); int err; @@ -680,8 +754,10 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) retry: set_page_dirty(page); f2fs_wait_on_page_writeback(page, DATA, true); - if (clear_page_dirty_for_io(page)) + if (clear_page_dirty_for_io(page)) { inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); + } set_cold_data(page); @@ -690,8 +766,6 @@ retry: congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry; } - - clear_cold_data(page); } out: f2fs_put_page(page, 1); @@ -761,8 +835,7 @@ next_step: continue; /* if encrypted inode, let's go phase 3 */ - if (f2fs_encrypted_inode(inode) && - S_ISREG(inode->i_mode)) { + if (f2fs_encrypted_file(inode)) { add_gc_inode(gc_list, inode); continue; } @@ -796,14 +869,18 @@ next_step: continue; } locked = true; + + /* wait for all inflight aio data */ + inode_dio_wait(inode); } start_bidx = start_bidx_of_node(nofs, inode) + ofs_in_node; - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - move_encrypted_block(inode, start_bidx); + if (f2fs_encrypted_file(inode)) + move_data_block(inode, start_bidx, segno, off); else - move_data_page(inode, start_bidx, gc_type); + move_data_page(inode, start_bidx, gc_type, + segno, off); if (locked) { up_write(&fi->dio_rwsem[WRITE]); @@ -840,7 +917,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, struct blk_plug plug; unsigned int segno = start_segno; unsigned int end_segno = start_segno + sbi->segs_per_sec; - int sec_freed = 0; + int seg_freed = 0; unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ? SUM_TYPE_DATA : SUM_TYPE_NODE; @@ -864,7 +941,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, GET_SUM_BLOCK(sbi, segno)); f2fs_put_page(sum_page, 0); - if (get_valid_blocks(sbi, segno, 1) == 0 || + if (get_valid_blocks(sbi, segno, false) == 0 || !PageUptodate(sum_page) || unlikely(f2fs_cp_error(sbi))) goto next; @@ -879,7 +956,6 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, * - mutex_lock(sentry_lock) - change_curseg() * - lock_page(sum_page) */ - if (type == SUM_TYPE_NODE) gc_node_segment(sbi, sum->entries, segno, gc_type); else @@ -887,87 +963,113 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, gc_type); stat_inc_seg_count(sbi, type, gc_type); + + if (gc_type == FG_GC && + get_valid_blocks(sbi, segno, false) == 0) + seg_freed++; next: f2fs_put_page(sum_page, 0); } if (gc_type == FG_GC) - f2fs_submit_merged_bio(sbi, - (type == SUM_TYPE_NODE) ? NODE : DATA, WRITE); + f2fs_submit_merged_write(sbi, + (type == SUM_TYPE_NODE) ? NODE : DATA); blk_finish_plug(&plug); - if (gc_type == FG_GC && - get_valid_blocks(sbi, start_segno, sbi->segs_per_sec) == 0) - sec_freed = 1; - stat_inc_call_count(sbi->stat_info); - return sec_freed; + return seg_freed; } -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync) +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, + bool background, unsigned int segno) { - unsigned int segno; int gc_type = sync ? FG_GC : BG_GC; - int sec_freed = 0; - int ret = -EINVAL; + int sec_freed = 0, seg_freed = 0, total_freed = 0; + int ret = 0; struct cp_control cpc; + unsigned int init_segno = segno; struct gc_inode_list gc_list = { .ilist = LIST_HEAD_INIT(gc_list.ilist), .iroot = RADIX_TREE_INIT(GFP_NOFS), }; + trace_f2fs_gc_begin(sbi->sb, sync, background, + get_pages(sbi, F2FS_DIRTY_NODES), + get_pages(sbi, F2FS_DIRTY_DENTS), + get_pages(sbi, F2FS_DIRTY_IMETA), + free_sections(sbi), + free_segments(sbi), + reserved_segments(sbi), + prefree_segments(sbi)); + cpc.reason = __get_cp_reason(sbi); gc_more: - segno = NULL_SEGNO; - - if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) + if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) { + ret = -EINVAL; goto stop; + } if (unlikely(f2fs_cp_error(sbi))) { ret = -EIO; goto stop; } - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed, 0)) { - gc_type = FG_GC; + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) { /* - * If there is no victim and no prefree segment but still not - * enough free sections, we should flush dent/node blocks and do - * garbage collections. + * For example, if there are many prefree_segments below given + * threshold, we can make them free by checkpoint. Then, we + * secure free segments which doesn't need fggc any more. */ - if (__get_victim(sbi, &segno, gc_type) || - prefree_segments(sbi)) { - ret = write_checkpoint(sbi, &cpc); - if (ret) - goto stop; - segno = NULL_SEGNO; - } else if (has_not_enough_free_secs(sbi, 0, 0)) { + if (prefree_segments(sbi)) { ret = write_checkpoint(sbi, &cpc); if (ret) goto stop; } + if (has_not_enough_free_secs(sbi, 0, 0)) + gc_type = FG_GC; } - if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type)) + /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */ + if (gc_type == BG_GC && !background) { + ret = -EINVAL; goto stop; - ret = 0; + } + if (!__get_victim(sbi, &segno, gc_type)) { + ret = -ENODATA; + goto stop; + } - if (do_garbage_collect(sbi, segno, &gc_list, gc_type) && - gc_type == FG_GC) + seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type); + if (gc_type == FG_GC && seg_freed == sbi->segs_per_sec) sec_freed++; + total_freed += seg_freed; if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; if (!sync) { - if (has_not_enough_free_secs(sbi, sec_freed, 0)) + if (has_not_enough_free_secs(sbi, sec_freed, 0)) { + segno = NULL_SEGNO; goto gc_more; + } if (gc_type == FG_GC) ret = write_checkpoint(sbi, &cpc); } stop: + SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0; + SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno; + + trace_f2fs_gc_end(sbi->sb, ret, total_freed, sec_freed, + get_pages(sbi, F2FS_DIRTY_NODES), + get_pages(sbi, F2FS_DIRTY_DENTS), + get_pages(sbi, F2FS_DIRTY_IMETA), + free_sections(sbi), + free_segments(sbi), + reserved_segments(sbi), + prefree_segments(sbi)); + mutex_unlock(&sbi->gc_mutex); put_gc_inode(&gc_list); @@ -979,7 +1081,7 @@ stop: void build_gc_manager(struct f2fs_sb_info *sbi) { - u64 main_count, resv_count, ovp_count, blocks_per_sec; + u64 main_count, resv_count, ovp_count; DIRTY_I(sbi)->v_ops = &default_v_ops; @@ -987,8 +1089,12 @@ void build_gc_manager(struct f2fs_sb_info *sbi) main_count = SM_I(sbi)->main_segments << sbi->log_blocks_per_seg; resv_count = SM_I(sbi)->reserved_segments << sbi->log_blocks_per_seg; ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg; - blocks_per_sec = sbi->blocks_per_seg * sbi->segs_per_sec; - sbi->fggc_threshold = div_u64((main_count - ovp_count) * blocks_per_sec, - (main_count - resv_count)); + sbi->fggc_threshold = div64_u64((main_count - ovp_count) * + BLKS_PER_SEC(sbi), (main_count - resv_count)); + + /* give warm/cold data area from slower device */ + if (sbi->s_ndevs && sbi->segs_per_sec == 1) + SIT_I(sbi)->last_victim[ALLOC_NEXT] = + GET_SEGNO(sbi, FDEV(0).end_blk) + 1; } diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index a993967dcdb9..9325191fab2d 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -13,6 +13,7 @@ * whether IO subsystem is idle * or not */ +#define DEF_GC_THREAD_URGENT_SLEEP_TIME 500 /* 500 ms */ #define DEF_GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */ #define DEF_GC_THREAD_MAX_SLEEP_TIME 60000 #define DEF_GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */ @@ -27,12 +28,15 @@ struct f2fs_gc_kthread { wait_queue_head_t gc_wait_queue_head; /* for gc sleep time */ + unsigned int urgent_sleep_time; unsigned int min_sleep_time; unsigned int max_sleep_time; unsigned int no_gc_sleep_time; /* for changing gc mode */ unsigned int gc_idle; + unsigned int gc_urgent; + unsigned int gc_wake; }; struct gc_inode_list { @@ -65,25 +69,32 @@ static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi) } static inline void increase_sleep_time(struct f2fs_gc_kthread *gc_th, - long *wait) + unsigned int *wait) { + unsigned int min_time = gc_th->min_sleep_time; + unsigned int max_time = gc_th->max_sleep_time; + if (*wait == gc_th->no_gc_sleep_time) return; - *wait += gc_th->min_sleep_time; - if (*wait > gc_th->max_sleep_time) - *wait = gc_th->max_sleep_time; + if ((long long)*wait + (long long)min_time > (long long)max_time) + *wait = max_time; + else + *wait += min_time; } static inline void decrease_sleep_time(struct f2fs_gc_kthread *gc_th, - long *wait) + unsigned int *wait) { + unsigned int min_time = gc_th->min_sleep_time; + if (*wait == gc_th->no_gc_sleep_time) *wait = gc_th->max_sleep_time; - *wait -= gc_th->min_sleep_time; - if (*wait <= gc_th->min_sleep_time) - *wait = gc_th->min_sleep_time; + if ((long long)*wait - (long long)min_time < (long long)min_time) + *wait = min_time; + else + *wait -= min_time; } static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index e14edc9d389a..9fb5fdc61888 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -23,10 +23,10 @@ bool f2fs_may_inline_data(struct inode *inode) if (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) return false; - if (i_size_read(inode) > MAX_INLINE_DATA) + if (i_size_read(inode) > MAX_INLINE_DATA(inode)) return false; - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + if (f2fs_encrypted_file(inode)) return false; return true; @@ -45,6 +45,7 @@ bool f2fs_may_inline_dentry(struct inode *inode) void read_inline_data(struct page *page, struct page *ipage) { + struct inode *inode = page->mapping->host; void *src_addr, *dst_addr; if (PageUptodate(page)) @@ -52,31 +53,33 @@ void read_inline_data(struct page *page, struct page *ipage) f2fs_bug_on(F2FS_P_SB(page), page->index); - zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE); + zero_user_segment(page, MAX_INLINE_DATA(inode), PAGE_SIZE); /* Copy the whole inline data block */ - src_addr = inline_data_addr(ipage); + src_addr = inline_data_addr(inode, ipage); dst_addr = kmap_atomic(page); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); flush_dcache_page(page); kunmap_atomic(dst_addr); if (!PageUptodate(page)) SetPageUptodate(page); } -bool truncate_inline_inode(struct page *ipage, u64 from) +void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from) { void *addr; - if (from >= MAX_INLINE_DATA) - return false; + if (from >= MAX_INLINE_DATA(inode)) + return; - addr = inline_data_addr(ipage); + addr = inline_data_addr(inode, ipage); f2fs_wait_on_page_writeback(ipage, NODE, true); - memset(addr + from, 0, MAX_INLINE_DATA - from); + memset(addr + from, 0, MAX_INLINE_DATA(inode) - from); set_page_dirty(ipage); - return true; + + if (from == 0) + clear_inode_flag(inode, FI_DATA_EXIST); } int f2fs_read_inline_data(struct inode *inode, struct page *page) @@ -129,9 +132,10 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) .sbi = F2FS_I_SB(dn->inode), .type = DATA, .op = REQ_OP_WRITE, - .op_flags = WRITE_SYNC | REQ_PRIO, + .op_flags = REQ_SYNC | REQ_PRIO, .page = page, .encrypted_page = NULL, + .io_type = FS_DATA_IO, }; int dirty, err; @@ -153,20 +157,23 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) /* write data page to try to make data consistent */ set_page_writeback(page); fio.old_blkaddr = dn->data_blkaddr; + set_inode_flag(dn->inode, FI_HOT_DATA); write_data_page(dn, &fio); f2fs_wait_on_page_writeback(page, DATA, true); - if (dirty) + if (dirty) { inode_dec_dirty_pages(dn->inode); + remove_dirty_inode(dn->inode); + } /* this converted inline_data should be recovered. */ set_inode_flag(dn->inode, FI_APPEND_WRITE); /* clear inline data and flag after data writeback */ - truncate_inline_inode(dn->inode_page, 0); + truncate_inline_inode(dn->inode, dn->inode_page, 0); clear_inline_node(dn->inode_page); clear_out: stat_dec_inline_inode(dn->inode); - f2fs_clear_inline_inode(dn->inode); + clear_inode_flag(dn->inode, FI_INLINE_DATA); f2fs_put_dnode(dn); return 0; } @@ -213,6 +220,8 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) { void *src_addr, *dst_addr; struct dnode_of_data dn; + struct address_space *mapping = page_mapping(page); + unsigned long flags; int err; set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -229,11 +238,16 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) f2fs_wait_on_page_writeback(dn.inode_page, NODE, true); src_addr = kmap_atomic(page); - dst_addr = inline_data_addr(dn.inode_page); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + dst_addr = inline_data_addr(inode, dn.inode_page); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); kunmap_atomic(src_addr); set_page_dirty(dn.inode_page); + spin_lock_irqsave(&mapping->tree_lock, flags); + radix_tree_tag_clear(&mapping->page_tree, page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + set_inode_flag(inode, FI_APPEND_WRITE); set_inode_flag(inode, FI_DATA_EXIST); @@ -268,9 +282,9 @@ process_inline: f2fs_wait_on_page_writeback(ipage, NODE, true); - src_addr = inline_data_addr(npage); - dst_addr = inline_data_addr(ipage); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + src_addr = inline_data_addr(inode, npage); + dst_addr = inline_data_addr(inode, ipage); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); set_inode_flag(inode, FI_INLINE_DATA); set_inode_flag(inode, FI_DATA_EXIST); @@ -283,9 +297,8 @@ process_inline: if (f2fs_has_inline_data(inode)) { ipage = get_node_page(sbi, inode->i_ino); f2fs_bug_on(sbi, IS_ERR(ipage)); - if (!truncate_inline_inode(ipage, 0)) - return false; - f2fs_clear_inline_inode(inode); + truncate_inline_inode(inode, ipage, 0); + clear_inode_flag(inode, FI_INLINE_DATA); f2fs_put_page(ipage, 1); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { if (truncate_blocks(inode, 0, false)) @@ -299,11 +312,11 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, struct fscrypt_name *fname, struct page **res_page) { struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); - struct f2fs_inline_dentry *inline_dentry; struct qstr name = FSTR_TO_QSTR(&fname->disk_name); struct f2fs_dir_entry *de; struct f2fs_dentry_ptr d; struct page *ipage; + void *inline_dentry; f2fs_hash_t namehash; ipage = get_node_page(sbi, dir->i_ino); @@ -314,9 +327,9 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, namehash = f2fs_dentry_hash(&name, fname); - inline_dentry = inline_data_addr(ipage); + inline_dentry = inline_data_addr(dir, ipage); - make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2); + make_dentry_ptr_inline(dir, &d, inline_dentry); de = find_target_dentry(fname, namehash, NULL, &d); unlock_page(ipage); if (de) @@ -330,19 +343,19 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, int make_empty_inline_dir(struct inode *inode, struct inode *parent, struct page *ipage) { - struct f2fs_inline_dentry *dentry_blk; struct f2fs_dentry_ptr d; + void *inline_dentry; - dentry_blk = inline_data_addr(ipage); + inline_dentry = inline_data_addr(inode, ipage); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2); + make_dentry_ptr_inline(inode, &d, inline_dentry); do_make_empty_dir(inode, parent, &d); set_page_dirty(ipage); /* update i_size to MAX_INLINE_DATA */ - if (i_size_read(inode) < MAX_INLINE_DATA) - f2fs_i_size_write(inode, MAX_INLINE_DATA); + if (i_size_read(inode) < MAX_INLINE_DATA(inode)) + f2fs_i_size_write(inode, MAX_INLINE_DATA(inode)); return 0; } @@ -351,11 +364,12 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, * release ipage in this function. */ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, - struct f2fs_inline_dentry *inline_dentry) + void *inline_dentry) { struct page *page; struct dnode_of_data dn; struct f2fs_dentry_block *dentry_blk; + struct f2fs_dentry_ptr src, dst; int err; page = f2fs_grab_cache_page(dir->i_mapping, 0, false); @@ -370,25 +384,24 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, goto out; f2fs_wait_on_page_writeback(page, DATA, true); - zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE); + zero_user_segment(page, MAX_INLINE_DATA(dir), PAGE_SIZE); dentry_blk = kmap_atomic(page); + make_dentry_ptr_inline(dir, &src, inline_dentry); + make_dentry_ptr_block(dir, &dst, dentry_blk); + /* copy data from inline dentry block to new dentry block */ - memcpy(dentry_blk->dentry_bitmap, inline_dentry->dentry_bitmap, - INLINE_DENTRY_BITMAP_SIZE); - memset(dentry_blk->dentry_bitmap + INLINE_DENTRY_BITMAP_SIZE, 0, - SIZE_OF_DENTRY_BITMAP - INLINE_DENTRY_BITMAP_SIZE); + memcpy(dst.bitmap, src.bitmap, src.nr_bitmap); + memset(dst.bitmap + src.nr_bitmap, 0, dst.nr_bitmap - src.nr_bitmap); /* * we do not need to zero out remainder part of dentry and filename * field, since we have used bitmap for marking the usage status of * them, besides, we can also ignore copying/zeroing reserved space * of dentry block, because them haven't been used so far. */ - memcpy(dentry_blk->dentry, inline_dentry->dentry, - sizeof(struct f2fs_dir_entry) * NR_INLINE_DENTRY); - memcpy(dentry_blk->filename, inline_dentry->filename, - NR_INLINE_DENTRY * F2FS_SLOT_LEN); + memcpy(dst.dentry, src.dentry, SIZE_OF_DIR_ENTRY * src.max); + memcpy(dst.filename, src.filename, src.max * F2FS_SLOT_LEN); kunmap_atomic(dentry_blk); if (!PageUptodate(page)) @@ -396,7 +409,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, set_page_dirty(page); /* clear inline dir and flag after data writeback */ - truncate_inline_inode(ipage, 0); + truncate_inline_inode(dir, ipage, 0); stat_dec_inline_dir(dir); clear_inode_flag(dir, FI_INLINE_DENTRY); @@ -409,14 +422,13 @@ out: return err; } -static int f2fs_add_inline_entries(struct inode *dir, - struct f2fs_inline_dentry *inline_dentry) +static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry) { struct f2fs_dentry_ptr d; unsigned long bit_pos = 0; int err = 0; - make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2); + make_dentry_ptr_inline(dir, &d, inline_dentry); while (bit_pos < d.max) { struct f2fs_dir_entry *de; @@ -437,7 +449,7 @@ static int f2fs_add_inline_entries(struct inode *dir, } new_name.name = d.filename[bit_pos]; - new_name.len = de->name_len; + new_name.len = le16_to_cpu(de->name_len); ino = le32_to_cpu(de->ino); fake_mode = get_de_type(de) << S_SHIFT; @@ -458,20 +470,20 @@ punch_dentry_pages: } static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, - struct f2fs_inline_dentry *inline_dentry) + void *inline_dentry) { - struct f2fs_inline_dentry *backup_dentry; + void *backup_dentry; int err; backup_dentry = f2fs_kmalloc(F2FS_I_SB(dir), - sizeof(struct f2fs_inline_dentry), GFP_F2FS_ZERO); + MAX_INLINE_DATA(dir), GFP_F2FS_ZERO); if (!backup_dentry) { f2fs_put_page(ipage, 1); return -ENOMEM; } - memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA); - truncate_inline_inode(ipage, 0); + memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA(dir)); + truncate_inline_inode(dir, ipage, 0); unlock_page(ipage); @@ -487,9 +499,9 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, return 0; recover: lock_page(ipage); - memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA); + memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA(dir)); f2fs_i_depth_write(dir, 0); - f2fs_i_size_write(dir, MAX_INLINE_DATA); + f2fs_i_size_write(dir, MAX_INLINE_DATA(dir)); set_page_dirty(ipage); f2fs_put_page(ipage, 1); @@ -498,7 +510,7 @@ recover: } static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, - struct f2fs_inline_dentry *inline_dentry) + void *inline_dentry) { if (!F2FS_I(dir)->i_dir_level) return f2fs_move_inline_dirents(dir, ipage, inline_dentry); @@ -514,7 +526,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, struct page *ipage; unsigned int bit_pos; f2fs_hash_t name_hash; - struct f2fs_inline_dentry *dentry_blk = NULL; + void *inline_dentry = NULL; struct f2fs_dentry_ptr d; int slots = GET_DENTRY_SLOTS(new_name->len); struct page *page = NULL; @@ -524,11 +536,12 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, if (IS_ERR(ipage)) return PTR_ERR(ipage); - dentry_blk = inline_data_addr(ipage); - bit_pos = room_for_filename(&dentry_blk->dentry_bitmap, - slots, NR_INLINE_DENTRY); - if (bit_pos >= NR_INLINE_DENTRY) { - err = f2fs_convert_inline_dir(dir, ipage, dentry_blk); + inline_dentry = inline_data_addr(dir, ipage); + make_dentry_ptr_inline(dir, &d, inline_dentry); + + bit_pos = room_for_filename(d.bitmap, slots, d.max); + if (bit_pos >= d.max) { + err = f2fs_convert_inline_dir(dir, ipage, inline_dentry); if (err) return err; err = -EAGAIN; @@ -543,14 +556,11 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, err = PTR_ERR(page); goto fail; } - if (f2fs_encrypted_inode(dir)) - file_set_enc_name(inode); } f2fs_wait_on_page_writeback(ipage, NODE, true); name_hash = f2fs_dentry_hash(new_name, NULL); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2); f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos); set_page_dirty(ipage); @@ -573,7 +583,8 @@ out: void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, struct inode *dir, struct inode *inode) { - struct f2fs_inline_dentry *inline_dentry; + struct f2fs_dentry_ptr d; + void *inline_dentry; int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); unsigned int bit_pos; int i; @@ -581,17 +592,18 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, lock_page(page); f2fs_wait_on_page_writeback(page, NODE, true); - inline_dentry = inline_data_addr(page); - bit_pos = dentry - inline_dentry->dentry; + inline_dentry = inline_data_addr(dir, page); + make_dentry_ptr_inline(dir, &d, inline_dentry); + + bit_pos = dentry - d.dentry; for (i = 0; i < slots; i++) - __clear_bit_le(bit_pos + i, - &inline_dentry->dentry_bitmap); + __clear_bit_le(bit_pos + i, d.bitmap); set_page_dirty(page); f2fs_put_page(page, 1); dir->i_ctime = dir->i_mtime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); if (inode) f2fs_drop_nlink(dir, inode); @@ -602,20 +614,21 @@ bool f2fs_empty_inline_dir(struct inode *dir) struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct page *ipage; unsigned int bit_pos = 2; - struct f2fs_inline_dentry *dentry_blk; + void *inline_dentry; + struct f2fs_dentry_ptr d; ipage = get_node_page(sbi, dir->i_ino); if (IS_ERR(ipage)) return false; - dentry_blk = inline_data_addr(ipage); - bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, - NR_INLINE_DENTRY, - bit_pos); + inline_dentry = inline_data_addr(dir, ipage); + make_dentry_ptr_inline(dir, &d, inline_dentry); + + bit_pos = find_next_bit_le(d.bitmap, d.max, bit_pos); f2fs_put_page(ipage, 1); - if (bit_pos < NR_INLINE_DENTRY) + if (bit_pos < d.max) return false; return true; @@ -625,26 +638,30 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, struct fscrypt_str *fstr) { struct inode *inode = file_inode(file); - struct f2fs_inline_dentry *inline_dentry = NULL; struct page *ipage = NULL; struct f2fs_dentry_ptr d; + void *inline_dentry = NULL; + int err; - if (ctx->pos == NR_INLINE_DENTRY) + make_dentry_ptr_inline(inode, &d, inline_dentry); + + if (ctx->pos == d.max) return 0; ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); - inline_dentry = inline_data_addr(ipage); + inline_dentry = inline_data_addr(inode, ipage); - make_dentry_ptr(inode, &d, (void *)inline_dentry, 2); + make_dentry_ptr_inline(inode, &d, inline_dentry); - if (!f2fs_fill_dentries(ctx, &d, 0, fstr)) - ctx->pos = NR_INLINE_DENTRY; + err = f2fs_fill_dentries(ctx, &d, 0, fstr); + if (!err) + ctx->pos = d.max; f2fs_put_page(ipage, 1); - return 0; + return err < 0 ? err : 0; } int f2fs_inline_data_fiemap(struct inode *inode, @@ -666,7 +683,7 @@ int f2fs_inline_data_fiemap(struct inode *inode, goto out; } - ilen = min_t(size_t, MAX_INLINE_DATA, i_size_read(inode)); + ilen = min_t(size_t, MAX_INLINE_DATA(inode), i_size_read(inode)); if (start >= ilen) goto out; if (start + len < ilen) @@ -675,7 +692,8 @@ int f2fs_inline_data_fiemap(struct inode *inode, get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni); byteaddr = (__u64)ni.blk_addr << inode->i_sb->s_blocksize_bits; - byteaddr += (char *)inline_data_addr(ipage) - (char *)F2FS_INODE(ipage); + byteaddr += (char *)inline_data_addr(inode, ipage) - + (char *)F2FS_INODE(ipage); err = fiemap_fill_next_extent(fieinfo, start, byteaddr, ilen, flags); out: f2fs_put_page(ipage, 1); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index d7369895a78a..50c88e37ed66 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -16,13 +16,15 @@ #include "f2fs.h" #include "node.h" +#include "segment.h" #include -void f2fs_mark_inode_dirty_sync(struct inode *inode) +void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync) { - if (f2fs_inode_dirtied(inode)) + if (f2fs_inode_dirtied(inode, sync)) return; + mark_inode_dirty_sync(inode); } @@ -43,25 +45,26 @@ void f2fs_set_inode_flags(struct inode *inode) new_fl |= S_DIRSYNC; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); - f2fs_mark_inode_dirty_sync(inode); } static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) { + int extra_size = get_extra_isize(inode); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { - if (ri->i_addr[0]) - inode->i_rdev = - old_decode_dev(le32_to_cpu(ri->i_addr[0])); + if (ri->i_addr[extra_size]) + inode->i_rdev = old_decode_dev( + le32_to_cpu(ri->i_addr[extra_size])); else - inode->i_rdev = - new_decode_dev(le32_to_cpu(ri->i_addr[1])); + inode->i_rdev = new_decode_dev( + le32_to_cpu(ri->i_addr[extra_size + 1])); } } static bool __written_first_block(struct f2fs_inode *ri) { - block_t addr = le32_to_cpu(ri->i_addr[0]); + block_t addr = le32_to_cpu(ri->i_addr[offset_in_addr(ri)]); if (addr != NEW_ADDR && addr != NULL_ADDR) return true; @@ -70,25 +73,27 @@ static bool __written_first_block(struct f2fs_inode *ri) static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri) { + int extra_size = get_extra_isize(inode); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { if (old_valid_dev(inode->i_rdev)) { - ri->i_addr[0] = + ri->i_addr[extra_size] = cpu_to_le32(old_encode_dev(inode->i_rdev)); - ri->i_addr[1] = 0; + ri->i_addr[extra_size + 1] = 0; } else { - ri->i_addr[0] = 0; - ri->i_addr[1] = + ri->i_addr[extra_size] = 0; + ri->i_addr[extra_size + 1] = cpu_to_le32(new_encode_dev(inode->i_rdev)); - ri->i_addr[2] = 0; + ri->i_addr[extra_size + 2] = 0; } } } static void __recover_inline_status(struct inode *inode, struct page *ipage) { - void *inline_data = inline_data_addr(ipage); + void *inline_data = inline_data_addr(inode, ipage); __le32 *start = inline_data; - __le32 *end = start + MAX_INLINE_DATA / sizeof(__le32); + __le32 *end = start + MAX_INLINE_DATA(inode) / sizeof(__le32); while (start < end) { if (*start++) { @@ -103,12 +108,84 @@ static void __recover_inline_status(struct inode *inode, struct page *ipage) return; } +static bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri = &F2FS_NODE(page)->i; + int extra_isize = le32_to_cpu(ri->i_extra_isize); + + if (!f2fs_sb_has_inode_chksum(sbi->sb)) + return false; + + if (!RAW_IS_INODE(F2FS_NODE(page)) || !(ri->i_inline & F2FS_EXTRA_ATTR)) + return false; + + if (!F2FS_FITS_IN_INODE(ri, extra_isize, i_inode_checksum)) + return false; + + return true; +} + +static __u32 f2fs_inode_chksum(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_node *node = F2FS_NODE(page); + struct f2fs_inode *ri = &node->i; + __le32 ino = node->footer.ino; + __le32 gen = ri->i_generation; + __u32 chksum, chksum_seed; + __u32 dummy_cs = 0; + unsigned int offset = offsetof(struct f2fs_inode, i_inode_checksum); + unsigned int cs_size = sizeof(dummy_cs); + + chksum = f2fs_chksum(sbi, sbi->s_chksum_seed, (__u8 *)&ino, + sizeof(ino)); + chksum_seed = f2fs_chksum(sbi, chksum, (__u8 *)&gen, sizeof(gen)); + + chksum = f2fs_chksum(sbi, chksum_seed, (__u8 *)ri, offset); + chksum = f2fs_chksum(sbi, chksum, (__u8 *)&dummy_cs, cs_size); + offset += cs_size; + chksum = f2fs_chksum(sbi, chksum, (__u8 *)ri + offset, + F2FS_BLKSIZE - offset); + return chksum; +} + +bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri; + __u32 provided, calculated; + + if (!f2fs_enable_inode_chksum(sbi, page) || + PageDirty(page) || PageWriteback(page)) + return true; + + ri = &F2FS_NODE(page)->i; + provided = le32_to_cpu(ri->i_inode_checksum); + calculated = f2fs_inode_chksum(sbi, page); + + if (provided != calculated) + f2fs_msg(sbi->sb, KERN_WARNING, + "checksum invalid, ino = %x, %x vs. %x", + ino_of_node(page), provided, calculated); + + return provided == calculated; +} + +void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri = &F2FS_NODE(page)->i; + + if (!f2fs_enable_inode_chksum(sbi, page)) + return; + + ri->i_inode_checksum = cpu_to_le32(f2fs_inode_chksum(sbi, page)); +} + static int do_read_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); struct page *node_page; struct f2fs_inode *ri; + projid_t i_projid; /* Check if ino is within scope */ if (check_nid_range(sbi, inode->i_ino)) { @@ -129,7 +206,7 @@ static int do_read_inode(struct inode *inode) i_gid_write(inode, le32_to_cpu(ri->i_gid)); set_nlink(inode, le32_to_cpu(ri->i_links)); inode->i_size = le64_to_cpu(ri->i_size); - inode->i_blocks = le64_to_cpu(ri->i_blocks); + inode->i_blocks = SECTOR_FROM_BLOCK(le64_to_cpu(ri->i_blocks) - 1); inode->i_atime.tv_sec = le64_to_cpu(ri->i_atime); inode->i_ctime.tv_sec = le64_to_cpu(ri->i_ctime); @@ -152,6 +229,9 @@ static int do_read_inode(struct inode *inode) get_inline_info(inode, ri); + fi->i_extra_isize = f2fs_has_extra_attr(inode) ? + le16_to_cpu(ri->i_extra_isize) : 0; + /* check data exist */ if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) __recover_inline_status(inode, node_page); @@ -165,6 +245,16 @@ static int do_read_inode(struct inode *inode) if (!need_inode_block_update(sbi, inode->i_ino)) fi->last_disk_size = inode->i_size; + if (fi->i_flags & FS_PROJINHERIT_FL) + set_inode_flag(inode, FI_PROJ_INHERIT); + + if (f2fs_has_extra_attr(inode) && f2fs_sb_has_project_quota(sbi->sb) && + F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_projid)) + i_projid = (projid_t)le32_to_cpu(ri->i_projid); + else + i_projid = F2FS_DEF_PROJID; + fi->i_projid = make_kprojid(&init_user_ns, i_projid); + f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); @@ -225,6 +315,7 @@ make_now: ret = -EIO; goto bad_inode; } + f2fs_set_inode_flags(inode); unlock_new_inode(inode); trace_f2fs_iget(inode); return inode; @@ -252,6 +343,7 @@ retry: int update_inode(struct inode *inode, struct page *node_page) { struct f2fs_inode *ri; + struct extent_tree *et = F2FS_I(inode)->extent_tree; f2fs_inode_synced(inode); @@ -265,13 +357,15 @@ int update_inode(struct inode *inode, struct page *node_page) ri->i_gid = cpu_to_le32(i_gid_read(inode)); ri->i_links = cpu_to_le32(inode->i_nlink); ri->i_size = cpu_to_le64(i_size_read(inode)); - ri->i_blocks = cpu_to_le64(inode->i_blocks); + ri->i_blocks = cpu_to_le64(SECTOR_TO_BLOCK(inode->i_blocks) + 1); - if (F2FS_I(inode)->extent_tree) - set_raw_extent(&F2FS_I(inode)->extent_tree->largest, - &ri->i_ext); - else + if (et) { + read_lock(&et->lock); + set_raw_extent(&et->largest, &ri->i_ext); + read_unlock(&et->lock); + } else { memset(&ri->i_ext, 0, sizeof(ri->i_ext)); + } set_raw_inline(inode, ri); ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec); @@ -287,6 +381,20 @@ int update_inode(struct inode *inode, struct page *node_page) ri->i_generation = cpu_to_le32(inode->i_generation); ri->i_dir_level = F2FS_I(inode)->i_dir_level; + if (f2fs_has_extra_attr(inode)) { + ri->i_extra_isize = cpu_to_le16(F2FS_I(inode)->i_extra_isize); + + if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)->sb) && + F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, + i_projid)) { + projid_t i_projid; + + i_projid = from_kprojid(&init_user_ns, + F2FS_I(inode)->i_projid); + ri->i_projid = cpu_to_le32(i_projid); + } + } + __set_inode_rdev(inode, ri); set_cold_node(inode, node_page); @@ -312,7 +420,6 @@ retry: } else if (err != -ENOENT) { f2fs_stop_checkpoint(sbi, false); } - f2fs_inode_synced(inode); return 0; } ret = update_inode(inode, node_page); @@ -335,7 +442,8 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) * We need to balance fs here to prevent from producing dirty node pages * during the urgent cleaning time when runing out of free sections. */ - if (update_inode_page(inode)) + update_inode_page(inode); + if (wbc && wbc->nr_to_write) f2fs_balance_fs(sbi, true); return 0; } @@ -368,10 +476,10 @@ void f2fs_evict_inode(struct inode *inode) if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; -#ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_EVICT_INODE)) - goto no_delete; -#endif + dquot_initialize(inode); + + remove_ino_entry(sbi, inode->i_ino, APPEND_INO); + remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); sb_start_intwrite(inode->i_sb); set_inode_flag(inode, FI_NO_ALLOC); @@ -380,10 +488,18 @@ retry: if (F2FS_HAS_BLOCKS(inode)) err = f2fs_truncate(inode); +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_EVICT_INODE)) { + f2fs_show_injection_info(FAULT_EVICT_INODE); + err = -EIO; + } +#endif if (!err) { f2fs_lock_op(sbi); err = remove_inode_page(inode); f2fs_unlock_op(sbi); + if (err == -ENOENT) + err = 0; } /* give more chances, if ENOMEM case */ @@ -394,25 +510,37 @@ retry: if (err) update_inode_page(inode); + dquot_free_inode(inode); sb_end_intwrite(inode->i_sb); no_delete: + dquot_drop(inode); + stat_dec_inline_xattr(inode); stat_dec_inline_dir(inode); stat_dec_inline_inode(inode); - invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); + if (!is_set_ckpt_flags(sbi, CP_ERROR_FLAG)) + f2fs_bug_on(sbi, is_inode_flag_set(inode, FI_DIRTY_INODE)); + + /* ino == 0, if f2fs_new_inode() was failed t*/ + if (inode->i_ino) + invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, + inode->i_ino); if (xnid) invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); - if (is_inode_flag_set(inode, FI_APPEND_WRITE)) - add_ino_entry(sbi, inode->i_ino, APPEND_INO); - if (is_inode_flag_set(inode, FI_UPDATE_WRITE)) - add_ino_entry(sbi, inode->i_ino, UPDATE_INO); + if (inode->i_nlink) { + if (is_inode_flag_set(inode, FI_APPEND_WRITE)) + add_ino_entry(sbi, inode->i_ino, APPEND_INO); + if (is_inode_flag_set(inode, FI_UPDATE_WRITE)) + add_ino_entry(sbi, inode->i_ino, UPDATE_INO); + } if (is_inode_flag_set(inode, FI_FREE_NID)) { alloc_nid_failed(sbi, inode->i_ino); clear_inode_flag(inode, FI_FREE_NID); + } else { + f2fs_bug_on(sbi, err && + !exist_written_data(sbi, inode->i_ino, ORPHAN_INO)); } - f2fs_bug_on(sbi, err && - !exist_written_data(sbi, inode->i_ino, ORPHAN_INO)); out_clear: fscrypt_put_encryption_info(inode, NULL); clear_inode(inode); @@ -424,6 +552,19 @@ void handle_failed_inode(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct node_info ni; + /* + * clear nlink of inode in order to release resource of inode + * immediately. + */ + clear_nlink(inode); + + /* + * we must call this to avoid inode being remained as dirty, resulting + * in a panic when flushing dirty inodes in gdirty_list. + */ + update_inode_page(inode); + f2fs_inode_synced(inode); + /* don't make bad inode, since it becomes a regular file. */ unlock_new_inode(inode); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 08d7dc99042e..87440ab5e51b 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" @@ -42,6 +43,8 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) } f2fs_unlock_op(sbi); + nid_free = true; + inode_init_owner(inode, dir, mode); inode->i_ino = ino; @@ -52,16 +55,35 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) err = insert_inode_locked(inode); if (err) { err = -EINVAL; - nid_free = true; goto fail; } + if (f2fs_sb_has_project_quota(sbi->sb) && + (F2FS_I(dir)->i_flags & FS_PROJINHERIT_FL)) + F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid; + else + F2FS_I(inode)->i_projid = make_kprojid(&init_user_ns, + F2FS_DEF_PROJID); + + err = dquot_initialize(inode); + if (err) + goto fail_drop; + + err = dquot_alloc_inode(inode); + if (err) + goto fail_drop; + /* If the directory encrypted, then we should encrypt the inode. */ if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) f2fs_set_encrypted_inode(inode); set_inode_flag(inode, FI_NEW_INODE); + if (f2fs_sb_has_extra_attr(sbi->sb)) { + set_inode_flag(inode, FI_EXTRA_ATTR); + F2FS_I(inode)->i_extra_isize = F2FS_TOTAL_EXTRA_ATTR_SIZE; + } + if (test_opt(sbi, INLINE_XATTR)) set_inode_flag(inode, FI_INLINE_XATTR); if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) @@ -75,6 +97,15 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) stat_inc_inline_inode(inode); stat_inc_inline_dir(inode); + F2FS_I(inode)->i_flags = + f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED); + + if (S_ISDIR(inode->i_mode)) + F2FS_I(inode)->i_flags |= FS_INDEX_FL; + + if (F2FS_I(inode)->i_flags & FS_PROJINHERIT_FL) + set_inode_flag(inode, FI_PROJ_INHERIT); + trace_f2fs_new_inode(inode, 0); return inode; @@ -85,6 +116,16 @@ fail: set_inode_flag(inode, FI_FREE_NID); iput(inode); return ERR_PTR(err); +fail_drop: + trace_f2fs_new_inode(inode, err); + dquot_drop(inode); + inode->i_flags |= S_NOQUOTA; + if (nid_free) + set_inode_flag(inode, FI_FREE_NID); + clear_nlink(inode); + unlock_new_inode(inode); + iput(inode); + return ERR_PTR(err); } static int is_multimedia_file(const unsigned char *s, const char *sub) @@ -136,6 +177,10 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, nid_t ino = 0; int err; + err = dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -148,8 +193,6 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, inode->i_mapping->a_ops = &f2fs_dblock_aops; ino = inode->i_ino; - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -163,6 +206,8 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); return 0; out: handle_failed_inode(inode); @@ -180,6 +225,15 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, !fscrypt_has_permitted_context(dir, inode)) return -EPERM; + if (is_inode_flag_set(dir, FI_PROJ_INHERIT) && + (!projid_eq(F2FS_I(dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + + err = dquot_initialize(dir); + if (err) + return err; + f2fs_balance_fs(sbi, true); inode->i_ctime = current_time(inode); @@ -233,6 +287,10 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) return 0; } + err = dquot_initialize(dir); + if (err) + return err; + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); @@ -321,12 +379,13 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, if (err) goto err_out; } - if (!IS_ERR(inode) && f2fs_encrypted_inode(dir) && - (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && - !fscrypt_has_permitted_context(dir, inode)) { - bool nokey = f2fs_encrypted_inode(inode) && - !fscrypt_has_encryption_key(inode); - err = nokey ? -ENOKEY : -EPERM; + if (f2fs_encrypted_inode(dir) && + (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && + !fscrypt_has_permitted_context(dir, inode)) { + f2fs_msg(inode->i_sb, KERN_WARNING, + "Inconsistent encryption contexts: %lu/%lu", + dir->i_ino, inode->i_ino); + err = -EPERM; goto err_out; } return d_splice_alias(inode, dentry); @@ -346,6 +405,10 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) trace_f2fs_unlink_enter(dir, dentry); + err = dquot_initialize(dir); + if (err) + return err; + de = f2fs_find_entry(dir, &dentry->d_name, &page); if (!de) { if (IS_ERR(page)) @@ -403,7 +466,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, return err; if (!fscrypt_has_encryption_key(dir)) - return -EPERM; + return -ENOKEY; disk_link.len = (fscrypt_fname_encrypted_size(dir, len) + sizeof(struct fscrypt_symlink_data)); @@ -412,6 +475,10 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, if (disk_link.len > dir->i_sb->s_blocksize) return -ENAMETOOLONG; + err = dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -423,8 +490,6 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, inode_nohighmem(inode); inode->i_mapping->a_ops = &f2fs_dblock_aops; - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -447,7 +512,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, goto err_out; if (!fscrypt_has_encryption_key(inode)) { - err = -EPERM; + err = -ENOKEY; goto err_out; } @@ -487,6 +552,8 @@ err_out: } kfree(sd); + + f2fs_balance_fs(sbi, true); return err; out: handle_failed_inode(inode); @@ -499,6 +566,10 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct inode *inode; int err; + err = dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, S_IFDIR | mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -508,8 +579,6 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_mapping->a_ops = &f2fs_dblock_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO); - f2fs_balance_fs(sbi, true); - set_inode_flag(inode, FI_INC_LINK); f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); @@ -524,6 +593,8 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); return 0; out_fail: @@ -547,6 +618,10 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, struct inode *inode; int err = 0; + err = dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -554,8 +629,6 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &f2fs_special_inode_operations; - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -569,6 +642,8 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); return 0; out: handle_failed_inode(inode); @@ -582,6 +657,10 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, struct inode *inode; int err; + err = dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -595,8 +674,6 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, inode->i_mapping->a_ops = &f2fs_dblock_aops; } - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); err = acquire_orphan_inode(sbi); if (err) @@ -622,6 +699,8 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, /* link_count was changed by d_tmpfile as well. */ f2fs_unlock_op(sbi); unlock_new_inode(inode); + + f2fs_balance_fs(sbi, true); return 0; release_out: @@ -675,6 +754,19 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; } + if (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + (!projid_eq(F2FS_I(new_dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + + err = dquot_initialize(old_dir); + if (err) + goto out; + + err = dquot_initialize(new_dir); + if (err) + goto out; + old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) { if (IS_ERR(old_page)) @@ -720,13 +812,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (err) goto put_out_dir; - err = update_dent_inode(old_inode, new_inode, - &new_dentry->d_name); - if (err) { - release_orphan_inode(sbi); - goto put_out_dir; - } - f2fs_set_link(new_dir, new_entry, new_page, old_inode); new_inode->i_ctime = current_time(new_inode); @@ -778,13 +863,14 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } down_write(&F2FS_I(old_inode)->i_sem); - file_lost_pino(old_inode); - if (new_inode && file_enc_name(new_inode)) - file_set_enc_name(old_inode); + if (!old_dir_entry || whiteout) + file_lost_pino(old_inode); + else + F2FS_I(old_inode)->i_pino = new_dir->i_ino; up_write(&F2FS_I(old_inode)->i_sem); old_inode->i_ctime = current_time(old_inode); - f2fs_mark_inode_dirty_sync(old_inode); + f2fs_mark_inode_dirty_sync(old_inode, false); f2fs_delete_entry(old_entry, old_page, old_dir, NULL); @@ -861,6 +947,22 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, !fscrypt_has_permitted_context(old_dir, new_inode))) return -EPERM; + if ((is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + !projid_eq(F2FS_I(new_dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid)) || + (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + !projid_eq(F2FS_I(old_dir)->i_projid, + F2FS_I(new_dentry->d_inode)->i_projid))) + return -EXDEV; + + err = dquot_initialize(old_dir); + if (err) + goto out; + + err = dquot_initialize(new_dir); + if (err) + goto out; + old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) { if (IS_ERR(old_page)) @@ -908,8 +1010,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, old_nlink = old_dir_entry ? -1 : 1; new_nlink = -old_nlink; err = -EMLINK; - if ((old_nlink > 0 && old_inode->i_nlink >= F2FS_LINK_MAX) || - (new_nlink > 0 && new_inode->i_nlink >= F2FS_LINK_MAX)) + if ((old_nlink > 0 && old_dir->i_nlink >= F2FS_LINK_MAX) || + (new_nlink > 0 && new_dir->i_nlink >= F2FS_LINK_MAX)) goto out_new_dir; } @@ -917,18 +1019,6 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_lock_op(sbi); - err = update_dent_inode(old_inode, new_inode, &new_dentry->d_name); - if (err) - goto out_unlock; - if (file_enc_name(new_inode)) - file_set_enc_name(old_inode); - - err = update_dent_inode(new_inode, old_inode, &old_dentry->d_name); - if (err) - goto out_undo; - if (file_enc_name(old_inode)) - file_set_enc_name(new_inode); - /* update ".." directory entry info of old dentry */ if (old_dir_entry) f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); @@ -950,7 +1040,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_i_links_write(old_dir, old_nlink > 0); up_write(&F2FS_I(old_dir)->i_sem); } - f2fs_mark_inode_dirty_sync(old_dir); + f2fs_mark_inode_dirty_sync(old_dir, false); /* update directory entry info of new dir inode */ f2fs_set_link(new_dir, new_entry, new_page, old_inode); @@ -965,21 +1055,13 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_i_links_write(new_dir, new_nlink > 0); up_write(&F2FS_I(new_dir)->i_sem); } - f2fs_mark_inode_dirty_sync(new_dir); + f2fs_mark_inode_dirty_sync(new_dir, false); f2fs_unlock_op(sbi); if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) f2fs_sync_fs(sbi->sb, 1); return 0; -out_undo: - /* - * Still we may fail to recover name info of f2fs_inode here - * Drop it, once its name is set as encrypted - */ - update_dent_inode(old_inode, old_inode, &old_dentry->d_name); -out_unlock: - f2fs_unlock_op(sbi); out_new_dir: if (new_dir_entry) { f2fs_dentry_kunmap(new_inode, new_dir_page); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 01177ecdeab8..fca87835a1da 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -19,10 +19,11 @@ #include "f2fs.h" #include "node.h" #include "segment.h" +#include "xattr.h" #include "trace.h" #include -#define on_build_free_nids(nmi) mutex_is_locked(&nm_i->build_lock) +#define on_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock) static struct kmem_cache *nat_entry_slab; static struct kmem_cache *free_nid_slab; @@ -45,8 +46,8 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) * give 25%, 25%, 50%, 50%, 50% memory for each components respectively */ if (type == FREE_NIDS) { - mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >> - PAGE_SHIFT; + mem_size = (nm_i->nid_cnt[FREE_NID_LIST] * + sizeof(struct free_nid)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == NAT_ENTRIES) { mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> @@ -63,8 +64,9 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) int i; for (i = 0; i <= UPDATE_INO; i++) - mem_size += (sbi->im[i].ino_num * - sizeof(struct ino_entry)) >> PAGE_SHIFT; + mem_size += sbi->im[i].ino_num * + sizeof(struct ino_entry); + mem_size >>= PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else if (type == EXTENT_CACHE) { mem_size = (atomic_read(&sbi->total_ext_tree) * @@ -157,9 +159,6 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid); struct nat_entry_set *head; - if (get_nat_flag(ne, IS_DIRTY)) - return; - head = radix_tree_lookup(&nm_i->nat_set_root, set); if (!head) { head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_NOFS); @@ -170,25 +169,27 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, head->entry_cnt = 0; f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head); } - list_move_tail(&ne->list, &head->entry_list); + + if (get_nat_flag(ne, IS_DIRTY)) + goto refresh_list; + nm_i->dirty_nat_cnt++; head->entry_cnt++; set_nat_flag(ne, IS_DIRTY, true); +refresh_list: + if (nat_get_blkaddr(ne) == NEW_ADDR) + list_del_init(&ne->list); + else + list_move_tail(&ne->list, &head->entry_list); } static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i, - struct nat_entry *ne) + struct nat_entry_set *set, struct nat_entry *ne) { - nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid); - struct nat_entry_set *head; - - head = radix_tree_lookup(&nm_i->nat_set_root, set); - if (head) { - list_move_tail(&ne->list, &nm_i->nat_entries); - set_nat_flag(ne, IS_DIRTY, false); - head->entry_cnt--; - nm_i->dirty_nat_cnt--; - } + list_move_tail(&ne->list, &nm_i->nat_entries); + set_nat_flag(ne, IS_DIRTY, false); + set->entry_cnt--; + nm_i->dirty_nat_cnt--; } static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, @@ -245,12 +246,24 @@ bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino) return need_update; } -static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) +static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid, + bool no_fail) { struct nat_entry *new; - new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_NOFS); - f2fs_radix_tree_insert(&nm_i->nat_root, nid, new); + if (no_fail) { + new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_NOFS); + f2fs_radix_tree_insert(&nm_i->nat_root, nid, new); + } else { + new = kmem_cache_alloc(nat_entry_slab, GFP_NOFS); + if (!new) + return NULL; + if (radix_tree_insert(&nm_i->nat_root, nid, new)) { + kmem_cache_free(nat_entry_slab, new); + return NULL; + } + } + memset(new, 0, sizeof(struct nat_entry)); nat_set_nid(new, nid); nat_reset_flag(new); @@ -267,11 +280,13 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, e = __lookup_nat_cache(nm_i, nid); if (!e) { - e = grab_nat_entry(nm_i, nid); - node_info_from_raw_nat(&e->ni, ne); + e = grab_nat_entry(nm_i, nid, false); + if (e) + node_info_from_raw_nat(&e->ni, ne); } else { - f2fs_bug_on(sbi, nat_get_ino(e) != ne->ino || - nat_get_blkaddr(e) != ne->block_addr || + f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) || + nat_get_blkaddr(e) != + le32_to_cpu(ne->block_addr) || nat_get_version(e) != ne->version); } } @@ -285,7 +300,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, ni->nid); if (!e) { - e = grab_nat_entry(nm_i, ni->nid); + e = grab_nat_entry(nm_i, ni->nid, true); copy_node_info(&e->ni, ni); f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR); } else if (new_blkaddr == NEW_ADDR) { @@ -367,6 +382,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) struct page *page = NULL; struct f2fs_nat_entry ne; struct nat_entry *e; + pgoff_t index; int i; ni->nid = nid; @@ -392,17 +408,21 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) node_info_from_raw_nat(ni, &ne); } up_read(&curseg->journal_rwsem); - if (i >= 0) + if (i >= 0) { + up_read(&nm_i->nat_tree_lock); goto cache; + } /* Fill node_info from nat page */ - page = get_current_nat_page(sbi, start_nid); + index = current_nat_addr(sbi, nid); + up_read(&nm_i->nat_tree_lock); + + page = get_meta_page(sbi, index); nat_blk = (struct f2fs_nat_block *)page_address(page); ne = nat_blk->entries[nid - start_nid]; node_info_from_raw_nat(ni, &ne); f2fs_put_page(page, 1); cache: - up_read(&nm_i->nat_tree_lock); /* cache nat entry */ down_write(&nm_i->nat_tree_lock); cache_nat_entry(sbi, nid, &ne); @@ -535,7 +555,7 @@ static int get_node_path(struct inode *inode, long block, level = 3; goto got; } else { - BUG(); + return -E2BIG; } got: return level; @@ -559,6 +579,8 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) int err = 0; level = get_node_path(dn->inode, index, offset, noffset); + if (level < 0) + return level; nids[0] = dn->inode->i_ino; npage[0] = dn->inode_page; @@ -594,7 +616,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) } dn->nid = nids[i]; - npage[i] = new_node_page(dn, noffset[i], NULL); + npage[i] = new_node_page(dn, noffset[i]); if (IS_ERR(npage[i])) { alloc_nid_failed(sbi, nids[i]); err = PTR_ERR(npage[i]); @@ -635,7 +657,8 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) dn->nid = nids[level]; dn->ofs_in_node = offset[level]; dn->node_page = npage[level]; - dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); return 0; release_pages: @@ -659,15 +682,11 @@ static void truncate_node(struct dnode_of_data *dn) struct node_info ni; get_node_info(sbi, dn->nid, &ni); - if (dn->inode->i_blocks == 0) { - f2fs_bug_on(sbi, ni.blk_addr != NULL_ADDR); - goto invalidate; - } f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR); /* Deallocate node address */ invalidate_blocks(sbi, ni.blk_addr); - dec_valid_node_count(sbi, dn->inode); + dec_valid_node_count(sbi, dn->inode, dn->nid == dn->inode->i_ino); set_node_addr(sbi, &ni, NULL_ADDR, false); if (dn->nid == dn->inode->i_ino) { @@ -675,7 +694,7 @@ static void truncate_node(struct dnode_of_data *dn) dec_valid_inode_count(sbi); f2fs_inode_synced(dn->inode); } -invalidate: + clear_node_page_dirty(dn->node_page); set_sbi_flag(sbi, SBI_IS_DIRTY); @@ -861,6 +880,8 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from) trace_f2fs_truncate_inode_blocks_enter(inode, from); level = get_node_path(inode, from, offset, noffset); + if (level < 0) + return level; page = get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) { @@ -957,9 +978,6 @@ int truncate_xattr_node(struct inode *inode, struct page *page) f2fs_i_xnid_write(inode, 0); - /* need to do checkpoint during fsync */ - F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi)); - set_new_dnode(&dn, inode, page, npage, nid); if (page) @@ -995,7 +1013,7 @@ int remove_inode_page(struct inode *inode) /* 0 is possible, after f2fs_new_inode() has failed */ f2fs_bug_on(F2FS_I_SB(inode), - inode->i_blocks != 0 && inode->i_blocks != 1); + inode->i_blocks != 0 && inode->i_blocks != 8); /* will put inode & node pages */ truncate_node(&dn); @@ -1010,14 +1028,13 @@ struct page *new_inode_page(struct inode *inode) set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); /* caller should f2fs_put_page(page, 1); */ - return new_node_page(&dn, 0, NULL); + return new_node_page(&dn, 0); } -struct page *new_node_page(struct dnode_of_data *dn, - unsigned int ofs, struct page *ipage) +struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); - struct node_info old_ni, new_ni; + struct node_info new_ni; struct page *page; int err; @@ -1028,17 +1045,18 @@ struct page *new_node_page(struct dnode_of_data *dn, if (!page) return ERR_PTR(-ENOMEM); - if (unlikely(!inc_valid_node_count(sbi, dn->inode))) { - err = -ENOSPC; + if (unlikely((err = inc_valid_node_count(sbi, dn->inode, !ofs)))) goto fail; - } - get_node_info(sbi, dn->nid, &old_ni); - - /* Reinitialize old_ni with new node page */ - f2fs_bug_on(sbi, old_ni.blk_addr != NULL_ADDR); - new_ni = old_ni; +#ifdef CONFIG_F2FS_CHECK_FS + get_node_info(sbi, dn->nid, &new_ni); + f2fs_bug_on(sbi, new_ni.blk_addr != NULL_ADDR); +#endif + new_ni.nid = dn->nid; new_ni.ino = dn->inode->i_ino; + new_ni.blk_addr = NULL_ADDR; + new_ni.flag = 0; + new_ni.version = 0; set_node_addr(sbi, &new_ni, NEW_ADDR, false); f2fs_wait_on_page_writeback(page, NODE, true); @@ -1134,11 +1152,12 @@ repeat: if (!page) return ERR_PTR(-ENOMEM); - err = read_node_page(page, READ_SYNC); + err = read_node_page(page, 0); if (err < 0) { f2fs_put_page(page, 1); return ERR_PTR(err); } else if (err == LOCKED_PAGE) { + err = 0; goto page_hit; } @@ -1152,15 +1171,27 @@ repeat: goto repeat; } - if (unlikely(!PageUptodate(page))) + if (unlikely(!PageUptodate(page))) { + err = -EIO; goto out_err; + } + + if (!f2fs_inode_chksum_verify(sbi, page)) { + err = -EBADMSG; + goto out_err; + } page_hit: if(unlikely(nid != nid_of_node(page))) { - f2fs_bug_on(sbi, 1); - ClearPageUptodate(page); + f2fs_msg(sbi->sb, KERN_WARNING, "inconsistent node block, " + "nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", + nid, nid_of_node(page), ino_of_node(page), + ofs_of_node(page), cpver_of_node(page), + next_blkaddr_of_node(page)); + err = -EINVAL; out_err: + ClearPageUptodate(page); f2fs_put_page(page, 1); - return ERR_PTR(-EIO); + return ERR_PTR(err); } return page; } @@ -1204,6 +1235,7 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) ret = f2fs_write_inline_data(inode, page); inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); if (ret) set_page_dirty(page); page_out: @@ -1303,16 +1335,103 @@ continue_unlock: return last_page; } +static int __write_node_page(struct page *page, bool atomic, bool *submitted, + struct writeback_control *wbc, bool do_balance, + enum iostat_type io_type) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(page); + nid_t nid; + struct node_info ni; + struct f2fs_io_info fio = { + .sbi = sbi, + .type = NODE, + .op = REQ_OP_WRITE, + .op_flags = wbc_to_write_flags(wbc), + .page = page, + .encrypted_page = NULL, + .submitted = false, + .io_type = io_type, + }; + + trace_f2fs_writepage(page, NODE); + + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto redirty_out; + if (unlikely(f2fs_cp_error(sbi))) + goto redirty_out; + + /* get old block addr of this node page */ + nid = nid_of_node(page); + f2fs_bug_on(sbi, page->index != nid); + + if (wbc->for_reclaim) { + if (!down_read_trylock(&sbi->node_write)) + goto redirty_out; + } else { + down_read(&sbi->node_write); + } + + get_node_info(sbi, nid, &ni); + + /* This page is already truncated */ + if (unlikely(ni.blk_addr == NULL_ADDR)) { + ClearPageUptodate(page); + dec_page_count(sbi, F2FS_DIRTY_NODES); + up_read(&sbi->node_write); + unlock_page(page); + return 0; + } + + if (atomic && !test_opt(sbi, NOBARRIER)) + fio.op_flags |= REQ_PREFLUSH | REQ_FUA; + + set_page_writeback(page); + fio.old_blkaddr = ni.blk_addr; + write_node_page(nid, &fio); + set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); + dec_page_count(sbi, F2FS_DIRTY_NODES); + up_read(&sbi->node_write); + + if (wbc->for_reclaim) { + f2fs_submit_merged_write_cond(sbi, page->mapping->host, 0, + page->index, NODE); + submitted = NULL; + } + + unlock_page(page); + + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_submit_merged_write(sbi, NODE); + submitted = NULL; + } + if (submitted) + *submitted = fio.submitted; + + if (do_balance) + f2fs_balance_fs(sbi, false); + return 0; + +redirty_out: + redirty_page_for_writepage(wbc, page); + return AOP_WRITEPAGE_ACTIVATE; +} + +static int f2fs_write_node_page(struct page *page, + struct writeback_control *wbc) +{ + return __write_node_page(page, false, NULL, wbc, false, FS_NODE_IO); +} + int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic) { pgoff_t index, end; + pgoff_t last_idx = ULONG_MAX; struct pagevec pvec; int ret = 0; struct page *last_page = NULL; bool marked = false; nid_t ino = inode->i_ino; - int nwritten = 0; if (atomic) { last_page = last_fsync_dnode(sbi, ino); @@ -1334,11 +1453,13 @@ retry: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + bool submitted = false; if (unlikely(f2fs_cp_error(sbi))) { f2fs_put_page(last_page, 0); pagevec_release(&pvec); - return -EIO; + ret = -EIO; + goto out; } if (!IS_DNODE(page) || !is_cold_node(page)) @@ -1364,6 +1485,9 @@ continue_unlock: f2fs_wait_on_page_writeback(page, NODE, true); BUG_ON(PageWriteback(page)); + set_fsync_mark(page, 0); + set_dentry_mark(page, 0); + if (!atomic || page == last_page) { set_fsync_mark(page, 1); if (IS_INODE(page)) { @@ -1381,13 +1505,16 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - ret = NODE_MAPPING(sbi)->a_ops->writepage(page, wbc); + ret = __write_node_page(page, atomic && + page == last_page, + &submitted, wbc, true, + FS_NODE_IO); if (ret) { unlock_page(page); f2fs_put_page(last_page, 0); break; - } else { - nwritten++; + } else if (submitted) { + last_idx = page->index; } if (page == last_page) { @@ -1407,17 +1534,19 @@ continue_unlock: "Retry to write fsync mark: ino=%u, idx=%lx", ino, last_page->index); lock_page(last_page); + f2fs_wait_on_page_writeback(last_page, NODE, true); set_page_dirty(last_page); unlock_page(last_page); goto retry; } - - if (nwritten) - f2fs_submit_merged_bio_cond(sbi, NULL, NULL, ino, NODE, WRITE); +out: + if (last_idx != ULONG_MAX) + f2fs_submit_merged_write_cond(sbi, NULL, ino, last_idx, NODE); return ret ? -EIO: 0; } -int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc) +int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, + bool do_balance, enum iostat_type io_type) { pgoff_t index, end; struct pagevec pvec; @@ -1441,6 +1570,7 @@ next_step: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + bool submitted = false; if (unlikely(f2fs_cp_error(sbi))) { pagevec_release(&pvec); @@ -1494,9 +1624,11 @@ continue_unlock: set_fsync_mark(page, 0); set_dentry_mark(page, 0); - if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc)) + ret = __write_node_page(page, false, &submitted, + wbc, do_balance, io_type); + if (ret) unlock_page(page); - else + else if (submitted) nwritten++; if (--wbc->nr_to_write == 0) @@ -1517,7 +1649,7 @@ continue_unlock: } out: if (nwritten) - f2fs_submit_merged_bio(sbi, NODE, WRITE); + f2fs_submit_merged_write(sbi, NODE); return ret; } @@ -1560,72 +1692,6 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) return ret; } -static int f2fs_write_node_page(struct page *page, - struct writeback_control *wbc) -{ - struct f2fs_sb_info *sbi = F2FS_P_SB(page); - nid_t nid; - struct node_info ni; - struct f2fs_io_info fio = { - .sbi = sbi, - .type = NODE, - .op = REQ_OP_WRITE, - .op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0, - .page = page, - .encrypted_page = NULL, - }; - - trace_f2fs_writepage(page, NODE); - - if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) - goto redirty_out; - if (unlikely(f2fs_cp_error(sbi))) - goto redirty_out; - - /* get old block addr of this node page */ - nid = nid_of_node(page); - f2fs_bug_on(sbi, page->index != nid); - - if (wbc->for_reclaim) { - if (!down_read_trylock(&sbi->node_write)) - goto redirty_out; - } else { - down_read(&sbi->node_write); - } - - get_node_info(sbi, nid, &ni); - - /* This page is already truncated */ - if (unlikely(ni.blk_addr == NULL_ADDR)) { - ClearPageUptodate(page); - dec_page_count(sbi, F2FS_DIRTY_NODES); - up_read(&sbi->node_write); - unlock_page(page); - return 0; - } - - set_page_writeback(page); - fio.old_blkaddr = ni.blk_addr; - write_node_page(nid, &fio); - set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); - dec_page_count(sbi, F2FS_DIRTY_NODES); - up_read(&sbi->node_write); - - if (wbc->for_reclaim) - f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, NODE, WRITE); - - unlock_page(page); - - if (unlikely(f2fs_cp_error(sbi))) - f2fs_submit_merged_bio(sbi, NODE, WRITE); - - return 0; - -redirty_out: - redirty_page_for_writepage(wbc, page); - return AOP_WRITEPAGE_ACTIVATE; -} - static int f2fs_write_node_pages(struct address_space *mapping, struct writeback_control *wbc) { @@ -1633,6 +1699,9 @@ static int f2fs_write_node_pages(struct address_space *mapping, struct blk_plug plug; long diff; + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto skip_write; + /* balancing f2fs's metadata in background */ f2fs_balance_fs_bg(sbi); @@ -1645,7 +1714,7 @@ static int f2fs_write_node_pages(struct address_space *mapping, diff = nr_pages_to_write(sbi, NODE, wbc); wbc->sync_mode = WB_SYNC_NONE; blk_start_plug(&plug); - sync_node_pages(sbi, wbc); + sync_node_pages(sbi, wbc, true, FS_NODE_IO); blk_finish_plug(&plug); wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return 0; @@ -1692,100 +1761,220 @@ static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, return radix_tree_lookup(&nm_i->free_nid_root, n); } -static void __del_from_free_nid_list(struct f2fs_nm_info *nm_i, - struct free_nid *i) -{ - list_del(&i->list); - radix_tree_delete(&nm_i->free_nid_root, i->nid); -} - -static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) +static int __insert_nid_to_list(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_list list, bool new) { struct f2fs_nm_info *nm_i = NM_I(sbi); - struct free_nid *i; - struct nat_entry *ne; - if (!available_free_memory(sbi, FREE_NIDS)) - return -1; + if (new) { + int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i); + if (err) + return err; + } + + f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW : + i->state != NID_ALLOC); + nm_i->nid_cnt[list]++; + list_add_tail(&i->list, &nm_i->nid_list[list]); + return 0; +} + +static void __remove_nid_from_list(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_list list, bool reuse) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW : + i->state != NID_ALLOC); + nm_i->nid_cnt[list]--; + list_del(&i->list); + if (!reuse) + radix_tree_delete(&nm_i->free_nid_root, i->nid); +} + +/* return if the nid is recognized as free */ +static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct free_nid *i, *e; + struct nat_entry *ne; + int err = -EINVAL; + bool ret = false; /* 0 nid should not be used */ if (unlikely(nid == 0)) - return 0; - - if (build) { - /* do not add allocated nids */ - ne = __lookup_nat_cache(nm_i, nid); - if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || - nat_get_blkaddr(ne) != NULL_ADDR)) - return 0; - } + return false; i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS); i->nid = nid; i->state = NID_NEW; - if (radix_tree_preload(GFP_NOFS)) { - kmem_cache_free(free_nid_slab, i); - return 0; - } + if (radix_tree_preload(GFP_NOFS)) + goto err; - spin_lock(&nm_i->free_nid_list_lock); - if (radix_tree_insert(&nm_i->free_nid_root, i->nid, i)) { - spin_unlock(&nm_i->free_nid_list_lock); - radix_tree_preload_end(); - kmem_cache_free(free_nid_slab, i); - return 0; + spin_lock(&nm_i->nid_list_lock); + + if (build) { + /* + * Thread A Thread B + * - f2fs_create + * - f2fs_new_inode + * - alloc_nid + * - __insert_nid_to_list(ALLOC_NID_LIST) + * - f2fs_balance_fs_bg + * - build_free_nids + * - __build_free_nids + * - scan_nat_page + * - add_free_nid + * - __lookup_nat_cache + * - f2fs_add_link + * - init_inode_metadata + * - new_inode_page + * - new_node_page + * - set_node_addr + * - alloc_nid_done + * - __remove_nid_from_list(ALLOC_NID_LIST) + * - __insert_nid_to_list(FREE_NID_LIST) + */ + ne = __lookup_nat_cache(nm_i, nid); + if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || + nat_get_blkaddr(ne) != NULL_ADDR)) + goto err_out; + + e = __lookup_free_nid_list(nm_i, nid); + if (e) { + if (e->state == NID_NEW) + ret = true; + goto err_out; + } } - list_add_tail(&i->list, &nm_i->free_nid_list); - nm_i->fcnt++; - spin_unlock(&nm_i->free_nid_list_lock); + ret = true; + err = __insert_nid_to_list(sbi, i, FREE_NID_LIST, true); +err_out: + spin_unlock(&nm_i->nid_list_lock); radix_tree_preload_end(); - return 1; +err: + if (err) + kmem_cache_free(free_nid_slab, i); + return ret; } -static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid) +static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) { + struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; bool need_free = false; - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); if (i && i->state == NID_NEW) { - __del_from_free_nid_list(nm_i, i); - nm_i->fcnt--; + __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); need_free = true; } - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); if (need_free) kmem_cache_free(free_nid_slab, i); } +static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, + bool set, bool build) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid); + unsigned int nid_ofs = nid - START_NID(nid); + + if (!test_bit_le(nat_ofs, nm_i->nat_block_bitmap)) + return; + + if (set) + __set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + else + __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + + if (set) + nm_i->free_nid_count[nat_ofs]++; + else if (!build) + nm_i->free_nid_count[nat_ofs]--; +} + static void scan_nat_page(struct f2fs_sb_info *sbi, struct page *nat_page, nid_t start_nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct f2fs_nat_block *nat_blk = page_address(nat_page); block_t blk_addr; + unsigned int nat_ofs = NAT_BLOCK_OFFSET(start_nid); int i; + if (test_bit_le(nat_ofs, nm_i->nat_block_bitmap)) + return; + + __set_bit_le(nat_ofs, nm_i->nat_block_bitmap); + i = start_nid % NAT_ENTRY_PER_BLOCK; for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) { + bool freed = false; if (unlikely(start_nid >= nm_i->max_nid)) break; blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); f2fs_bug_on(sbi, blk_addr == NEW_ADDR); - if (blk_addr == NULL_ADDR) { - if (add_free_nid(sbi, start_nid, true) < 0) - break; - } + if (blk_addr == NULL_ADDR) + freed = add_free_nid(sbi, start_nid, true); + spin_lock(&NM_I(sbi)->nid_list_lock); + update_free_nid_bitmap(sbi, start_nid, freed, true); + spin_unlock(&NM_I(sbi)->nid_list_lock); } } -void build_free_nids(struct f2fs_sb_info *sbi) +static void scan_free_nid_bits(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); + struct f2fs_journal *journal = curseg->journal; + unsigned int i, idx; + + down_read(&nm_i->nat_tree_lock); + + for (i = 0; i < nm_i->nat_blocks; i++) { + if (!test_bit_le(i, nm_i->nat_block_bitmap)) + continue; + if (!nm_i->free_nid_count[i]) + continue; + for (idx = 0; idx < NAT_ENTRY_PER_BLOCK; idx++) { + nid_t nid; + + if (!test_bit_le(idx, nm_i->free_nid_bitmap[i])) + continue; + + nid = i * NAT_ENTRY_PER_BLOCK + idx; + add_free_nid(sbi, nid, true); + + if (nm_i->nid_cnt[FREE_NID_LIST] >= MAX_FREE_NIDS) + goto out; + } + } +out: + down_read(&curseg->journal_rwsem); + for (i = 0; i < nats_in_cursum(journal); i++) { + block_t addr; + nid_t nid; + + addr = le32_to_cpu(nat_in_journal(journal, i).block_addr); + nid = le32_to_cpu(nid_in_journal(journal, i)); + if (addr == NULL_ADDR) + add_free_nid(sbi, nid, true); + else + remove_free_nid(sbi, nid); + } + up_read(&curseg->journal_rwsem); + up_read(&nm_i->nat_tree_lock); +} + +static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -1793,10 +1982,24 @@ void build_free_nids(struct f2fs_sb_info *sbi) int i = 0; nid_t nid = nm_i->next_scan_nid; + if (unlikely(nid >= nm_i->max_nid)) + nid = 0; + /* Enough entries */ - if (nm_i->fcnt >= NAT_ENTRY_PER_BLOCK) + if (nm_i->nid_cnt[FREE_NID_LIST] >= NAT_ENTRY_PER_BLOCK) return; + if (!sync && !available_free_memory(sbi, FREE_NIDS)) + return; + + if (!mount) { + /* try to find free nids in free_nid_bitmap */ + scan_free_nid_bits(sbi); + + if (nm_i->nid_cnt[FREE_NID_LIST]) + return; + } + /* readahead nat pages to be scanned */ ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT, true); @@ -1830,7 +2033,7 @@ void build_free_nids(struct f2fs_sb_info *sbi) if (addr == NULL_ADDR) add_free_nid(sbi, nid, true); else - remove_free_nid(nm_i, nid); + remove_free_nid(sbi, nid); } up_read(&curseg->journal_rwsem); up_read(&nm_i->nat_tree_lock); @@ -1839,6 +2042,13 @@ void build_free_nids(struct f2fs_sb_info *sbi) nm_i->ra_nid_pages, META_NAT, false); } +void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) +{ + mutex_lock(&NM_I(sbi)->build_lock); + __build_free_nids(sbi, sync, mount); + mutex_unlock(&NM_I(sbi)->build_lock); +} + /* * If this function returns success, caller can obtain a new nid * from second parameter of this function. @@ -1850,34 +2060,39 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct free_nid *i = NULL; retry: #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_ALLOC_NID)) + if (time_to_inject(sbi, FAULT_ALLOC_NID)) { + f2fs_show_injection_info(FAULT_ALLOC_NID); return false; + } #endif - if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids)) - return false; + spin_lock(&nm_i->nid_list_lock); - spin_lock(&nm_i->free_nid_list_lock); + if (unlikely(nm_i->available_nids == 0)) { + spin_unlock(&nm_i->nid_list_lock); + return false; + } /* We should not use stale free nids created by build_free_nids */ - if (nm_i->fcnt && !on_build_free_nids(nm_i)) { - f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); - list_for_each_entry(i, &nm_i->free_nid_list, list) - if (i->state == NID_NEW) - break; - - f2fs_bug_on(sbi, i->state != NID_NEW); + if (nm_i->nid_cnt[FREE_NID_LIST] && !on_build_free_nids(nm_i)) { + f2fs_bug_on(sbi, list_empty(&nm_i->nid_list[FREE_NID_LIST])); + i = list_first_entry(&nm_i->nid_list[FREE_NID_LIST], + struct free_nid, list); *nid = i->nid; + + __remove_nid_from_list(sbi, i, FREE_NID_LIST, true); i->state = NID_ALLOC; - nm_i->fcnt--; - spin_unlock(&nm_i->free_nid_list_lock); + __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false); + nm_i->available_nids--; + + update_free_nid_bitmap(sbi, *nid, false, false); + + spin_unlock(&nm_i->nid_list_lock); return true; } - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); /* Let's scan nat pages and its caches to get free nids */ - mutex_lock(&nm_i->build_lock); - build_free_nids(sbi); - mutex_unlock(&nm_i->build_lock); + build_free_nids(sbi, true, false); goto retry; } @@ -1889,11 +2104,11 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); - f2fs_bug_on(sbi, !i || i->state != NID_ALLOC); - __del_from_free_nid_list(nm_i, i); - spin_unlock(&nm_i->free_nid_list_lock); + f2fs_bug_on(sbi, !i); + __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false); + spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); } @@ -1910,17 +2125,24 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) if (!nid) return; - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); - f2fs_bug_on(sbi, !i || i->state != NID_ALLOC); + f2fs_bug_on(sbi, !i); + if (!available_free_memory(sbi, FREE_NIDS)) { - __del_from_free_nid_list(nm_i, i); + __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false); need_free = true; } else { + __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, true); i->state = NID_NEW; - nm_i->fcnt++; + __insert_nid_to_list(sbi, i, FREE_NID_LIST, false); } - spin_unlock(&nm_i->free_nid_list_lock); + + nm_i->available_nids++; + + update_free_nid_bitmap(sbi, nid, true, false); + + spin_unlock(&nm_i->nid_list_lock); if (need_free) kmem_cache_free(free_nid_slab, i); @@ -1932,24 +2154,24 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) struct free_nid *i, *next; int nr = nr_shrink; - if (nm_i->fcnt <= MAX_FREE_NIDS) + if (nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS) return 0; if (!mutex_trylock(&nm_i->build_lock)) return 0; - spin_lock(&nm_i->free_nid_list_lock); - list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) { - if (nr_shrink <= 0 || nm_i->fcnt <= MAX_FREE_NIDS) + spin_lock(&nm_i->nid_list_lock); + list_for_each_entry_safe(i, next, &nm_i->nid_list[FREE_NID_LIST], + list) { + if (nr_shrink <= 0 || + nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS) break; - if (i->state == NID_ALLOC) - continue; - __del_from_free_nid_list(nm_i, i); + + __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); kmem_cache_free(free_nid_slab, i); - nm_i->fcnt--; nr_shrink--; } - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); mutex_unlock(&nm_i->build_lock); return nr - nr_shrink; @@ -1982,38 +2204,47 @@ update_inode: f2fs_put_page(ipage, 1); } -void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) +int recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; - nid_t new_xnid = nid_of_node(page); + nid_t new_xnid; + struct dnode_of_data dn; struct node_info ni; + struct page *xpage; - /* 1: invalidate the previous xattr nid */ if (!prev_xnid) goto recover_xnid; - /* Deallocate node address */ + /* 1: invalidate the previous xattr nid */ get_node_info(sbi, prev_xnid, &ni); f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR); invalidate_blocks(sbi, ni.blk_addr); - dec_valid_node_count(sbi, inode); + dec_valid_node_count(sbi, inode, false); set_node_addr(sbi, &ni, NULL_ADDR, false); recover_xnid: - /* 2: allocate new xattr nid */ - if (unlikely(!inc_valid_node_count(sbi, inode))) - f2fs_bug_on(sbi, 1); + /* 2: update xattr nid in inode */ + if (!alloc_nid(sbi, &new_xnid)) + return -ENOSPC; - remove_free_nid(NM_I(sbi), new_xnid); - get_node_info(sbi, new_xnid, &ni); - ni.ino = inode->i_ino; - set_node_addr(sbi, &ni, NEW_ADDR, false); - f2fs_i_xnid_write(inode, new_xnid); + set_new_dnode(&dn, inode, NULL, NULL, new_xnid); + xpage = new_node_page(&dn, XATTR_NODE_OFFSET); + if (IS_ERR(xpage)) { + alloc_nid_failed(sbi, new_xnid); + return PTR_ERR(xpage); + } - /* 3: update xattr blkaddr */ - refresh_sit_entry(sbi, NEW_ADDR, blkaddr); - set_node_addr(sbi, &ni, blkaddr, false); + alloc_nid_done(sbi, new_xnid); + update_inode_page(inode); + + /* 3: update and set xattr node page dirty */ + memcpy(F2FS_NODE(xpage), F2FS_NODE(page), VALID_XATTR_BLOCK_SIZE); + + set_page_dirty(xpage); + f2fs_put_page(xpage, 1); + + return 0; } int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) @@ -2035,7 +2266,7 @@ retry: } /* Should not use this inode from free nid list */ - remove_free_nid(NM_I(sbi), ino); + remove_free_nid(sbi, ino); if (!PageUptodate(ipage)) SetPageUptodate(ipage); @@ -2049,12 +2280,19 @@ retry: dst->i_blocks = cpu_to_le64(1); dst->i_links = cpu_to_le32(1); dst->i_xattr_nid = 0; - dst->i_inline = src->i_inline & F2FS_INLINE_XATTR; + dst->i_inline = src->i_inline & (F2FS_INLINE_XATTR | F2FS_EXTRA_ATTR); + if (dst->i_inline & F2FS_EXTRA_ATTR) { + dst->i_extra_isize = src->i_extra_isize; + if (f2fs_sb_has_project_quota(sbi->sb) && + F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), + i_projid)) + dst->i_projid = src->i_projid; + } new_ni = old_ni; new_ni.ino = ino; - if (unlikely(!inc_valid_node_count(sbi, NULL))) + if (unlikely(inc_valid_node_count(sbi, NULL, true))) WARN_ON(1); set_node_addr(sbi, &new_ni, NEW_ADDR, false); inc_valid_inode_count(sbi); @@ -2069,7 +2307,6 @@ int restore_node_summary(struct f2fs_sb_info *sbi, struct f2fs_node *rn; struct f2fs_summary *sum_entry; block_t addr; - int bio_blocks = MAX_BIO_BLOCKS(sbi); int i, idx, last_offset, nrpages; /* scan the node segment */ @@ -2078,7 +2315,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi, sum_entry = &sum->entries[0]; for (i = 0; i < last_offset; i += nrpages, addr += nrpages) { - nrpages = min(last_offset - i, bio_blocks); + nrpages = min(last_offset - i, BIO_MAX_PAGES); /* readahead node pages */ ra_meta_pages(sbi, addr, nrpages, META_POR, true); @@ -2117,9 +2354,22 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) ne = __lookup_nat_cache(nm_i, nid); if (!ne) { - ne = grab_nat_entry(nm_i, nid); + ne = grab_nat_entry(nm_i, nid, true); node_info_from_raw_nat(&ne->ni, &raw_ne); } + + /* + * if a free nat in journal has not been used after last + * checkpoint, we should remove it from available nids, + * since later we will add it again. + */ + if (!get_nat_flag(ne, IS_DIRTY) && + le32_to_cpu(raw_ne.block_addr) == NULL_ADDR) { + spin_lock(&nm_i->nid_list_lock); + nm_i->available_nids--; + spin_unlock(&nm_i->nid_list_lock); + } + __set_nat_cache_dirty(nm_i, ne); } update_nats_in_cursum(journal, -i); @@ -2144,8 +2394,39 @@ add_out: list_add_tail(&nes->set_list, head); } +static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, + struct page *page) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_index = start_nid / NAT_ENTRY_PER_BLOCK; + struct f2fs_nat_block *nat_blk = page_address(page); + int valid = 0; + int i; + + if (!enabled_nat_bits(sbi, NULL)) + return; + + for (i = 0; i < NAT_ENTRY_PER_BLOCK; i++) { + if (start_nid == 0 && i == 0) + valid++; + if (nat_blk->entries[i].block_addr) + valid++; + } + if (valid == 0) { + __set_bit_le(nat_index, nm_i->empty_nat_bits); + __clear_bit_le(nat_index, nm_i->full_nat_bits); + return; + } + + __clear_bit_le(nat_index, nm_i->empty_nat_bits); + if (valid == NAT_ENTRY_PER_BLOCK) + __set_bit_le(nat_index, nm_i->full_nat_bits); + else + __clear_bit_le(nat_index, nm_i->full_nat_bits); +} + static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, - struct nat_entry_set *set) + struct nat_entry_set *set, struct cp_control *cpc) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); struct f2fs_journal *journal = curseg->journal; @@ -2160,7 +2441,8 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, * #1, flush nat entries to journal in current hot data summary block. * #2, flush nat entries to nat page. */ - if (!__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL)) + if (enabled_nat_bits(sbi, cpc) || + !__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL)) to_journal = false; if (to_journal) { @@ -2177,8 +2459,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, nid_t nid = nat_get_nid(ne); int offset; - if (nat_get_blkaddr(ne) == NEW_ADDR) - continue; + f2fs_bug_on(sbi, nat_get_blkaddr(ne) == NEW_ADDR); if (to_journal) { offset = lookup_journal_in_cursum(journal, @@ -2191,26 +2472,38 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, } raw_nat_from_node_info(raw_ne, &ne->ni); nat_reset_flag(ne); - __clear_nat_cache_dirty(NM_I(sbi), ne); - if (nat_get_blkaddr(ne) == NULL_ADDR) + __clear_nat_cache_dirty(NM_I(sbi), set, ne); + if (nat_get_blkaddr(ne) == NULL_ADDR) { add_free_nid(sbi, nid, false); + spin_lock(&NM_I(sbi)->nid_list_lock); + NM_I(sbi)->available_nids++; + update_free_nid_bitmap(sbi, nid, true, false); + spin_unlock(&NM_I(sbi)->nid_list_lock); + } else { + spin_lock(&NM_I(sbi)->nid_list_lock); + update_free_nid_bitmap(sbi, nid, false, false); + spin_unlock(&NM_I(sbi)->nid_list_lock); + } } - if (to_journal) + if (to_journal) { up_write(&curseg->journal_rwsem); - else + } else { + __update_nat_bits(sbi, start_nid, page); f2fs_put_page(page, 1); + } - f2fs_bug_on(sbi, set->entry_cnt); - - radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); - kmem_cache_free(nat_entry_set_slab, set); + /* Allow dirty nats by node block allocation in write_begin */ + if (!set->entry_cnt) { + radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); + kmem_cache_free(nat_entry_set_slab, set); + } } /* * This function is called during the checkpointing process. */ -void flush_nat_entries(struct f2fs_sb_info *sbi) +void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -2231,7 +2524,8 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) * entries, remove all entries from journal and merge them * into nat entry set. */ - if (!__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL)) + if (enabled_nat_bits(sbi, cpc) || + !__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL)) remove_nats_in_journal(sbi); while ((found = __gang_lookup_nat_set(nm_i, @@ -2245,11 +2539,86 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) /* flush dirty nats in nat entry set */ list_for_each_entry_safe(set, tmp, &sets, set_list) - __flush_nat_entry_set(sbi, set); + __flush_nat_entry_set(sbi, set, cpc); up_write(&nm_i->nat_tree_lock); + /* Allow dirty nats by node block allocation in write_begin */ +} - f2fs_bug_on(sbi, nm_i->dirty_nat_cnt); +static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_bits_bytes = nm_i->nat_blocks / BITS_PER_BYTE; + unsigned int i; + __u64 cp_ver = cur_cp_version(ckpt); + block_t nat_bits_addr; + + if (!enabled_nat_bits(sbi, NULL)) + return 0; + + nm_i->nat_bits_blocks = F2FS_BYTES_TO_BLK((nat_bits_bytes << 1) + 8 + + F2FS_BLKSIZE - 1); + nm_i->nat_bits = kzalloc(nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, + GFP_KERNEL); + if (!nm_i->nat_bits) + return -ENOMEM; + + nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg - + nm_i->nat_bits_blocks; + for (i = 0; i < nm_i->nat_bits_blocks; i++) { + struct page *page = get_meta_page(sbi, nat_bits_addr++); + + memcpy(nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS), + page_address(page), F2FS_BLKSIZE); + f2fs_put_page(page, 1); + } + + cp_ver |= (cur_cp_crc(ckpt) << 32); + if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) { + disable_nat_bits(sbi, true); + return 0; + } + + nm_i->full_nat_bits = nm_i->nat_bits + 8; + nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes; + + f2fs_msg(sbi->sb, KERN_NOTICE, "Found nat_bits in checkpoint"); + return 0; +} + +static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int i = 0; + nid_t nid, last_nid; + + if (!enabled_nat_bits(sbi, NULL)) + return; + + for (i = 0; i < nm_i->nat_blocks; i++) { + i = find_next_bit_le(nm_i->empty_nat_bits, nm_i->nat_blocks, i); + if (i >= nm_i->nat_blocks) + break; + + __set_bit_le(i, nm_i->nat_block_bitmap); + + nid = i * NAT_ENTRY_PER_BLOCK; + last_nid = (i + 1) * NAT_ENTRY_PER_BLOCK; + + spin_lock(&NM_I(sbi)->nid_list_lock); + for (; nid < last_nid; nid++) + update_free_nid_bitmap(sbi, nid, true, true); + spin_unlock(&NM_I(sbi)->nid_list_lock); + } + + for (i = 0; i < nm_i->nat_blocks; i++) { + i = find_next_bit_le(nm_i->full_nat_bits, nm_i->nat_blocks, i); + if (i >= nm_i->nat_blocks) + break; + + __set_bit_le(i, nm_i->nat_block_bitmap); + } } static int init_node_manager(struct f2fs_sb_info *sbi) @@ -2257,32 +2626,35 @@ static int init_node_manager(struct f2fs_sb_info *sbi) struct f2fs_super_block *sb_raw = F2FS_RAW_SUPER(sbi); struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned char *version_bitmap; - unsigned int nat_segs, nat_blocks; + unsigned int nat_segs; + int err; nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr); /* segment_count_nat includes pair segment so divide to 2. */ nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1; - nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg); - - nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; + nm_i->nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg); + nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nm_i->nat_blocks; /* not used nids: 0, node, meta, (and root counted as valid node) */ - nm_i->available_nids = nm_i->max_nid - F2FS_RESERVED_NODE_NUM; - nm_i->fcnt = 0; + nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count - + F2FS_RESERVED_NODE_NUM; + nm_i->nid_cnt[FREE_NID_LIST] = 0; + nm_i->nid_cnt[ALLOC_NID_LIST] = 0; nm_i->nat_cnt = 0; nm_i->ram_thresh = DEF_RAM_THRESHOLD; nm_i->ra_nid_pages = DEF_RA_NID_PAGES; nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD; INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); - INIT_LIST_HEAD(&nm_i->free_nid_list); + INIT_LIST_HEAD(&nm_i->nid_list[FREE_NID_LIST]); + INIT_LIST_HEAD(&nm_i->nid_list[ALLOC_NID_LIST]); INIT_RADIX_TREE(&nm_i->nat_root, GFP_NOIO); INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_NOIO); INIT_LIST_HEAD(&nm_i->nat_entries); mutex_init(&nm_i->build_lock); - spin_lock_init(&nm_i->free_nid_list_lock); + spin_lock_init(&nm_i->nid_list_lock); init_rwsem(&nm_i->nat_tree_lock); nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); @@ -2295,6 +2667,39 @@ static int init_node_manager(struct f2fs_sb_info *sbi) GFP_KERNEL); if (!nm_i->nat_bitmap) return -ENOMEM; + + err = __get_nat_bitmaps(sbi); + if (err) + return err; + +#ifdef CONFIG_F2FS_CHECK_FS + nm_i->nat_bitmap_mir = kmemdup(version_bitmap, nm_i->bitmap_size, + GFP_KERNEL); + if (!nm_i->nat_bitmap_mir) + return -ENOMEM; +#endif + + return 0; +} + +static int init_free_nid_cache(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + nm_i->free_nid_bitmap = kvzalloc(nm_i->nat_blocks * + NAT_ENTRY_BITMAP_SIZE, GFP_KERNEL); + if (!nm_i->free_nid_bitmap) + return -ENOMEM; + + nm_i->nat_block_bitmap = kvzalloc(nm_i->nat_blocks / 8, + GFP_KERNEL); + if (!nm_i->nat_block_bitmap) + return -ENOMEM; + + nm_i->free_nid_count = kvzalloc(nm_i->nat_blocks * + sizeof(unsigned short), GFP_KERNEL); + if (!nm_i->free_nid_count) + return -ENOMEM; return 0; } @@ -2310,7 +2715,14 @@ int build_node_manager(struct f2fs_sb_info *sbi) if (err) return err; - build_free_nids(sbi); + err = init_free_nid_cache(sbi); + if (err) + return err; + + /* load free nid status from nat_bits table */ + load_free_nid_bitmap(sbi); + + build_free_nids(sbi, true, true); return 0; } @@ -2327,17 +2739,18 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) return; /* destroy free nid list */ - spin_lock(&nm_i->free_nid_list_lock); - list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { - f2fs_bug_on(sbi, i->state == NID_ALLOC); - __del_from_free_nid_list(nm_i, i); - nm_i->fcnt--; - spin_unlock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); + list_for_each_entry_safe(i, next_i, &nm_i->nid_list[FREE_NID_LIST], + list) { + __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); + spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); } - f2fs_bug_on(sbi, nm_i->fcnt); - spin_unlock(&nm_i->free_nid_list_lock); + f2fs_bug_on(sbi, nm_i->nid_cnt[FREE_NID_LIST]); + f2fs_bug_on(sbi, nm_i->nid_cnt[ALLOC_NID_LIST]); + f2fs_bug_on(sbi, !list_empty(&nm_i->nid_list[ALLOC_NID_LIST])); + spin_unlock(&nm_i->nid_list_lock); /* destroy nat cache */ down_write(&nm_i->nat_tree_lock); @@ -2367,7 +2780,15 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) } up_write(&nm_i->nat_tree_lock); + kvfree(nm_i->nat_block_bitmap); + kvfree(nm_i->free_nid_bitmap); + kvfree(nm_i->free_nid_count); + kfree(nm_i->nat_bitmap); + kfree(nm_i->nat_bits); +#ifdef CONFIG_F2FS_CHECK_FS + kfree(nm_i->nat_bitmap_mir); +#endif sbi->nm_info = NULL; kfree(nm_i); } diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 868bec65e51c..bb53e9955ff2 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -9,10 +9,10 @@ * published by the Free Software Foundation. */ /* start node id of a node block dedicated to the given node id */ -#define START_NID(nid) ((nid / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK) +#define START_NID(nid) (((nid) / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK) /* node block offset on the NAT area dedicated to the given start node id */ -#define NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK) +#define NAT_BLOCK_OFFSET(start_nid) ((start_nid) / NAT_ENTRY_PER_BLOCK) /* # of pages to perform synchronous readahead before building free nids */ #define FREE_NID_PAGES 8 @@ -62,16 +62,16 @@ struct nat_entry { struct node_info ni; /* in-memory node information */ }; -#define nat_get_nid(nat) (nat->ni.nid) -#define nat_set_nid(nat, n) (nat->ni.nid = n) -#define nat_get_blkaddr(nat) (nat->ni.blk_addr) -#define nat_set_blkaddr(nat, b) (nat->ni.blk_addr = b) -#define nat_get_ino(nat) (nat->ni.ino) -#define nat_set_ino(nat, i) (nat->ni.ino = i) -#define nat_get_version(nat) (nat->ni.version) -#define nat_set_version(nat, v) (nat->ni.version = v) +#define nat_get_nid(nat) ((nat)->ni.nid) +#define nat_set_nid(nat, n) ((nat)->ni.nid = (n)) +#define nat_get_blkaddr(nat) ((nat)->ni.blk_addr) +#define nat_set_blkaddr(nat, b) ((nat)->ni.blk_addr = (b)) +#define nat_get_ino(nat) ((nat)->ni.ino) +#define nat_set_ino(nat, i) ((nat)->ni.ino = (i)) +#define nat_get_version(nat) ((nat)->ni.version) +#define nat_set_version(nat, v) ((nat)->ni.version = (v)) -#define inc_node_version(version) (++version) +#define inc_node_version(version) (++(version)) static inline void copy_node_info(struct node_info *dst, struct node_info *src) @@ -169,14 +169,15 @@ static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *fnid; - spin_lock(&nm_i->free_nid_list_lock); - if (nm_i->fcnt <= 0) { - spin_unlock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); + if (nm_i->nid_cnt[FREE_NID_LIST] <= 0) { + spin_unlock(&nm_i->nid_list_lock); return; } - fnid = list_entry(nm_i->free_nid_list.next, struct free_nid, list); + fnid = list_first_entry(&nm_i->nid_list[FREE_NID_LIST], + struct free_nid, list); *nid = fnid->nid; - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); } /* @@ -185,6 +186,12 @@ static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) static inline void get_nat_bitmap(struct f2fs_sb_info *sbi, void *addr) { struct f2fs_nm_info *nm_i = NM_I(sbi); + +#ifdef CONFIG_F2FS_CHECK_FS + if (memcmp(nm_i->nat_bitmap, nm_i->nat_bitmap_mir, + nm_i->bitmap_size)) + f2fs_bug_on(sbi, 1); +#endif memcpy(addr, nm_i->nat_bitmap, nm_i->bitmap_size); } @@ -193,13 +200,16 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start) struct f2fs_nm_info *nm_i = NM_I(sbi); pgoff_t block_off; pgoff_t block_addr; - int seg_off; + /* + * block_off = segment_off * 512 + off_in_segment + * OLD = (segment_off * 512) * 2 + off_in_segment + * NEW = 2 * (segment_off * 512 + off_in_segment) - off_in_segment + */ block_off = NAT_BLOCK_OFFSET(start); - seg_off = block_off >> sbi->log_blocks_per_seg; block_addr = (pgoff_t)(nm_i->nat_blkaddr + - (seg_off << sbi->log_blocks_per_seg << 1) + + (block_off << 1) - (block_off & (sbi->blocks_per_seg - 1))); if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) @@ -214,11 +224,7 @@ static inline pgoff_t next_nat_addr(struct f2fs_sb_info *sbi, struct f2fs_nm_info *nm_i = NM_I(sbi); block_addr -= nm_i->nat_blkaddr; - if ((block_addr >> sbi->log_blocks_per_seg) % 2) - block_addr -= sbi->blocks_per_seg; - else - block_addr += sbi->blocks_per_seg; - + block_addr ^= 1 << sbi->log_blocks_per_seg; return block_addr + nm_i->nat_blkaddr; } @@ -227,6 +233,9 @@ static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid) unsigned int block_off = NAT_BLOCK_OFFSET(start_nid); f2fs_change_bit(block_off, nm_i->nat_bitmap); +#ifdef CONFIG_F2FS_CHECK_FS + f2fs_change_bit(block_off, nm_i->nat_bitmap_mir); +#endif } static inline nid_t ino_of_node(struct page *node_page) @@ -290,14 +299,11 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); struct f2fs_node *rn = F2FS_NODE(page); - size_t crc_offset = le32_to_cpu(ckpt->checksum_offset); - __u64 cp_ver = le64_to_cpu(ckpt->checkpoint_ver); + __u64 cp_ver = cur_cp_version(ckpt); + + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) + cp_ver |= (cur_cp_crc(ckpt) << 32); - if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) { - __u64 crc = le32_to_cpu(*((__le32 *) - ((unsigned char *)ckpt + crc_offset))); - cp_ver |= (crc << 32); - } rn->footer.cp_ver = cpu_to_le64(cp_ver); rn->footer.next_blkaddr = cpu_to_le32(blkaddr); } @@ -305,15 +311,12 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) static inline bool is_recoverable_dnode(struct page *page) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); - size_t crc_offset = le32_to_cpu(ckpt->checksum_offset); __u64 cp_ver = cur_cp_version(ckpt); - if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) { - __u64 crc = le32_to_cpu(*((__le32 *) - ((unsigned char *)ckpt + crc_offset))); - cp_ver |= (crc << 32); - } - return cpu_to_le64(cp_ver) == cpver_of_node(page); + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) + cp_ver |= (cur_cp_crc(ckpt) << 32); + + return cp_ver == cpver_of_node(page); } /* @@ -342,7 +345,7 @@ static inline bool IS_DNODE(struct page *node_page) unsigned int ofs = ofs_of_node(node_page); if (f2fs_has_xattr_block(ofs)) - return false; + return true; if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK || ofs == 5 + 2 * NIDS_PER_BLOCK) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 98c1a63a4614..9626758bc762 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -69,20 +69,34 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, } static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi, - struct list_head *head, nid_t ino) + struct list_head *head, nid_t ino, bool quota_inode) { struct inode *inode; struct fsync_inode_entry *entry; + int err; inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) return ERR_CAST(inode); + err = dquot_initialize(inode); + if (err) + goto err_out; + + if (quota_inode) { + err = dquot_alloc_inode(inode); + if (err) + goto err_out; + } + entry = f2fs_kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO); entry->inode = inode; list_add_tail(&entry->list, head); return entry; +err_out: + iput(inode); + return ERR_PTR(err); } static void del_fsync_inode(struct fsync_inode_entry *entry) @@ -107,7 +121,8 @@ static int recover_dentry(struct inode *inode, struct page *ipage, entry = get_fsync_inode(dir_list, pino); if (!entry) { - entry = add_fsync_inode(F2FS_I_SB(inode), dir_list, pino); + entry = add_fsync_inode(F2FS_I_SB(inode), dir_list, + pino, false); if (IS_ERR(entry)) { dir = ERR_CAST(entry); err = PTR_ERR(entry); @@ -140,6 +155,13 @@ retry: err = -EEXIST; goto out_unmap_put; } + + err = dquot_initialize(einode); + if (err) { + iput(einode); + goto out_unmap_put; + } + err = acquire_orphan_inode(F2FS_I_SB(inode)); if (err) { iput(einode); @@ -180,13 +202,15 @@ static void recover_inode(struct inode *inode, struct page *page) inode->i_mode = le16_to_cpu(raw->i_mode); f2fs_i_size_write(inode, le64_to_cpu(raw->i_size)); - inode->i_atime.tv_sec = le64_to_cpu(raw->i_mtime); + inode->i_atime.tv_sec = le64_to_cpu(raw->i_atime); inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime); inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime); - inode->i_atime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); + inode->i_atime.tv_nsec = le32_to_cpu(raw->i_atime_nsec); inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); + F2FS_I(inode)->i_advise = raw->i_advise; + if (file_enc_name(inode)) name = ""; else @@ -196,33 +220,8 @@ static void recover_inode(struct inode *inode, struct page *page) ino_of_node(page), name); } -static bool is_same_inode(struct inode *inode, struct page *ipage) -{ - struct f2fs_inode *ri = F2FS_INODE(ipage); - struct timespec disk; - - if (!IS_INODE(ipage)) - return true; - - disk.tv_sec = le64_to_cpu(ri->i_ctime); - disk.tv_nsec = le32_to_cpu(ri->i_ctime_nsec); - if (timespec_compare(&inode->i_ctime, &disk) > 0) - return false; - - disk.tv_sec = le64_to_cpu(ri->i_atime); - disk.tv_nsec = le32_to_cpu(ri->i_atime_nsec); - if (timespec_compare(&inode->i_atime, &disk) > 0) - return false; - - disk.tv_sec = le64_to_cpu(ri->i_mtime); - disk.tv_nsec = le32_to_cpu(ri->i_mtime_nsec); - if (timespec_compare(&inode->i_mtime, &disk) > 0) - return false; - - return true; -} - -static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) +static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, + bool check_only) { struct curseg_info *curseg; struct page *page = NULL; @@ -248,21 +247,23 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) goto next; entry = get_fsync_inode(head, ino_of_node(page)); - if (entry) { - if (!is_same_inode(entry->inode, page)) - goto next; - } else { - if (IS_INODE(page) && is_dent_dnode(page)) { + if (!entry) { + bool quota_inode = false; + + if (!check_only && + IS_INODE(page) && is_dent_dnode(page)) { err = recover_inode_page(sbi, page); if (err) break; + quota_inode = true; } /* * CP | dnode(F) | inode(DF) * For this case, we should not give up now. */ - entry = add_fsync_inode(sbi, head, ino_of_node(page)); + entry = add_fsync_inode(sbi, head, ino_of_node(page), + quota_inode); if (IS_ERR(entry)) { err = PTR_ERR(entry); if (err == -ENOENT) { @@ -353,10 +354,18 @@ got_it: f2fs_put_page(node_page, 1); if (ino != dn->inode->i_ino) { + int ret; + /* Deallocate previous index in the node page */ inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) return PTR_ERR(inode); + + ret = dquot_initialize(inode); + if (ret) { + iput(inode); + return ret; + } } else { inode = dn->inode; } @@ -386,7 +395,8 @@ out: return 0; truncate_out: - if (datablock_addr(tdn.node_page, tdn.ofs_in_node) == blkaddr) + if (datablock_addr(tdn.inode, tdn.node_page, + tdn.ofs_in_node) == blkaddr) truncate_data_blocks_range(&tdn, 1); if (dn->inode->i_ino == nid && !dn->inode_page_locked) unlock_page(dn->inode_page); @@ -405,11 +415,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, if (IS_INODE(page)) { recover_inline_xattr(inode, page); } else if (f2fs_has_xattr_block(ofs_of_node(page))) { - /* - * Deprecated; xattr blocks should be found from cold log. - * But, we should remain this for backward compatibility. - */ - recover_xattr_data(inode, page, blkaddr); + err = recover_xattr_data(inode, page, blkaddr); + if (!err) + recovered++; goto out; } @@ -441,8 +449,8 @@ retry_dn: for (; start < end; start++, dn.ofs_in_node++) { block_t src, dest; - src = datablock_addr(dn.node_page, dn.ofs_in_node); - dest = datablock_addr(page, dn.ofs_in_node); + src = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); + dest = datablock_addr(dn.inode, page, dn.ofs_in_node); /* skip recovering if dest is the same as src */ if (src == dest) @@ -454,8 +462,10 @@ retry_dn: continue; } - if ((start + 1) << PAGE_SHIFT > i_size_read(inode)) - f2fs_i_size_write(inode, (start + 1) << PAGE_SHIFT); + if (!file_keep_isize(inode) && + (i_size_read(inode) <= ((loff_t)start << PAGE_SHIFT))) + f2fs_i_size_write(inode, + (loff_t)(start + 1) << PAGE_SHIFT); /* * dest is reserved block, invalidate src block @@ -507,8 +517,10 @@ err: f2fs_put_dnode(&dn); out: f2fs_msg(sbi->sb, KERN_NOTICE, - "recover_data: ino = %lx, recovered = %d blocks, err = %d", - inode->i_ino, recovered, err); + "recover_data: ino = %lx (i_size: %s) recovered = %d, err = %d", + inode->i_ino, + file_keep_isize(inode) ? "keep" : "recover", + recovered, err); return err; } @@ -576,18 +588,31 @@ next: int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) { - struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); struct list_head inode_list; struct list_head dir_list; - block_t blkaddr; int err; int ret = 0; + unsigned long s_flags = sbi->sb->s_flags; bool need_writecp = false; + if (s_flags & MS_RDONLY) { + f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs"); + sbi->sb->s_flags &= ~MS_RDONLY; + } + +#ifdef CONFIG_QUOTA + /* Needed for iput() to work correctly and not trash data */ + sbi->sb->s_flags |= MS_ACTIVE; + /* Turn on quotas so that they are updated correctly */ + f2fs_enable_quota_files(sbi); +#endif + fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", sizeof(struct fsync_inode_entry)); - if (!fsync_entry_slab) - return -ENOMEM; + if (!fsync_entry_slab) { + err = -ENOMEM; + goto out; + } INIT_LIST_HEAD(&inode_list); INIT_LIST_HEAD(&dir_list); @@ -595,16 +620,14 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) /* prevent checkpoint */ mutex_lock(&sbi->cp_mutex); - blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); - /* step #1: find fsynced inode numbers */ - err = find_fsync_dnodes(sbi, &inode_list); + err = find_fsync_dnodes(sbi, &inode_list, check_only); if (err || list_empty(&inode_list)) - goto out; + goto skip; if (check_only) { ret = 1; - goto out; + goto skip; } need_writecp = true; @@ -613,7 +636,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) err = recover_data(sbi, &inode_list, &dir_list); if (!err) f2fs_bug_on(sbi, !list_empty(&inode_list)); -out: +skip: destroy_fsync_dnodes(&inode_list); /* truncate meta pages to be used by the recovery */ @@ -639,5 +662,12 @@ out: } kmem_cache_destroy(fsync_entry_slab); +out: +#ifdef CONFIG_QUOTA + /* Turn quotas off */ + f2fs_quota_off_umount(sbi->sb); +#endif + sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + return ret ? ret: err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 74a2b444406d..e71b37b00b4a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -16,17 +16,20 @@ #include #include #include +#include +#include #include "f2fs.h" #include "segment.h" #include "node.h" +#include "gc.h" #include "trace.h" #include #define __reverse_ffz(x) __reverse_ffs(~(x)) static struct kmem_cache *discard_entry_slab; -static struct kmem_cache *bio_entry_slab; +static struct kmem_cache *discard_cmd_slab; static struct kmem_cache *sit_entry_set_slab; static struct kmem_cache *inmem_entry_slab; @@ -166,6 +169,21 @@ found: return result - size + __reverse_ffz(tmp); } +bool need_SSR(struct f2fs_sb_info *sbi) +{ + int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); + int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); + + if (test_opt(sbi, LFS)) + return false; + if (sbi->gc_thread && sbi->gc_thread->gc_urgent) + return true; + + return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + + 2 * reserved_sections(sbi)); +} + void register_inmem_page(struct inode *inode, struct page *page) { struct f2fs_inode_info *fi = F2FS_I(inode); @@ -212,9 +230,15 @@ static int __revoke_inmem_pages(struct inode *inode, struct node_info ni; trace_f2fs_commit_inmem_page(page, INMEM_REVOKE); - +retry: set_new_dnode(&dn, inode, NULL, NULL, 0); - if (get_dnode_of_data(&dn, page->index, LOOKUP_NODE)) { + err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); + if (err) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + cond_resched(); + goto retry; + } err = -EAGAIN; goto next; } @@ -242,11 +266,43 @@ void drop_inmem_pages(struct inode *inode) { struct f2fs_inode_info *fi = F2FS_I(inode); - clear_inode_flag(inode, FI_ATOMIC_FILE); - mutex_lock(&fi->inmem_lock); __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); mutex_unlock(&fi->inmem_lock); + + clear_inode_flag(inode, FI_ATOMIC_FILE); + clear_inode_flag(inode, FI_HOT_DATA); + stat_dec_atomic_write(inode); +} + +void drop_inmem_page(struct inode *inode, struct page *page) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct list_head *head = &fi->inmem_pages; + struct inmem_pages *cur = NULL; + + f2fs_bug_on(sbi, !IS_ATOMIC_WRITTEN_PAGE(page)); + + mutex_lock(&fi->inmem_lock); + list_for_each_entry(cur, head, list) { + if (cur->page == page) + break; + } + + f2fs_bug_on(sbi, !cur || cur->page != page); + list_del(&cur->list); + mutex_unlock(&fi->inmem_lock); + + dec_page_count(sbi, F2FS_INMEM_PAGES); + kmem_cache_free(inmem_entry_slab, cur); + + ClearPageUptodate(page); + set_page_private(page, 0); + ClearPagePrivate(page); + f2fs_put_page(page, 0); + + trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE); } static int __commit_inmem_pages(struct inode *inode, @@ -259,10 +315,10 @@ static int __commit_inmem_pages(struct inode *inode, .sbi = sbi, .type = DATA, .op = REQ_OP_WRITE, - .op_flags = WRITE_SYNC | REQ_PRIO, - .encrypted_page = NULL, + .op_flags = REQ_SYNC | REQ_PRIO, + .io_type = FS_DATA_IO, }; - bool submit_bio = false; + pgoff_t last_idx = ULONG_MAX; int err = 0; list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) { @@ -274,28 +330,35 @@ static int __commit_inmem_pages(struct inode *inode, set_page_dirty(page); f2fs_wait_on_page_writeback(page, DATA, true); - if (clear_page_dirty_for_io(page)) + if (clear_page_dirty_for_io(page)) { inode_dec_dirty_pages(inode); - + remove_dirty_inode(inode); + } +retry: fio.page = page; + fio.old_blkaddr = NULL_ADDR; + fio.encrypted_page = NULL; + fio.need_lock = LOCK_DONE; err = do_write_data_page(&fio); if (err) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + cond_resched(); + goto retry; + } unlock_page(page); break; } - /* record old blkaddr for revoking */ cur->old_addr = fio.old_blkaddr; - - clear_cold_data(page); - submit_bio = true; + last_idx = page->index; } unlock_page(page); list_move_tail(&cur->list, revoke_list); } - if (submit_bio) - f2fs_submit_merged_bio_cond(sbi, inode, NULL, 0, DATA, WRITE); + if (last_idx != ULONG_MAX) + f2fs_submit_merged_write_cond(sbi, inode, 0, last_idx, DATA); if (!err) __revoke_inmem_pages(inode, revoke_list, false, false); @@ -314,6 +377,8 @@ int commit_inmem_pages(struct inode *inode) f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); + set_inode_flag(inode, FI_ATOMIC_COMMIT); + mutex_lock(&fi->inmem_lock); err = __commit_inmem_pages(inode, &revoke_list); if (err) { @@ -335,6 +400,8 @@ int commit_inmem_pages(struct inode *inode) } mutex_unlock(&fi->inmem_lock); + clear_inode_flag(inode, FI_ATOMIC_COMMIT); + f2fs_unlock_op(sbi); return err; } @@ -346,15 +413,14 @@ int commit_inmem_pages(struct inode *inode) void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_CHECKPOINT)) + if (time_to_inject(sbi, FAULT_CHECKPOINT)) { + f2fs_show_injection_info(FAULT_CHECKPOINT); f2fs_stop_checkpoint(sbi, false); + } #endif - if (!need) - return; - /* balance_fs_bg is able to be pending */ - if (excess_cached_nats(sbi)) + if (need && excess_cached_nats(sbi)) f2fs_balance_fs_bg(sbi); /* @@ -363,7 +429,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) */ if (has_not_enough_free_secs(sbi, 0, 0)) { mutex_lock(&sbi->gc_mutex); - f2fs_gc(sbi, false); + f2fs_gc(sbi, false, false, NULL_SEGNO); } } @@ -380,14 +446,17 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) if (!available_free_memory(sbi, FREE_NIDS)) try_to_free_nids(sbi, MAX_FREE_NIDS); else - build_free_nids(sbi); + build_free_nids(sbi, false, false); + + if (!is_idle(sbi) && !excess_dirty_nats(sbi)) + return; /* checkpoint is the only way to shrink partial cached entries */ if (!available_free_memory(sbi, NAT_ENTRIES) || !available_free_memory(sbi, INO_ENTRIES) || excess_prefree_segs(sbi) || excess_dirty_nats(sbi) || - (is_idle(sbi) && f2fs_time_over(sbi, CP_TIME))) { + f2fs_time_over(sbi, CP_TIME)) { if (test_opt(sbi, DATA_FLUSH)) { struct blk_plug plug; @@ -400,38 +469,69 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) } } +static int __submit_flush_wait(struct f2fs_sb_info *sbi, + struct block_device *bdev) +{ + struct bio *bio = f2fs_bio_alloc(0); + int ret; + + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; + bio->bi_bdev = bdev; + ret = submit_bio_wait(bio); + bio_put(bio); + + trace_f2fs_issue_flush(bdev, test_opt(sbi, NOBARRIER), + test_opt(sbi, FLUSH_MERGE), ret); + return ret; +} + +static int submit_flush_wait(struct f2fs_sb_info *sbi) +{ + int ret = __submit_flush_wait(sbi, sbi->sb->s_bdev); + int i; + + if (!sbi->s_ndevs || ret) + return ret; + + for (i = 1; i < sbi->s_ndevs; i++) { + ret = __submit_flush_wait(sbi, FDEV(i).bdev); + if (ret) + break; + } + return ret; +} + static int issue_flush_thread(void *data) { struct f2fs_sb_info *sbi = data; - struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; + struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; wait_queue_head_t *q = &fcc->flush_wait_queue; repeat: if (kthread_should_stop()) return 0; + sb_start_intwrite(sbi->sb); + if (!llist_empty(&fcc->issue_list)) { - struct bio *bio; struct flush_cmd *cmd, *next; int ret; - bio = f2fs_bio_alloc(0); - fcc->dispatch_list = llist_del_all(&fcc->issue_list); fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); - bio->bi_bdev = sbi->sb->s_bdev; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); - ret = submit_bio_wait(bio); + ret = submit_flush_wait(sbi); + atomic_inc(&fcc->issued_flush); llist_for_each_entry_safe(cmd, next, fcc->dispatch_list, llnode) { cmd->ret = ret; complete(&cmd->wait); } - bio_put(bio); fcc->dispatch_list = NULL; } + sb_end_intwrite(sbi->sb); + wait_event_interruptible(*q, kthread_should_stop() || !llist_empty(&fcc->issue_list)); goto repeat; @@ -439,38 +539,63 @@ repeat: int f2fs_issue_flush(struct f2fs_sb_info *sbi) { - struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; + struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; struct flush_cmd cmd; - - trace_f2fs_issue_flush(sbi->sb, test_opt(sbi, NOBARRIER), - test_opt(sbi, FLUSH_MERGE)); + int ret; if (test_opt(sbi, NOBARRIER)) return 0; - if (!test_opt(sbi, FLUSH_MERGE) || !atomic_read(&fcc->submit_flush)) { - struct bio *bio = f2fs_bio_alloc(0); - int ret; + if (!test_opt(sbi, FLUSH_MERGE)) { + ret = submit_flush_wait(sbi); + atomic_inc(&fcc->issued_flush); + return ret; + } - atomic_inc(&fcc->submit_flush); - bio->bi_bdev = sbi->sb->s_bdev; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); - ret = submit_bio_wait(bio); - atomic_dec(&fcc->submit_flush); - bio_put(bio); + if (atomic_inc_return(&fcc->issing_flush) == 1) { + ret = submit_flush_wait(sbi); + atomic_dec(&fcc->issing_flush); + + atomic_inc(&fcc->issued_flush); return ret; } init_completion(&cmd.wait); - atomic_inc(&fcc->submit_flush); llist_add(&cmd.llnode, &fcc->issue_list); - if (!fcc->dispatch_list) + /* update issue_list before we wake up issue_flush thread */ + smp_mb(); + + if (waitqueue_active(&fcc->flush_wait_queue)) wake_up(&fcc->flush_wait_queue); - wait_for_completion(&cmd.wait); - atomic_dec(&fcc->submit_flush); + if (fcc->f2fs_issue_flush) { + wait_for_completion(&cmd.wait); + atomic_dec(&fcc->issing_flush); + } else { + struct llist_node *list; + + list = llist_del_all(&fcc->issue_list); + if (!list) { + wait_for_completion(&cmd.wait); + atomic_dec(&fcc->issing_flush); + } else { + struct flush_cmd *tmp, *next; + + ret = submit_flush_wait(sbi); + + llist_for_each_entry_safe(tmp, next, list, llnode) { + if (tmp == &cmd) { + cmd.ret = ret; + atomic_dec(&fcc->issing_flush); + continue; + } + tmp->ret = ret; + complete(&tmp->wait); + } + } + } return cmd.ret; } @@ -481,33 +606,51 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) struct flush_cmd_control *fcc; int err = 0; + if (SM_I(sbi)->fcc_info) { + fcc = SM_I(sbi)->fcc_info; + if (fcc->f2fs_issue_flush) + return err; + goto init_thread; + } + fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); if (!fcc) return -ENOMEM; - atomic_set(&fcc->submit_flush, 0); + atomic_set(&fcc->issued_flush, 0); + atomic_set(&fcc->issing_flush, 0); init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); - SM_I(sbi)->cmd_control_info = fcc; + SM_I(sbi)->fcc_info = fcc; + if (!test_opt(sbi, FLUSH_MERGE)) + return err; + +init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { err = PTR_ERR(fcc->f2fs_issue_flush); kfree(fcc); - SM_I(sbi)->cmd_control_info = NULL; + SM_I(sbi)->fcc_info = NULL; return err; } return err; } -void destroy_flush_cmd_control(struct f2fs_sb_info *sbi) +void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) { - struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; + struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; - if (fcc && fcc->f2fs_issue_flush) - kthread_stop(fcc->f2fs_issue_flush); - kfree(fcc); - SM_I(sbi)->cmd_control_info = NULL; + if (fcc && fcc->f2fs_issue_flush) { + struct task_struct *flush_thread = fcc->f2fs_issue_flush; + + fcc->f2fs_issue_flush = NULL; + kthread_stop(flush_thread); + } + if (free) { + kfree(fcc); + SM_I(sbi)->fcc_info = NULL; + } } static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, @@ -550,8 +693,8 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t])) dirty_i->nr_dirty[t]--; - if (get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0) - clear_bit(GET_SECNO(sbi, segno), + if (get_valid_blocks(sbi, segno, true) == 0) + clear_bit(GET_SEC_FROM_SEG(sbi, segno), dirty_i->victim_secmap); } } @@ -571,7 +714,7 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_lock(&dirty_i->seglist_lock); - valid_blocks = get_valid_blocks(sbi, segno, 0); + valid_blocks = get_valid_blocks(sbi, segno, false); if (valid_blocks == 0) { __locate_dirty_segment(sbi, segno, PRE); @@ -586,120 +729,647 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_unlock(&dirty_i->seglist_lock); } -static struct bio_entry *__add_bio_entry(struct f2fs_sb_info *sbi, - struct bio *bio) +static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len) { - struct list_head *wait_list = &(SM_I(sbi)->wait_list); - struct bio_entry *be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *pend_list; + struct discard_cmd *dc; - INIT_LIST_HEAD(&be->list); - be->bio = bio; - init_completion(&be->event); - list_add_tail(&be->list, wait_list); + f2fs_bug_on(sbi, !len); - return be; + pend_list = &dcc->pend_list[plist_idx(len)]; + + dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS); + INIT_LIST_HEAD(&dc->list); + dc->bdev = bdev; + dc->lstart = lstart; + dc->start = start; + dc->len = len; + dc->ref = 0; + dc->state = D_PREP; + dc->error = 0; + init_completion(&dc->wait); + list_add_tail(&dc->list, pend_list); + atomic_inc(&dcc->discard_cmd_cnt); + dcc->undiscard_blks += len; + + return dc; } -void f2fs_wait_all_discard_bio(struct f2fs_sb_info *sbi) +static struct discard_cmd *__attach_discard_cmd(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len, + struct rb_node *parent, struct rb_node **p) { - struct list_head *wait_list = &(SM_I(sbi)->wait_list); - struct bio_entry *be, *tmp; + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *dc; - list_for_each_entry_safe(be, tmp, wait_list, list) { - struct bio *bio = be->bio; - int err; + dc = __create_discard_cmd(sbi, bdev, lstart, start, len); - wait_for_completion_io(&be->event); - err = be->error; - if (err == -EOPNOTSUPP) - err = 0; + rb_link_node(&dc->rb_node, parent, p); + rb_insert_color(&dc->rb_node, &dcc->root); - if (err) - f2fs_msg(sbi->sb, KERN_INFO, - "Issue discard failed, ret: %d", err); + return dc; +} - bio_put(bio); - list_del(&be->list); - kmem_cache_free(bio_entry_slab, be); +static void __detach_discard_cmd(struct discard_cmd_control *dcc, + struct discard_cmd *dc) +{ + if (dc->state == D_DONE) + atomic_dec(&dcc->issing_discard); + + list_del(&dc->list); + rb_erase(&dc->rb_node, &dcc->root); + dcc->undiscard_blks -= dc->len; + + kmem_cache_free(discard_cmd_slab, dc); + + atomic_dec(&dcc->discard_cmd_cnt); +} + +static void __remove_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + + f2fs_bug_on(sbi, dc->ref); + + if (dc->error == -EOPNOTSUPP) + dc->error = 0; + + if (dc->error) + f2fs_msg(sbi->sb, KERN_INFO, + "Issue discard(%u, %u, %u) failed, ret: %d", + dc->lstart, dc->start, dc->len, dc->error); + __detach_discard_cmd(dcc, dc); +} + +static void f2fs_submit_discard_endio(struct bio *bio) +{ + struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; + + dc->error = bio->bi_error; + dc->state = D_DONE; + complete_all(&dc->wait); + bio_put(bio); +} + +void __check_sit_bitmap(struct f2fs_sb_info *sbi, + block_t start, block_t end) +{ +#ifdef CONFIG_F2FS_CHECK_FS + struct seg_entry *sentry; + unsigned int segno; + block_t blk = start; + unsigned long offset, size, max_blocks = sbi->blocks_per_seg; + unsigned long *map; + + while (blk < end) { + segno = GET_SEGNO(sbi, blk); + sentry = get_seg_entry(sbi, segno); + offset = GET_BLKOFF_FROM_SEG0(sbi, blk); + + if (end < START_BLOCK(sbi, segno + 1)) + size = GET_BLKOFF_FROM_SEG0(sbi, end); + else + size = max_blocks; + map = (unsigned long *)(sentry->cur_valid_map); + offset = __find_rev_next_bit(map, size, offset); + f2fs_bug_on(sbi, offset != size); + blk = START_BLOCK(sbi, segno + 1); } -} - -static void f2fs_submit_bio_wait_endio(struct bio *bio) -{ - struct bio_entry *be = (struct bio_entry *)bio->bi_private; - - be->error = bio->bi_error; - complete(&be->event); +#endif } /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ -int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) +static void __submit_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc) { - struct block_device *bdev = sbi->sb->s_bdev; + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct bio *bio = NULL; - int err; - err = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, flags, - &bio); - if (!err && bio) { - struct bio_entry *be = __add_bio_entry(sbi, bio); + if (dc->state != D_PREP) + return; - bio->bi_private = be; - bio->bi_end_io = f2fs_submit_bio_wait_endio; - bio->bi_opf |= REQ_SYNC; - submit_bio(bio); + trace_f2fs_issue_discard(dc->bdev, dc->start, dc->len); + + dc->error = __blkdev_issue_discard(dc->bdev, + SECTOR_FROM_BLOCK(dc->start), + SECTOR_FROM_BLOCK(dc->len), + GFP_NOFS, 0, &bio); + if (!dc->error) { + /* should keep before submission to avoid D_DONE right away */ + dc->state = D_SUBMIT; + atomic_inc(&dcc->issued_discard); + atomic_inc(&dcc->issing_discard); + if (bio) { + bio->bi_private = dc; + bio->bi_end_io = f2fs_submit_discard_endio; + bio->bi_opf |= REQ_SYNC; + submit_bio(bio); + list_move_tail(&dc->list, &dcc->wait_list); + __check_sit_bitmap(sbi, dc->start, dc->start + dc->len); + + f2fs_update_iostat(sbi, FS_DISCARD, 1); + } + } else { + __remove_discard_cmd(sbi, dc); + } +} + +static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len, + struct rb_node **insert_p, + struct rb_node *insert_parent) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct rb_node **p = &dcc->root.rb_node; + struct rb_node *parent = NULL; + struct discard_cmd *dc = NULL; + + if (insert_p && insert_parent) { + parent = insert_parent; + p = insert_p; + goto do_insert; } - return err; + p = __lookup_rb_tree_for_insert(sbi, &dcc->root, &parent, lstart); +do_insert: + dc = __attach_discard_cmd(sbi, bdev, lstart, start, len, parent, p); + if (!dc) + return NULL; + + return dc; +} + +static void __relocate_discard_cmd(struct discard_cmd_control *dcc, + struct discard_cmd *dc) +{ + list_move_tail(&dc->list, &dcc->pend_list[plist_idx(dc->len)]); +} + +static void __punch_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc, block_t blkaddr) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_info di = dc->di; + bool modified = false; + + if (dc->state == D_DONE || dc->len == 1) { + __remove_discard_cmd(sbi, dc); + return; + } + + dcc->undiscard_blks -= di.len; + + if (blkaddr > di.lstart) { + dc->len = blkaddr - dc->lstart; + dcc->undiscard_blks += dc->len; + __relocate_discard_cmd(dcc, dc); + modified = true; + } + + if (blkaddr < di.lstart + di.len - 1) { + if (modified) { + __insert_discard_tree(sbi, dc->bdev, blkaddr + 1, + di.start + blkaddr + 1 - di.lstart, + di.lstart + di.len - 1 - blkaddr, + NULL, NULL); + } else { + dc->lstart++; + dc->len--; + dc->start++; + dcc->undiscard_blks += dc->len; + __relocate_discard_cmd(dcc, dc); + } + } +} + +static void __update_discard_tree_range(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *prev_dc = NULL, *next_dc = NULL; + struct discard_cmd *dc; + struct discard_info di = {0}; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + block_t end = lstart + len; + + mutex_lock(&dcc->cmd_lock); + + dc = (struct discard_cmd *)__lookup_rb_tree_ret(&dcc->root, + NULL, lstart, + (struct rb_entry **)&prev_dc, + (struct rb_entry **)&next_dc, + &insert_p, &insert_parent, true); + if (dc) + prev_dc = dc; + + if (!prev_dc) { + di.lstart = lstart; + di.len = next_dc ? next_dc->lstart - lstart : len; + di.len = min(di.len, len); + di.start = start; + } + + while (1) { + struct rb_node *node; + bool merged = false; + struct discard_cmd *tdc = NULL; + + if (prev_dc) { + di.lstart = prev_dc->lstart + prev_dc->len; + if (di.lstart < lstart) + di.lstart = lstart; + if (di.lstart >= end) + break; + + if (!next_dc || next_dc->lstart > end) + di.len = end - di.lstart; + else + di.len = next_dc->lstart - di.lstart; + di.start = start + di.lstart - lstart; + } + + if (!di.len) + goto next; + + if (prev_dc && prev_dc->state == D_PREP && + prev_dc->bdev == bdev && + __is_discard_back_mergeable(&di, &prev_dc->di)) { + prev_dc->di.len += di.len; + dcc->undiscard_blks += di.len; + __relocate_discard_cmd(dcc, prev_dc); + di = prev_dc->di; + tdc = prev_dc; + merged = true; + } + + if (next_dc && next_dc->state == D_PREP && + next_dc->bdev == bdev && + __is_discard_front_mergeable(&di, &next_dc->di)) { + next_dc->di.lstart = di.lstart; + next_dc->di.len += di.len; + next_dc->di.start = di.start; + dcc->undiscard_blks += di.len; + __relocate_discard_cmd(dcc, next_dc); + if (tdc) + __remove_discard_cmd(sbi, tdc); + merged = true; + } + + if (!merged) { + __insert_discard_tree(sbi, bdev, di.lstart, di.start, + di.len, NULL, NULL); + } + next: + prev_dc = next_dc; + if (!prev_dc) + break; + + node = rb_next(&prev_dc->rb_node); + next_dc = rb_entry_safe(node, struct discard_cmd, rb_node); + } + + mutex_unlock(&dcc->cmd_lock); +} + +static int __queue_discard_cmd(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) +{ + block_t lblkstart = blkstart; + + trace_f2fs_queue_discard(bdev, blkstart, blklen); + + if (sbi->s_ndevs) { + int devi = f2fs_target_device_index(sbi, blkstart); + + blkstart -= FDEV(devi).start_blk; + } + __update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen); + return 0; +} + +static int __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *pend_list; + struct discard_cmd *dc, *tmp; + struct blk_plug plug; + int iter = 0, issued = 0; + int i; + bool io_interrupted = false; + + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, + !__check_rb_tree_consistence(sbi, &dcc->root)); + blk_start_plug(&plug); + for (i = MAX_PLIST_NUM - 1; + i >= 0 && plist_issue(dcc->pend_list_tag[i]); i--) { + pend_list = &dcc->pend_list[i]; + list_for_each_entry_safe(dc, tmp, pend_list, list) { + f2fs_bug_on(sbi, dc->state != D_PREP); + + /* Hurry up to finish fstrim */ + if (dcc->pend_list_tag[i] & P_TRIM) { + __submit_discard_cmd(sbi, dc); + issued++; + + if (fatal_signal_pending(current)) + break; + continue; + } + + if (!issue_cond) { + __submit_discard_cmd(sbi, dc); + issued++; + continue; + } + + if (is_idle(sbi)) { + __submit_discard_cmd(sbi, dc); + issued++; + } else { + io_interrupted = true; + } + + if (++iter >= DISCARD_ISSUE_RATE) + goto out; + } + if (list_empty(pend_list) && dcc->pend_list_tag[i] & P_TRIM) + dcc->pend_list_tag[i] &= (~P_TRIM); + } +out: + blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); + + if (!issued && io_interrupted) + issued = -1; + + return issued; +} + +static void __drop_discard_cmd(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *pend_list; + struct discard_cmd *dc, *tmp; + int i; + + mutex_lock(&dcc->cmd_lock); + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + pend_list = &dcc->pend_list[i]; + list_for_each_entry_safe(dc, tmp, pend_list, list) { + f2fs_bug_on(sbi, dc->state != D_PREP); + __remove_discard_cmd(sbi, dc); + } + } + mutex_unlock(&dcc->cmd_lock); +} + +static void __wait_one_discard_bio(struct f2fs_sb_info *sbi, + struct discard_cmd *dc) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + + wait_for_completion_io(&dc->wait); + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, dc->state != D_DONE); + dc->ref--; + if (!dc->ref) + __remove_discard_cmd(sbi, dc); + mutex_unlock(&dcc->cmd_lock); +} + +static void __wait_discard_cmd(struct f2fs_sb_info *sbi, bool wait_cond) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *wait_list = &(dcc->wait_list); + struct discard_cmd *dc, *tmp; + bool need_wait; + +next: + need_wait = false; + + mutex_lock(&dcc->cmd_lock); + list_for_each_entry_safe(dc, tmp, wait_list, list) { + if (!wait_cond || (dc->state == D_DONE && !dc->ref)) { + wait_for_completion_io(&dc->wait); + __remove_discard_cmd(sbi, dc); + } else { + dc->ref++; + need_wait = true; + break; + } + } + mutex_unlock(&dcc->cmd_lock); + + if (need_wait) { + __wait_one_discard_bio(sbi, dc); + goto next; + } +} + +/* This should be covered by global mutex, &sit_i->sentry_lock */ +void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *dc; + bool need_wait = false; + + mutex_lock(&dcc->cmd_lock); + dc = (struct discard_cmd *)__lookup_rb_tree(&dcc->root, NULL, blkaddr); + if (dc) { + if (dc->state == D_PREP) { + __punch_discard_cmd(sbi, dc, blkaddr); + } else { + dc->ref++; + need_wait = true; + } + } + mutex_unlock(&dcc->cmd_lock); + + if (need_wait) + __wait_one_discard_bio(sbi, dc); +} + +void stop_discard_thread(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + + if (dcc && dcc->f2fs_issue_discard) { + struct task_struct *discard_thread = dcc->f2fs_issue_discard; + + dcc->f2fs_issue_discard = NULL; + kthread_stop(discard_thread); + } +} + +/* This comes from f2fs_put_super and f2fs_trim_fs */ +void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) +{ + __issue_discard_cmd(sbi, false); + __drop_discard_cmd(sbi); + __wait_discard_cmd(sbi, false); +} + +static void mark_discard_range_all(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + int i; + + mutex_lock(&dcc->cmd_lock); + for (i = 0; i < MAX_PLIST_NUM; i++) + dcc->pend_list_tag[i] |= P_TRIM; + mutex_unlock(&dcc->cmd_lock); +} + +static int issue_discard_thread(void *data) +{ + struct f2fs_sb_info *sbi = data; + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + wait_queue_head_t *q = &dcc->discard_wait_queue; + unsigned int wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; + int issued; + + set_freezable(); + + do { + wait_event_interruptible_timeout(*q, + kthread_should_stop() || freezing(current) || + dcc->discard_wake, + msecs_to_jiffies(wait_ms)); + if (try_to_freeze()) + continue; + if (kthread_should_stop()) + return 0; + + if (dcc->discard_wake) { + dcc->discard_wake = 0; + if (sbi->gc_thread && sbi->gc_thread->gc_urgent) + mark_discard_range_all(sbi); + } + + sb_start_intwrite(sbi->sb); + + issued = __issue_discard_cmd(sbi, true); + if (issued) { + __wait_discard_cmd(sbi, true); + wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; + } else { + wait_ms = DEF_MAX_DISCARD_ISSUE_TIME; + } + + sb_end_intwrite(sbi->sb); + + } while (!kthread_should_stop()); + return 0; +} + +#ifdef CONFIG_BLK_DEV_ZONED +static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) +{ + sector_t sector, nr_sects; + block_t lblkstart = blkstart; + int devi = 0; + + if (sbi->s_ndevs) { + devi = f2fs_target_device_index(sbi, blkstart); + blkstart -= FDEV(devi).start_blk; + } + + /* + * We need to know the type of the zone: for conventional zones, + * use regular discard if the drive supports it. For sequential + * zones, reset the zone write pointer. + */ + switch (get_blkz_type(sbi, bdev, blkstart)) { + + case BLK_ZONE_TYPE_CONVENTIONAL: + if (!blk_queue_discard(bdev_get_queue(bdev))) + return 0; + return __queue_discard_cmd(sbi, bdev, lblkstart, blklen); + case BLK_ZONE_TYPE_SEQWRITE_REQ: + case BLK_ZONE_TYPE_SEQWRITE_PREF: + sector = SECTOR_FROM_BLOCK(blkstart); + nr_sects = SECTOR_FROM_BLOCK(blklen); + + if (sector & (bdev_zone_sectors(bdev) - 1) || + nr_sects != bdev_zone_sectors(bdev)) { + f2fs_msg(sbi->sb, KERN_INFO, + "(%d) %s: Unaligned discard attempted (block %x + %x)", + devi, sbi->s_ndevs ? FDEV(devi).path: "", + blkstart, blklen); + return -EIO; + } + trace_f2fs_issue_reset_zone(bdev, blkstart); + return blkdev_reset_zones(bdev, sector, + nr_sects, GFP_NOFS); + default: + /* Unknown zone type: broken device ? */ + return -EIO; + } +} +#endif + +static int __issue_discard_async(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) +{ +#ifdef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_mounted_blkzoned(sbi->sb) && + bdev_zoned_model(bdev) != BLK_ZONED_NONE) + return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); +#endif + return __queue_discard_cmd(sbi, bdev, blkstart, blklen); } static int f2fs_issue_discard(struct f2fs_sb_info *sbi, block_t blkstart, block_t blklen) { - sector_t start = SECTOR_FROM_BLOCK(blkstart); - sector_t len = SECTOR_FROM_BLOCK(blklen); + sector_t start = blkstart, len = 0; + struct block_device *bdev; struct seg_entry *se; unsigned int offset; block_t i; + int err = 0; + + bdev = f2fs_target_device(sbi, blkstart, NULL); + + for (i = blkstart; i < blkstart + blklen; i++, len++) { + if (i != start) { + struct block_device *bdev2 = + f2fs_target_device(sbi, i, NULL); + + if (bdev2 != bdev) { + err = __issue_discard_async(sbi, bdev, + start, len); + if (err) + return err; + bdev = bdev2; + start = i; + len = 0; + } + } - for (i = blkstart; i < blkstart + blklen; i++) { se = get_seg_entry(sbi, GET_SEGNO(sbi, i)); offset = GET_BLKOFF_FROM_SEG0(sbi, i); if (!f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; } - trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); - return __f2fs_issue_discard_async(sbi, start, len, GFP_NOFS, 0); + + if (len) + err = __issue_discard_async(sbi, bdev, start, len); + return err; } -static void __add_discard_entry(struct f2fs_sb_info *sbi, - struct cp_control *cpc, struct seg_entry *se, - unsigned int start, unsigned int end) -{ - struct list_head *head = &SM_I(sbi)->discard_list; - struct discard_entry *new, *last; - - if (!list_empty(head)) { - last = list_last_entry(head, struct discard_entry, list); - if (START_BLOCK(sbi, cpc->trim_start) + start == - last->blkaddr + last->len) { - last->len += end - start; - goto done; - } - } - - new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS); - INIT_LIST_HEAD(&new->list); - new->blkaddr = START_BLOCK(sbi, cpc->trim_start) + start; - new->len = end - start; - list_add_tail(&new->list, head); -done: - SM_I(sbi)->nr_discards += end - start; -} - -static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) +static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, + bool check_only) { int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); int max_blocks = sbi->blocks_per_seg; @@ -709,16 +1379,19 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) unsigned long *discard_map = (unsigned long *)se->discard_map; unsigned long *dmap = SIT_I(sbi)->tmp_map; unsigned int start = 0, end = -1; - bool force = (cpc->reason == CP_DISCARD); + bool force = (cpc->reason & CP_DISCARD); + struct discard_entry *de = NULL; + struct list_head *head = &SM_I(sbi)->dcc_info->entry_list; int i; if (se->valid_blocks == max_blocks || !f2fs_discard_en(sbi)) - return; + return false; if (!force) { if (!test_opt(sbi, DISCARD) || !se->valid_blocks || - SM_I(sbi)->nr_discards >= SM_I(sbi)->max_discards) - return; + SM_I(sbi)->dcc_info->nr_discards >= + SM_I(sbi)->dcc_info->max_discards) + return false; } /* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */ @@ -726,7 +1399,8 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) dmap[i] = force ? ~ckpt_map[i] & ~discard_map[i] : (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i]; - while (force || SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) { + while (force || SM_I(sbi)->dcc_info->nr_discards <= + SM_I(sbi)->dcc_info->max_discards) { start = __find_rev_next_bit(dmap, max_blocks, end + 1); if (start >= max_blocks) break; @@ -736,13 +1410,27 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) && (end - start) < cpc->trim_minlen) continue; - __add_discard_entry(sbi, cpc, se, start, end); + if (check_only) + return true; + + if (!de) { + de = f2fs_kmem_cache_alloc(discard_entry_slab, + GFP_F2FS_ZERO); + de->start_blkaddr = START_BLOCK(sbi, cpc->trim_start); + list_add_tail(&de->list, head); + } + + for (i = start; i < end; i++) + __set_bit_le(i, (void *)de->discard_map); + + SM_I(sbi)->dcc_info->nr_discards += end - start; } + return false; } void release_discard_addrs(struct f2fs_sb_info *sbi) { - struct list_head *head = &(SM_I(sbi)->discard_list); + struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list); struct discard_entry *entry, *this; /* drop caches */ @@ -768,16 +1456,14 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) { - struct list_head *head = &(SM_I(sbi)->discard_list); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *head = &dcc->entry_list; struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - struct blk_plug plug; unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; unsigned int start = 0, end = -1; unsigned int secno, start_segno; - bool force = (cpc->reason == CP_DISCARD); - - blk_start_plug(&plug); + bool force = (cpc->reason & CP_DISCARD); mutex_lock(&dirty_i->seglist_lock); @@ -794,19 +1480,23 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) dirty_i->nr_dirty[PRE] -= end - start; - if (force || !test_opt(sbi, DISCARD)) + if (!test_opt(sbi, DISCARD)) continue; + if (force && start >= cpc->trim_start && + (end - 1) <= cpc->trim_end) + continue; + if (!test_opt(sbi, LFS) || sbi->segs_per_sec == 1) { f2fs_issue_discard(sbi, START_BLOCK(sbi, start), (end - start) << sbi->log_blocks_per_seg); continue; } next: - secno = GET_SECNO(sbi, start); - start_segno = secno * sbi->segs_per_sec; + secno = GET_SEC_FROM_SEG(sbi, start); + start_segno = GET_SEG_FROM_SEC(sbi, secno); if (!IS_CURSEC(sbi, secno) && - !get_valid_blocks(sbi, start, sbi->segs_per_sec)) + !get_valid_blocks(sbi, start, true)) f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno), sbi->segs_per_sec << sbi->log_blocks_per_seg); @@ -820,17 +1510,100 @@ next: /* send small discards */ list_for_each_entry_safe(entry, this, head, list) { - if (force && entry->len < cpc->trim_minlen) - goto skip; - f2fs_issue_discard(sbi, entry->blkaddr, entry->len); - cpc->trimmed += entry->len; + unsigned int cur_pos = 0, next_pos, len, total_len = 0; + bool is_valid = test_bit_le(0, entry->discard_map); + +find_next: + if (is_valid) { + next_pos = find_next_zero_bit_le(entry->discard_map, + sbi->blocks_per_seg, cur_pos); + len = next_pos - cur_pos; + + if (f2fs_sb_mounted_blkzoned(sbi->sb) || + (force && len < cpc->trim_minlen)) + goto skip; + + f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos, + len); + cpc->trimmed += len; + total_len += len; + } else { + next_pos = find_next_bit_le(entry->discard_map, + sbi->blocks_per_seg, cur_pos); + } skip: + cur_pos = next_pos; + is_valid = !is_valid; + + if (cur_pos < sbi->blocks_per_seg) + goto find_next; + list_del(&entry->list); - SM_I(sbi)->nr_discards -= entry->len; + dcc->nr_discards -= total_len; kmem_cache_free(discard_entry_slab, entry); } - blk_finish_plug(&plug); + wake_up_discard_thread(sbi, false); +} + +static int create_discard_cmd_control(struct f2fs_sb_info *sbi) +{ + dev_t dev = sbi->sb->s_bdev->bd_dev; + struct discard_cmd_control *dcc; + int err = 0, i; + + if (SM_I(sbi)->dcc_info) { + dcc = SM_I(sbi)->dcc_info; + goto init_thread; + } + + dcc = kzalloc(sizeof(struct discard_cmd_control), GFP_KERNEL); + if (!dcc) + return -ENOMEM; + + dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; + INIT_LIST_HEAD(&dcc->entry_list); + for (i = 0; i < MAX_PLIST_NUM; i++) { + INIT_LIST_HEAD(&dcc->pend_list[i]); + if (i >= dcc->discard_granularity - 1) + dcc->pend_list_tag[i] |= P_ACTIVE; + } + INIT_LIST_HEAD(&dcc->wait_list); + mutex_init(&dcc->cmd_lock); + atomic_set(&dcc->issued_discard, 0); + atomic_set(&dcc->issing_discard, 0); + atomic_set(&dcc->discard_cmd_cnt, 0); + dcc->nr_discards = 0; + dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg; + dcc->undiscard_blks = 0; + dcc->root = RB_ROOT; + + init_waitqueue_head(&dcc->discard_wait_queue); + SM_I(sbi)->dcc_info = dcc; +init_thread: + dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi, + "f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev)); + if (IS_ERR(dcc->f2fs_issue_discard)) { + err = PTR_ERR(dcc->f2fs_issue_discard); + kfree(dcc); + SM_I(sbi)->dcc_info = NULL; + return err; + } + + return err; +} + +static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + + if (!dcc) + return; + + stop_discard_thread(sbi); + + kfree(dcc); + SM_I(sbi)->dcc_info = NULL; } static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) @@ -859,6 +1632,10 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) struct seg_entry *se; unsigned int segno, offset; long int new_vblocks; + bool exist; +#ifdef CONFIG_F2FS_CHECK_FS + bool mir_exist; +#endif segno = GET_SEGNO(sbi, blkaddr); @@ -875,14 +1652,54 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) /* Update valid block bitmap */ if (del > 0) { - if (f2fs_test_and_set_bit(offset, se->cur_valid_map)) + exist = f2fs_test_and_set_bit(offset, se->cur_valid_map); +#ifdef CONFIG_F2FS_CHECK_FS + mir_exist = f2fs_test_and_set_bit(offset, + se->cur_valid_map_mir); + if (unlikely(exist != mir_exist)) { + f2fs_msg(sbi->sb, KERN_ERR, "Inconsistent error " + "when setting bitmap, blk:%u, old bit:%d", + blkaddr, exist); f2fs_bug_on(sbi, 1); + } +#endif + if (unlikely(exist)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Bitmap was wrongly set, blk:%u", blkaddr); + f2fs_bug_on(sbi, 1); + se->valid_blocks--; + del = 0; + } + if (f2fs_discard_en(sbi) && !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; + + /* don't overwrite by SSR to keep node chain */ + if (se->type == CURSEG_WARM_NODE) { + if (!f2fs_test_and_set_bit(offset, se->ckpt_valid_map)) + se->ckpt_valid_blocks++; + } } else { - if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map)) + exist = f2fs_test_and_clear_bit(offset, se->cur_valid_map); +#ifdef CONFIG_F2FS_CHECK_FS + mir_exist = f2fs_test_and_clear_bit(offset, + se->cur_valid_map_mir); + if (unlikely(exist != mir_exist)) { + f2fs_msg(sbi->sb, KERN_ERR, "Inconsistent error " + "when clearing bitmap, blk:%u, old bit:%d", + blkaddr, exist); f2fs_bug_on(sbi, 1); + } +#endif + if (unlikely(!exist)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Bitmap was wrongly cleared, blk:%u", blkaddr); + f2fs_bug_on(sbi, 1); + se->valid_blocks++; + del = 0; + } + if (f2fs_discard_en(sbi) && f2fs_test_and_clear_bit(offset, se->discard_map)) sbi->discard_blks++; @@ -1068,8 +1885,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi, struct free_segmap_info *free_i = FREE_I(sbi); unsigned int segno, secno, zoneno; unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone; - unsigned int hint = *newseg / sbi->segs_per_sec; - unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg); + unsigned int hint = GET_SEC_FROM_SEG(sbi, *newseg); + unsigned int old_zoneno = GET_ZONE_FROM_SEG(sbi, *newseg); unsigned int left_start = hint; bool init = true; int go_left = 0; @@ -1079,8 +1896,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi, if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) { segno = find_next_zero_bit(free_i->free_segmap, - (hint + 1) * sbi->segs_per_sec, *newseg + 1); - if (segno < (hint + 1) * sbi->segs_per_sec) + GET_SEG_FROM_SEC(sbi, hint + 1), *newseg + 1); + if (segno < GET_SEG_FROM_SEC(sbi, hint + 1)) goto got_it; } find_other_zone: @@ -1111,8 +1928,8 @@ find_other_zone: secno = left_start; skip_left: hint = secno; - segno = secno * sbi->segs_per_sec; - zoneno = secno / sbi->secs_per_zone; + segno = GET_SEG_FROM_SEC(sbi, secno); + zoneno = GET_ZONE_FROM_SEC(sbi, secno); /* give up on finding another zone */ if (!init) @@ -1156,7 +1973,7 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) struct summary_footer *sum_footer; curseg->segno = curseg->next_segno; - curseg->zone = GET_ZONENO_FROM_SEGNO(sbi, curseg->segno); + curseg->zone = GET_ZONE_FROM_SEG(sbi, curseg->segno); curseg->next_blkoff = 0; curseg->next_segno = NULL_SEGNO; @@ -1169,6 +1986,20 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) __set_sit_entry_type(sbi, type, curseg->segno, modified); } +static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) +{ + /* if segs_per_sec is large than 1, we need to keep original policy. */ + if (sbi->segs_per_sec != 1) + return CURSEG_I(sbi, type)->segno; + + if (type == CURSEG_HOT_DATA || IS_NODESEG(type)) + return 0; + + if (SIT_I(sbi)->last_victim[ALLOC_NEXT]) + return SIT_I(sbi)->last_victim[ALLOC_NEXT]; + return CURSEG_I(sbi, type)->segno; +} + /* * Allocate a current working segment. * This function always allocates a free segment in LFS manner. @@ -1187,6 +2018,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) if (test_opt(sbi, NOHEAP)) dir = ALLOC_RIGHT; + segno = __get_next_segno(sbi, type); get_new_segment(sbi, &segno, new_sec, dir); curseg->next_segno = segno; reset_curseg(sbi, type, 1); @@ -1229,7 +2061,7 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, * This function always allocates a used segment(from dirty seglist) by SSR * manner, so it should recover the existing segment information of valid blocks */ -static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse) +static void change_curseg(struct f2fs_sb_info *sbi, int type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -1250,28 +2082,53 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse) curseg->alloc_type = SSR; __next_free_blkoff(sbi, curseg, 0); - if (reuse) { - sum_page = get_sum_page(sbi, new_segno); - sum_node = (struct f2fs_summary_block *)page_address(sum_page); - memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); - f2fs_put_page(sum_page, 1); - } + sum_page = get_sum_page(sbi, new_segno); + sum_node = (struct f2fs_summary_block *)page_address(sum_page); + memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); + f2fs_put_page(sum_page, 1); } static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; + unsigned segno = NULL_SEGNO; + int i, cnt; + bool reversed = false; - if (IS_NODESEG(type) || !has_not_enough_free_secs(sbi, 0, 0)) - return v_ops->get_victim(sbi, - &(curseg)->next_segno, BG_GC, type, SSR); + /* need_SSR() already forces to do this */ + if (v_ops->get_victim(sbi, &segno, BG_GC, type, SSR)) { + curseg->next_segno = segno; + return 1; + } - /* For data segments, let's do SSR more intensively */ - for (; type >= CURSEG_HOT_DATA; type--) - if (v_ops->get_victim(sbi, &(curseg)->next_segno, - BG_GC, type, SSR)) + /* For node segments, let's do SSR more intensively */ + if (IS_NODESEG(type)) { + if (type >= CURSEG_WARM_NODE) { + reversed = true; + i = CURSEG_COLD_NODE; + } else { + i = CURSEG_HOT_NODE; + } + cnt = NR_CURSEG_NODE_TYPE; + } else { + if (type >= CURSEG_WARM_DATA) { + reversed = true; + i = CURSEG_COLD_DATA; + } else { + i = CURSEG_HOT_DATA; + } + cnt = NR_CURSEG_DATA_TYPE; + } + + for (; cnt-- > 0; reversed ? i-- : i++) { + if (i == type) + continue; + if (v_ops->get_victim(sbi, &segno, BG_GC, i, SSR)) { + curseg->next_segno = segno; return 1; + } + } return 0; } @@ -1286,43 +2143,55 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, if (force) new_curseg(sbi, type, true); - else if (type == CURSEG_WARM_NODE) + else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && + type == CURSEG_WARM_NODE) new_curseg(sbi, type, false); else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) new_curseg(sbi, type, false); else if (need_SSR(sbi) && get_ssr_segment(sbi, type)) - change_curseg(sbi, type, true); + change_curseg(sbi, type); else new_curseg(sbi, type, false); stat_inc_seg_type(sbi, curseg); } -static void __allocate_new_segments(struct f2fs_sb_info *sbi, int type) -{ - struct curseg_info *curseg = CURSEG_I(sbi, type); - unsigned int old_segno; - - old_segno = curseg->segno; - SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true); - locate_dirty_segment(sbi, old_segno); -} - void allocate_new_segments(struct f2fs_sb_info *sbi) { + struct curseg_info *curseg; + unsigned int old_segno; int i; - if (test_opt(sbi, LFS)) - return; - - for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) - __allocate_new_segments(sbi, i); + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { + curseg = CURSEG_I(sbi, i); + old_segno = curseg->segno; + SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true); + locate_dirty_segment(sbi, old_segno); + } } static const struct segment_allocation default_salloc_ops = { .allocate_segment = allocate_segment_by_default, }; +bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc) +{ + __u64 trim_start = cpc->trim_start; + bool has_candidate = false; + + mutex_lock(&SIT_I(sbi)->sentry_lock); + for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) { + if (add_discard_addrs(sbi, cpc, true)) { + has_candidate = true; + break; + } + } + mutex_unlock(&SIT_I(sbi)->sentry_lock); + + cpc->trim_start = trim_start; + return has_candidate; +} + int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) { __u64 start = F2FS_BYTES_TO_BLK(range->start); @@ -1373,6 +2242,9 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) schedule(); } + /* It's time to issue all the filed discards */ + mark_discard_range_all(sbi); + f2fs_wait_discard_bios(sbi); out: range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); return err; @@ -1386,87 +2258,91 @@ static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) return false; } -static int __get_segment_type_2(struct page *page, enum page_type p_type) +static int __get_segment_type_2(struct f2fs_io_info *fio) { - if (p_type == DATA) + if (fio->type == DATA) return CURSEG_HOT_DATA; else return CURSEG_HOT_NODE; } -static int __get_segment_type_4(struct page *page, enum page_type p_type) +static int __get_segment_type_4(struct f2fs_io_info *fio) { - if (p_type == DATA) { - struct inode *inode = page->mapping->host; + if (fio->type == DATA) { + struct inode *inode = fio->page->mapping->host; if (S_ISDIR(inode->i_mode)) return CURSEG_HOT_DATA; else return CURSEG_COLD_DATA; } else { - if (IS_DNODE(page) && is_cold_node(page)) + if (IS_DNODE(fio->page) && is_cold_node(fio->page)) return CURSEG_WARM_NODE; else return CURSEG_COLD_NODE; } } -static int __get_segment_type_6(struct page *page, enum page_type p_type) +static int __get_segment_type_6(struct f2fs_io_info *fio) { - if (p_type == DATA) { - struct inode *inode = page->mapping->host; + if (fio->type == DATA) { + struct inode *inode = fio->page->mapping->host; - if (S_ISDIR(inode->i_mode)) - return CURSEG_HOT_DATA; - else if (is_cold_data(page) || file_is_cold(inode)) + if (is_cold_data(fio->page) || file_is_cold(inode)) return CURSEG_COLD_DATA; - else - return CURSEG_WARM_DATA; + if (is_inode_flag_set(inode, FI_HOT_DATA)) + return CURSEG_HOT_DATA; + return CURSEG_WARM_DATA; } else { - if (IS_DNODE(page)) - return is_cold_node(page) ? CURSEG_WARM_NODE : + if (IS_DNODE(fio->page)) + return is_cold_node(fio->page) ? CURSEG_WARM_NODE : CURSEG_HOT_NODE; - else - return CURSEG_COLD_NODE; + return CURSEG_COLD_NODE; } } -static int __get_segment_type(struct page *page, enum page_type p_type) +static int __get_segment_type(struct f2fs_io_info *fio) { - switch (F2FS_P_SB(page)->active_logs) { + int type = 0; + + switch (fio->sbi->active_logs) { case 2: - return __get_segment_type_2(page, p_type); + type = __get_segment_type_2(fio); + break; case 4: - return __get_segment_type_4(page, p_type); + type = __get_segment_type_4(fio); + break; + case 6: + type = __get_segment_type_6(fio); + break; + default: + f2fs_bug_on(fio->sbi, true); } - /* NR_CURSEG_TYPE(6) logs by default */ - f2fs_bug_on(F2FS_P_SB(page), - F2FS_P_SB(page)->active_logs != NR_CURSEG_TYPE); - return __get_segment_type_6(page, p_type); + + if (IS_HOT(type)) + fio->temp = HOT; + else if (IS_WARM(type)) + fio->temp = WARM; + else + fio->temp = COLD; + return type; } void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, - struct f2fs_summary *sum, int type) + struct f2fs_summary *sum, int type, + struct f2fs_io_info *fio, bool add_list) { struct sit_info *sit_i = SIT_I(sbi); - struct curseg_info *curseg; - bool direct_io = (type == CURSEG_DIRECT_IO); - - type = direct_io ? CURSEG_WARM_DATA : type; - - curseg = CURSEG_I(sbi, type); + struct curseg_info *curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); mutex_lock(&sit_i->sentry_lock); - /* direct_io'ed data is aligned to the segment for better performance */ - if (direct_io && curseg->next_blkoff && - !has_not_enough_free_secs(sbi, 0, 0)) - __allocate_new_segments(sbi, type); - *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); + f2fs_wait_discard_bio(sbi, *new_blkaddr); + /* * __add_sum_entry should be resided under the curseg_mutex * because, this function updates a summary entry in the @@ -1481,54 +2357,72 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, if (!__has_curseg_space(sbi, type)) sit_i->s_ops->allocate_segment(sbi, type, false); /* - * SIT information should be updated before segment allocation, - * since SSR needs latest valid block information. + * SIT information should be updated after segment allocation, + * since we need to keep dirty segments precisely under SSR. */ refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); mutex_unlock(&sit_i->sentry_lock); - if (page && IS_NODESEG(type)) + if (page && IS_NODESEG(type)) { fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); + f2fs_inode_chksum_set(sbi, page); + } + + if (add_list) { + struct f2fs_bio_info *io; + + INIT_LIST_HEAD(&fio->list); + fio->in_list = true; + io = sbi->write_io[fio->type] + fio->temp; + spin_lock(&io->io_lock); + list_add_tail(&fio->list, &io->io_list); + spin_unlock(&io->io_lock); + } + mutex_unlock(&curseg->curseg_mutex); } static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { - int type = __get_segment_type(fio->page, fio->type); - - if (fio->type == NODE || fio->type == DATA) - mutex_lock(&fio->sbi->wio_mutex[fio->type]); + int type = __get_segment_type(fio); + int err; +reallocate: allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, - &fio->new_blkaddr, sum, type); + &fio->new_blkaddr, sum, type, fio, true); /* writeout dirty page into bdev */ - f2fs_submit_page_mbio(fio); - - if (fio->type == NODE || fio->type == DATA) - mutex_unlock(&fio->sbi->wio_mutex[fio->type]); + err = f2fs_submit_page_write(fio); + if (err == -EAGAIN) { + fio->old_blkaddr = fio->new_blkaddr; + goto reallocate; + } } -void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) +void write_meta_page(struct f2fs_sb_info *sbi, struct page *page, + enum iostat_type io_type) { struct f2fs_io_info fio = { .sbi = sbi, .type = META, .op = REQ_OP_WRITE, - .op_flags = WRITE_SYNC | REQ_META | REQ_PRIO, + .op_flags = REQ_SYNC | REQ_META | REQ_PRIO, .old_blkaddr = page->index, .new_blkaddr = page->index, .page = page, .encrypted_page = NULL, + .in_list = false, }; if (unlikely(page->index >= MAIN_BLKADDR(sbi))) fio.op_flags &= ~REQ_META; set_page_writeback(page); - f2fs_submit_page_mbio(&fio); + f2fs_submit_page_write(&fio); + + f2fs_update_iostat(sbi, io_type, F2FS_BLKSIZE); } void write_node_page(unsigned int nid, struct f2fs_io_info *fio) @@ -1537,6 +2431,8 @@ void write_node_page(unsigned int nid, struct f2fs_io_info *fio) set_summary(&sum, nid, 0, 0); do_write_page(&sum, fio); + + f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); } void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) @@ -1550,13 +2446,22 @@ void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); do_write_page(&sum, fio); f2fs_update_data_blkaddr(dn, fio->new_blkaddr); + + f2fs_update_iostat(sbi, fio->io_type, F2FS_BLKSIZE); } -void rewrite_data_page(struct f2fs_io_info *fio) +int rewrite_data_page(struct f2fs_io_info *fio) { + int err; + fio->new_blkaddr = fio->old_blkaddr; stat_inc_inplace_blocks(fio->sbi); - f2fs_submit_page_mbio(fio); + + err = f2fs_submit_page_bio(fio); + + f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); + + return err; } void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, @@ -1598,7 +2503,7 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, /* change the current segment */ if (segno != curseg->segno) { curseg->next_segno = segno; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); @@ -1617,7 +2522,7 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (recover_curseg) { if (old_cursegno != curseg->segno) { curseg->next_segno = old_cursegno; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } curseg->next_blkoff = old_blkoff; } @@ -1647,7 +2552,8 @@ void f2fs_wait_on_page_writeback(struct page *page, if (PageWriteback(page)) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); - f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, type, WRITE); + f2fs_submit_merged_write_cond(sbi, page->mapping->host, + 0, page->index, type); if (ordered) wait_on_page_writeback(page); else @@ -1655,8 +2561,7 @@ void f2fs_wait_on_page_writeback(struct page *page, } } -void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi, - block_t blkaddr) +void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr) { struct page *cpage; @@ -2127,9 +3032,9 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) se = get_seg_entry(sbi, segno); /* add discard candidates */ - if (cpc->reason != CP_DISCARD) { + if (!(cpc->reason & CP_DISCARD)) { cpc->trim_start = segno; - add_discard_addrs(sbi, cpc); + add_discard_addrs(sbi, cpc, false); } if (to_journal) { @@ -2163,9 +3068,13 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_bug_on(sbi, !list_empty(head)); f2fs_bug_on(sbi, sit_i->dirty_sentries); out: - if (cpc->reason == CP_DISCARD) { + if (cpc->reason & CP_DISCARD) { + __u64 trim_start = cpc->trim_start; + for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) - add_discard_addrs(sbi, cpc); + add_discard_addrs(sbi, cpc, false); + + cpc->trim_start = trim_start; } mutex_unlock(&sit_i->sentry_lock); @@ -2175,10 +3084,9 @@ out: static int build_sit_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); - struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct sit_info *sit_i; unsigned int sit_segs, start; - char *src_bitmap, *dst_bitmap; + char *src_bitmap; unsigned int bitmap_size; /* allocate memory for SIT information */ @@ -2188,13 +3096,13 @@ static int build_sit_info(struct f2fs_sb_info *sbi) SM_I(sbi)->sit_info = sit_i; - sit_i->sentries = f2fs_kvzalloc(MAIN_SEGS(sbi) * + sit_i->sentries = kvzalloc(MAIN_SEGS(sbi) * sizeof(struct seg_entry), GFP_KERNEL); if (!sit_i->sentries) return -ENOMEM; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); - sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); + sit_i->dirty_sentries_bitmap = kvzalloc(bitmap_size, GFP_KERNEL); if (!sit_i->dirty_sentries_bitmap) return -ENOMEM; @@ -2207,6 +3115,13 @@ static int build_sit_info(struct f2fs_sb_info *sbi) !sit_i->sentries[start].ckpt_valid_map) return -ENOMEM; +#ifdef CONFIG_F2FS_CHECK_FS + sit_i->sentries[start].cur_valid_map_mir + = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + if (!sit_i->sentries[start].cur_valid_map_mir) + return -ENOMEM; +#endif + if (f2fs_discard_en(sbi)) { sit_i->sentries[start].discard_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); @@ -2220,7 +3135,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) return -ENOMEM; if (sbi->segs_per_sec > 1) { - sit_i->sec_entries = f2fs_kvzalloc(MAIN_SECS(sbi) * + sit_i->sec_entries = kvzalloc(MAIN_SECS(sbi) * sizeof(struct sec_entry), GFP_KERNEL); if (!sit_i->sec_entries) return -ENOMEM; @@ -2233,22 +3148,27 @@ static int build_sit_info(struct f2fs_sb_info *sbi) bitmap_size = __bitmap_size(sbi, SIT_BITMAP); src_bitmap = __bitmap_ptr(sbi, SIT_BITMAP); - dst_bitmap = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL); - if (!dst_bitmap) + sit_i->sit_bitmap = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL); + if (!sit_i->sit_bitmap) return -ENOMEM; +#ifdef CONFIG_F2FS_CHECK_FS + sit_i->sit_bitmap_mir = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL); + if (!sit_i->sit_bitmap_mir) + return -ENOMEM; +#endif + /* init SIT information */ sit_i->s_ops = &default_salloc_ops; sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr); sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg; - sit_i->written_valid_blocks = le64_to_cpu(ckpt->valid_block_count); - sit_i->sit_bitmap = dst_bitmap; + sit_i->written_valid_blocks = 0; sit_i->bitmap_size = bitmap_size; sit_i->dirty_sentries = 0; sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK; sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time); - sit_i->mounted_time = CURRENT_TIME_SEC.tv_sec; + sit_i->mounted_time = ktime_get_real_seconds(); mutex_init(&sit_i->sentry_lock); return 0; } @@ -2266,12 +3186,12 @@ static int build_free_segmap(struct f2fs_sb_info *sbi) SM_I(sbi)->free_info = free_i; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); - free_i->free_segmap = f2fs_kvmalloc(bitmap_size, GFP_KERNEL); + free_i->free_segmap = kvmalloc(bitmap_size, GFP_KERNEL); if (!free_i->free_segmap) return -ENOMEM; sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); - free_i->free_secmap = f2fs_kvmalloc(sec_bitmap_size, GFP_KERNEL); + free_i->free_secmap = kvmalloc(sec_bitmap_size, GFP_KERNEL); if (!free_i->free_secmap) return -ENOMEM; @@ -2324,10 +3244,10 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) int sit_blk_cnt = SIT_BLK_CNT(sbi); unsigned int i, start, end; unsigned int readed, start_blk = 0; - int nrpages = MAX_BIO_BLOCKS(sbi) * 8; do { - readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT, true); + readed = ra_meta_pages(sbi, start_blk, BIO_MAX_PAGES, + META_SIT, true); start = start_blk * sit_i->sents_per_block; end = (start_blk + readed) * sit_i->sents_per_block; @@ -2347,10 +3267,17 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) /* build discard map only one time */ if (f2fs_discard_en(sbi)) { - memcpy(se->discard_map, se->cur_valid_map, - SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += sbi->blocks_per_seg - - se->valid_blocks; + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, + SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, + se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += + sbi->blocks_per_seg - + se->valid_blocks; + } } if (sbi->segs_per_sec > 1) @@ -2374,10 +3301,15 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) seg_info_from_raw_sit(se, &sit); if (f2fs_discard_en(sbi)) { - memcpy(se->discard_map, se->cur_valid_map, - SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += old_valid_blocks - - se->valid_blocks; + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, + SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += old_valid_blocks - + se->valid_blocks; + } } if (sbi->segs_per_sec > 1) @@ -2396,6 +3328,9 @@ static void init_free_segmap(struct f2fs_sb_info *sbi) struct seg_entry *sentry = get_seg_entry(sbi, start); if (!sentry->valid_blocks) __set_free(sbi, start); + else + SIT_I(sbi)->written_valid_blocks += + sentry->valid_blocks; } /* set use the current segments */ @@ -2418,7 +3353,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) if (segno >= MAIN_SEGS(sbi)) break; offset = segno + 1; - valid_blocks = get_valid_blocks(sbi, segno, 0); + valid_blocks = get_valid_blocks(sbi, segno, false); if (valid_blocks == sbi->blocks_per_seg || !valid_blocks) continue; if (valid_blocks > sbi->blocks_per_seg) { @@ -2436,7 +3371,7 @@ static int init_victim_secmap(struct f2fs_sb_info *sbi) struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); - dirty_i->victim_secmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); + dirty_i->victim_secmap = kvzalloc(bitmap_size, GFP_KERNEL); if (!dirty_i->victim_secmap) return -ENOMEM; return 0; @@ -2458,7 +3393,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi) bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); for (i = 0; i < NR_DIRTY_TYPE; i++) { - dirty_i->dirty_segmap[i] = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); + dirty_i->dirty_segmap[i] = kvzalloc(bitmap_size, GFP_KERNEL); if (!dirty_i->dirty_segmap[i]) return -ENOMEM; } @@ -2524,22 +3459,22 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; - - INIT_LIST_HEAD(&sm_info->discard_list); - INIT_LIST_HEAD(&sm_info->wait_list); - sm_info->nr_discards = 0; - sm_info->max_discards = 0; + sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS; sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS; INIT_LIST_HEAD(&sm_info->sit_entry_set); - if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) { + if (!f2fs_readonly(sbi->sb)) { err = create_flush_cmd_control(sbi); if (err) return err; } + err = create_discard_cmd_control(sbi); + if (err) + return err; + err = build_sit_info(sbi); if (err) return err; @@ -2633,6 +3568,9 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) if (sit_i->sentries) { for (start = 0; start < MAIN_SEGS(sbi); start++) { kfree(sit_i->sentries[start].cur_valid_map); +#ifdef CONFIG_F2FS_CHECK_FS + kfree(sit_i->sentries[start].cur_valid_map_mir); +#endif kfree(sit_i->sentries[start].ckpt_valid_map); kfree(sit_i->sentries[start].discard_map); } @@ -2645,6 +3583,9 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) SM_I(sbi)->sit_info = NULL; kfree(sit_i->sit_bitmap); +#ifdef CONFIG_F2FS_CHECK_FS + kfree(sit_i->sit_bitmap_mir); +#endif kfree(sit_i); } @@ -2654,7 +3595,8 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) if (!sm_info) return; - destroy_flush_cmd_control(sbi); + destroy_flush_cmd_control(sbi, true); + destroy_discard_cmd_control(sbi); destroy_dirty_segmap(sbi); destroy_curseg(sbi); destroy_free_segmap(sbi); @@ -2670,15 +3612,15 @@ int __init create_segment_manager_caches(void) if (!discard_entry_slab) goto fail; - bio_entry_slab = f2fs_kmem_cache_create("bio_entry", - sizeof(struct bio_entry)); - if (!bio_entry_slab) + discard_cmd_slab = f2fs_kmem_cache_create("discard_cmd", + sizeof(struct discard_cmd)); + if (!discard_cmd_slab) goto destroy_discard_entry; sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set", sizeof(struct sit_entry_set)); if (!sit_entry_set_slab) - goto destroy_bio_entry; + goto destroy_discard_cmd; inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry", sizeof(struct inmem_pages)); @@ -2688,8 +3630,8 @@ int __init create_segment_manager_caches(void) destroy_sit_entry_set: kmem_cache_destroy(sit_entry_set_slab); -destroy_bio_entry: - kmem_cache_destroy(bio_entry_slab); +destroy_discard_cmd: + kmem_cache_destroy(discard_cmd_slab); destroy_discard_entry: kmem_cache_destroy(discard_entry_slab); fail: @@ -2699,7 +3641,7 @@ fail: void destroy_segment_manager_caches(void) { kmem_cache_destroy(sit_entry_set_slab); - kmem_cache_destroy(bio_entry_slab); + kmem_cache_destroy(discard_cmd_slab); kmem_cache_destroy(discard_entry_slab); kmem_cache_destroy(inmem_entry_slab); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index b164f8339281..e0a6cc23ace3 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -18,79 +18,91 @@ #define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */ #define DEF_MAX_RECLAIM_PREFREE_SEGMENTS 4096 /* 8GB in maximum */ -/* L: Logical segment # in volume, R: Relative segment # in main area */ -#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) -#define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno) +#define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */ -#define IS_DATASEG(t) (t <= CURSEG_COLD_DATA) -#define IS_NODESEG(t) (t >= CURSEG_HOT_NODE) +/* L: Logical segment # in volume, R: Relative segment # in main area */ +#define GET_L2R_SEGNO(free_i, segno) ((segno) - (free_i)->start_segno) +#define GET_R2L_SEGNO(free_i, segno) ((segno) + (free_i)->start_segno) + +#define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA) +#define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE) + +#define IS_HOT(t) ((t) == CURSEG_HOT_NODE || (t) == CURSEG_HOT_DATA) +#define IS_WARM(t) ((t) == CURSEG_WARM_NODE || (t) == CURSEG_WARM_DATA) +#define IS_COLD(t) ((t) == CURSEG_COLD_NODE || (t) == CURSEG_COLD_DATA) #define IS_CURSEG(sbi, seg) \ - ((seg == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno)) + (((seg) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno)) #define IS_CURSEC(sbi, secno) \ - ((secno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ - sbi->segs_per_sec)) \ + (((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ + (sbi)->segs_per_sec)) \ #define MAIN_BLKADDR(sbi) (SM_I(sbi)->main_blkaddr) #define SEG0_BLKADDR(sbi) (SM_I(sbi)->seg0_blkaddr) #define MAIN_SEGS(sbi) (SM_I(sbi)->main_segments) -#define MAIN_SECS(sbi) (sbi->total_sections) +#define MAIN_SECS(sbi) ((sbi)->total_sections) #define TOTAL_SEGS(sbi) (SM_I(sbi)->segment_count) -#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << sbi->log_blocks_per_seg) +#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << (sbi)->log_blocks_per_seg) #define MAX_BLKADDR(sbi) (SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi)) -#define SEGMENT_SIZE(sbi) (1ULL << (sbi->log_blocksize + \ - sbi->log_blocks_per_seg)) +#define SEGMENT_SIZE(sbi) (1ULL << ((sbi)->log_blocksize + \ + (sbi)->log_blocks_per_seg)) #define START_BLOCK(sbi, segno) (SEG0_BLKADDR(sbi) + \ - (GET_R2L_SEGNO(FREE_I(sbi), segno) << sbi->log_blocks_per_seg)) + (GET_R2L_SEGNO(FREE_I(sbi), segno) << (sbi)->log_blocks_per_seg)) #define NEXT_FREE_BLKADDR(sbi, curseg) \ - (START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff) + (START_BLOCK(sbi, (curseg)->segno) + (curseg)->next_blkoff) #define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) ((blk_addr) - SEG0_BLKADDR(sbi)) #define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \ - (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg) + (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> (sbi)->log_blocks_per_seg) #define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \ - (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & (sbi->blocks_per_seg - 1)) + (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & ((sbi)->blocks_per_seg - 1)) #define GET_SEGNO(sbi, blk_addr) \ - (((blk_addr == NULL_ADDR) || (blk_addr == NEW_ADDR)) ? \ + ((((blk_addr) == NULL_ADDR) || ((blk_addr) == NEW_ADDR)) ? \ NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ GET_SEGNO_FROM_SEG0(sbi, blk_addr))) -#define GET_SECNO(sbi, segno) \ - ((segno) / sbi->segs_per_sec) -#define GET_ZONENO_FROM_SEGNO(sbi, segno) \ - ((segno / sbi->segs_per_sec) / sbi->secs_per_zone) +#define BLKS_PER_SEC(sbi) \ + ((sbi)->segs_per_sec * (sbi)->blocks_per_seg) +#define GET_SEC_FROM_SEG(sbi, segno) \ + ((segno) / (sbi)->segs_per_sec) +#define GET_SEG_FROM_SEC(sbi, secno) \ + ((secno) * (sbi)->segs_per_sec) +#define GET_ZONE_FROM_SEC(sbi, secno) \ + ((secno) / (sbi)->secs_per_zone) +#define GET_ZONE_FROM_SEG(sbi, segno) \ + GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno)) #define GET_SUM_BLOCK(sbi, segno) \ - ((sbi->sm_info->ssa_blkaddr) + segno) + ((sbi)->sm_info->ssa_blkaddr + (segno)) #define GET_SUM_TYPE(footer) ((footer)->entry_type) -#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = type) +#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = (type)) #define SIT_ENTRY_OFFSET(sit_i, segno) \ - (segno % sit_i->sents_per_block) + ((segno) % (sit_i)->sents_per_block) #define SIT_BLOCK_OFFSET(segno) \ - (segno / SIT_ENTRY_PER_BLOCK) + ((segno) / SIT_ENTRY_PER_BLOCK) #define START_SEGNO(segno) \ (SIT_BLOCK_OFFSET(segno) * SIT_ENTRY_PER_BLOCK) #define SIT_BLK_CNT(sbi) \ @@ -101,9 +113,7 @@ #define SECTOR_FROM_BLOCK(blk_addr) \ (((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK) #define SECTOR_TO_BLOCK(sectors) \ - (sectors >> F2FS_LOG_SECTORS_PER_BLOCK) -#define MAX_BIO_BLOCKS(sbi) \ - ((int)min((int)max_hw_blocks(sbi), BIO_MAX_PAGES)) + ((sectors) >> F2FS_LOG_SECTORS_PER_BLOCK) /* * indicate a block allocation direction: RIGHT and LEFT. @@ -132,7 +142,10 @@ enum { */ enum { GC_CB = 0, - GC_GREEDY + GC_GREEDY, + ALLOC_NEXT, + FLUSH_DEVICE, + MAX_GC_POLICY, }; /* @@ -164,6 +177,9 @@ struct seg_entry { unsigned int ckpt_valid_blocks:10; /* # of valid blocks last cp */ unsigned int padding:6; /* padding */ unsigned char *cur_valid_map; /* validity bitmap of blocks */ +#ifdef CONFIG_F2FS_CHECK_FS + unsigned char *cur_valid_map_mir; /* mirror of current valid bitmap */ +#endif /* * # of valid blocks and the validity bitmap stored in the the last * checkpoint pack. This information is used by the SSR mode. @@ -186,9 +202,12 @@ struct segment_allocation { * the page is atomically written, and it is in inmem_pages list. */ #define ATOMIC_WRITTEN_PAGE ((unsigned long)-1) +#define DUMMY_WRITTEN_PAGE ((unsigned long)-2) #define IS_ATOMIC_WRITTEN_PAGE(page) \ (page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE) +#define IS_DUMMY_WRITTEN_PAGE(page) \ + (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) struct inmem_pages { struct list_head list; @@ -203,6 +222,9 @@ struct sit_info { block_t sit_blocks; /* # of blocks used by SIT area */ block_t written_valid_blocks; /* # of valid blocks in main area */ char *sit_bitmap; /* SIT bitmap pointer */ +#ifdef CONFIG_F2FS_CHECK_FS + char *sit_bitmap_mir; /* SIT bitmap mirror */ +#endif unsigned int bitmap_size; /* SIT bitmap size */ unsigned long *tmp_map; /* bitmap for temporal use */ @@ -218,6 +240,8 @@ struct sit_info { unsigned long long mounted_time; /* mount time */ unsigned long long min_mtime; /* min. modification time */ unsigned long long max_mtime; /* max. modification time */ + + unsigned int last_victim[MAX_GC_POLICY]; /* last victim segment # */ }; struct free_segmap_info { @@ -294,17 +318,17 @@ static inline struct sec_entry *get_sec_entry(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); - return &sit_i->sec_entries[GET_SECNO(sbi, segno)]; + return &sit_i->sec_entries[GET_SEC_FROM_SEG(sbi, segno)]; } static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi, - unsigned int segno, int section) + unsigned int segno, bool use_section) { /* * In order to get # of valid blocks in a section instantly from many * segments, f2fs manages two counting structures separately. */ - if (section > 1) + if (use_section && sbi->segs_per_sec > 1) return get_sec_entry(sbi, segno)->valid_blocks; else return get_seg_entry(sbi, segno)->valid_blocks; @@ -317,6 +341,9 @@ static inline void seg_info_from_raw_sit(struct seg_entry *se, se->ckpt_valid_blocks = GET_SIT_VBLOCKS(rs); memcpy(se->cur_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); +#ifdef CONFIG_F2FS_CHECK_FS + memcpy(se->cur_valid_map_mir, rs->valid_map, SIT_VBLOCK_MAP_SIZE); +#endif se->type = GET_SIT_TYPE(rs); se->mtime = le64_to_cpu(rs->mtime); } @@ -346,8 +373,8 @@ static inline unsigned int find_next_inuse(struct free_segmap_info *free_i, static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; - unsigned int start_segno = secno * sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int next; spin_lock(&free_i->segmap_lock); @@ -367,7 +394,8 @@ static inline void __set_inuse(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + set_bit(segno, free_i->free_segmap); free_i->free_segments--; if (!test_and_set_bit(secno, free_i->free_secmap)) @@ -378,8 +406,8 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; - unsigned int start_segno = secno * sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int next; spin_lock(&free_i->segmap_lock); @@ -400,7 +428,8 @@ static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + spin_lock(&free_i->segmap_lock); if (!test_and_set_bit(segno, free_i->free_segmap)) { free_i->free_segments--; @@ -414,6 +443,12 @@ static inline void get_sit_bitmap(struct f2fs_sb_info *sbi, void *dst_addr) { struct sit_info *sit_i = SIT_I(sbi); + +#ifdef CONFIG_F2FS_CHECK_FS + if (memcmp(sit_i->sit_bitmap, sit_i->sit_bitmap_mir, + sit_i->bitmap_size)) + f2fs_bug_on(sbi, 1); +#endif memcpy(dst_addr, sit_i->sit_bitmap, sit_i->bitmap_size); } @@ -457,26 +492,9 @@ static inline int overprovision_segments(struct f2fs_sb_info *sbi) return SM_I(sbi)->ovp_segments; } -static inline int overprovision_sections(struct f2fs_sb_info *sbi) -{ - return ((unsigned int) overprovision_segments(sbi)) / sbi->segs_per_sec; -} - static inline int reserved_sections(struct f2fs_sb_info *sbi) { - return ((unsigned int) reserved_segments(sbi)) / sbi->segs_per_sec; -} - -static inline bool need_SSR(struct f2fs_sb_info *sbi) -{ - int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); - int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); - - if (test_opt(sbi, LFS)) - return false; - - return free_sections(sbi) <= (node_secs + 2 * dent_secs + - reserved_sections(sbi) + 1); + return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi)); } static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, @@ -484,14 +502,14 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); - - node_secs += get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); + int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; return (free_sections(sbi) + freed) <= - (node_secs + 2 * dent_secs + reserved_sections(sbi) + needed); + (node_secs + 2 * dent_secs + imeta_secs + + reserved_sections(sbi) + needed); } static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi) @@ -521,6 +539,7 @@ static inline int utilization(struct f2fs_sb_info *sbi) */ #define DEF_MIN_IPU_UTIL 70 #define DEF_MIN_FSYNC_BLOCKS 8 +#define DEF_MIN_HOT_BLOCKS 16 enum { F2FS_IPU_FORCE, @@ -528,20 +547,22 @@ enum { F2FS_IPU_UTIL, F2FS_IPU_SSR_UTIL, F2FS_IPU_FSYNC, + F2FS_IPU_ASYNC, }; -static inline bool need_inplace_update(struct inode *inode) +static inline bool need_inplace_update_policy(struct inode *inode, + struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned int policy = SM_I(sbi)->ipu_policy; - /* IPU can be done only for the user data */ - if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode)) - return false; - if (test_opt(sbi, LFS)) return false; + /* if this is cold file, we should overwrite to avoid fragmentation */ + if (file_is_cold(inode)) + return true; + if (policy & (0x1 << F2FS_IPU_FORCE)) return true; if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi)) @@ -553,6 +574,15 @@ static inline bool need_inplace_update(struct inode *inode) utilization(sbi) > SM_I(sbi)->min_ipu_util) return true; + /* + * IPU for rewrite async pages + */ + if (policy & (0x1 << F2FS_IPU_ASYNC) && + fio && fio->op == REQ_OP_WRITE && + !(fio->op_flags & REQ_SYNC) && + !f2fs_encrypted_inode(inode)) + return true; + /* this is only set during fdatasync */ if (policy & (0x1 << F2FS_IPU_FSYNC) && is_inode_flag_set(inode, FI_NEED_IPU)) @@ -633,6 +663,12 @@ static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi, check_seg_range(sbi, start); +#ifdef CONFIG_F2FS_CHECK_FS + if (f2fs_test_bit(offset, sit_i->sit_bitmap) != + f2fs_test_bit(offset, sit_i->sit_bitmap_mir)) + f2fs_bug_on(sbi, 1); +#endif + /* calculate sit block address */ if (f2fs_test_bit(offset, sit_i->sit_bitmap)) blk_addr += sit_i->sit_blocks; @@ -658,13 +694,17 @@ static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start) unsigned int block_off = SIT_BLOCK_OFFSET(start); f2fs_change_bit(block_off, sit_i->sit_bitmap); +#ifdef CONFIG_F2FS_CHECK_FS + f2fs_change_bit(block_off, sit_i->sit_bitmap_mir); +#endif } static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi) { struct sit_info *sit_i = SIT_I(sbi); - return sit_i->elapsed_time + CURRENT_TIME_SEC.tv_sec - - sit_i->mounted_time; + time64_t now = ktime_get_real_seconds(); + + return sit_i->elapsed_time + now - sit_i->mounted_time; } static inline void set_summary(struct f2fs_summary *sum, nid_t nid, @@ -691,7 +731,7 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) static inline bool no_fggc_candidate(struct f2fs_sb_info *sbi, unsigned int secno) { - if (get_valid_blocks(sbi, secno, sbi->segs_per_sec) >= + if (get_valid_blocks(sbi, GET_SEG_FROM_SEC(sbi, secno), true) >= sbi->fggc_threshold) return true; return false; @@ -704,19 +744,12 @@ static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) return false; } -static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi) -{ - struct block_device *bdev = sbi->sb->s_bdev; - struct request_queue *q = bdev_get_queue(bdev); - return SECTOR_TO_BLOCK(queue_max_sectors(q)); -} - /* * It is very important to gather dirty pages and write at once, so that we can * submit a big bio without interfering other data writes. * By default, 512 pages for directory data, - * 512 pages (2MB) * 3 for three types of nodes, and - * max_bio_blocks for meta are set. + * 512 pages (2MB) * 8 for nodes, and + * 256 pages * 8 for meta are set. */ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) { @@ -728,7 +761,7 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) else if (type == NODE) return 8 * sbi->blocks_per_seg; else if (type == META) - return 8 * MAX_BIO_BLOCKS(sbi); + return 8 * BIO_MAX_PAGES; else return 0; } @@ -745,12 +778,35 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, return 0; nr_to_write = wbc->nr_to_write; - + desired = BIO_MAX_PAGES; if (type == NODE) - desired = 2 * max_hw_blocks(sbi); - else - desired = MAX_BIO_BLOCKS(sbi); + desired <<= 1; wbc->nr_to_write = desired; return desired - nr_to_write; } + +static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + bool wakeup = false; + int i; + + if (force) + goto wake_up; + + mutex_lock(&dcc->cmd_lock); + for (i = MAX_PLIST_NUM - 1; + i >= 0 && plist_issue(dcc->pend_list_tag[i]); i--) { + if (!list_empty(&dcc->pend_list[i])) { + wakeup = true; + break; + } + } + mutex_unlock(&dcc->cmd_lock); + if (!wakeup) + return; +wake_up: + dcc->discard_wake = 1; + wake_up_interruptible_all(&dcc->discard_wait_queue); +} diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 46c915425923..5c60fc28ec75 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -21,14 +21,16 @@ static unsigned int shrinker_run_no; static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) { - return NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt; + long count = NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt; + + return count > 0 ? count : 0; } static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) { - if (NM_I(sbi)->fcnt > MAX_FREE_NIDS) - return NM_I(sbi)->fcnt - MAX_FREE_NIDS; - return 0; + long count = NM_I(sbi)->nid_cnt[FREE_NID_LIST] - MAX_FREE_NIDS; + + return count > 0 ? count : 0; } static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b81998e9e91d..511bcecf522a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -22,8 +22,10 @@ #include #include #include +#include #include #include +#include #include "f2fs.h" #include "node.h" @@ -35,9 +37,7 @@ #define CREATE_TRACE_POINTS #include -static struct proc_dir_entry *f2fs_proc_root; static struct kmem_cache *f2fs_inode_cachep; -static struct kset *f2fs_kset; #ifdef CONFIG_F2FS_FAULT_INJECTION @@ -49,6 +49,7 @@ char *fault_name[FAULT_MAX] = { [FAULT_BLOCK] = "no more block", [FAULT_DIR_DEPTH] = "too big dir depth", [FAULT_EVICT_INODE] = "evict_inode fail", + [FAULT_TRUNCATE] = "truncate fail", [FAULT_IO] = "IO error", [FAULT_CHECKPOINT] = "checkpoint error", }; @@ -82,6 +83,7 @@ enum { Opt_discard, Opt_nodiscard, Opt_noheap, + Opt_heap, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, @@ -89,6 +91,7 @@ enum { Opt_active_logs, Opt_disable_ext_identify, Opt_inline_xattr, + Opt_noinline_xattr, Opt_inline_data, Opt_inline_dentry, Opt_noinline_dentry, @@ -101,9 +104,24 @@ enum { Opt_noinline_data, Opt_data_flush, Opt_mode, + Opt_io_size_bits, Opt_fault_injection, Opt_lazytime, Opt_nolazytime, + Opt_quota, + Opt_noquota, + Opt_usrquota, + Opt_grpquota, + Opt_prjquota, + Opt_usrjquota, + Opt_grpjquota, + Opt_prjjquota, + Opt_offusrjquota, + Opt_offgrpjquota, + Opt_offprjjquota, + Opt_jqfmt_vfsold, + Opt_jqfmt_vfsv0, + Opt_jqfmt_vfsv1, Opt_err, }; @@ -114,6 +132,7 @@ static match_table_t f2fs_tokens = { {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, {Opt_noheap, "no_heap"}, + {Opt_heap, "heap"}, {Opt_user_xattr, "user_xattr"}, {Opt_nouser_xattr, "nouser_xattr"}, {Opt_acl, "acl"}, @@ -121,6 +140,7 @@ static match_table_t f2fs_tokens = { {Opt_active_logs, "active_logs=%u"}, {Opt_disable_ext_identify, "disable_ext_identify"}, {Opt_inline_xattr, "inline_xattr"}, + {Opt_noinline_xattr, "noinline_xattr"}, {Opt_inline_data, "inline_data"}, {Opt_inline_dentry, "inline_dentry"}, {Opt_noinline_dentry, "noinline_dentry"}, @@ -133,210 +153,27 @@ static match_table_t f2fs_tokens = { {Opt_noinline_data, "noinline_data"}, {Opt_data_flush, "data_flush"}, {Opt_mode, "mode=%s"}, + {Opt_io_size_bits, "io_bits=%u"}, {Opt_fault_injection, "fault_injection=%u"}, {Opt_lazytime, "lazytime"}, {Opt_nolazytime, "nolazytime"}, + {Opt_quota, "quota"}, + {Opt_noquota, "noquota"}, + {Opt_usrquota, "usrquota"}, + {Opt_grpquota, "grpquota"}, + {Opt_prjquota, "prjquota"}, + {Opt_usrjquota, "usrjquota=%s"}, + {Opt_grpjquota, "grpjquota=%s"}, + {Opt_prjjquota, "prjjquota=%s"}, + {Opt_offusrjquota, "usrjquota="}, + {Opt_offgrpjquota, "grpjquota="}, + {Opt_offprjjquota, "prjjquota="}, + {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, + {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, + {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, {Opt_err, NULL}, }; -/* Sysfs support for f2fs */ -enum { - GC_THREAD, /* struct f2fs_gc_thread */ - SM_INFO, /* struct f2fs_sm_info */ - NM_INFO, /* struct f2fs_nm_info */ - F2FS_SBI, /* struct f2fs_sb_info */ -#ifdef CONFIG_F2FS_FAULT_INJECTION - FAULT_INFO_RATE, /* struct f2fs_fault_info */ - FAULT_INFO_TYPE, /* struct f2fs_fault_info */ -#endif -}; - -struct f2fs_attr { - struct attribute attr; - ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *); - ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *, - const char *, size_t); - int struct_type; - int offset; -}; - -static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) -{ - if (struct_type == GC_THREAD) - return (unsigned char *)sbi->gc_thread; - else if (struct_type == SM_INFO) - return (unsigned char *)SM_I(sbi); - else if (struct_type == NM_INFO) - return (unsigned char *)NM_I(sbi); - else if (struct_type == F2FS_SBI) - return (unsigned char *)sbi; -#ifdef CONFIG_F2FS_FAULT_INJECTION - else if (struct_type == FAULT_INFO_RATE || - struct_type == FAULT_INFO_TYPE) - return (unsigned char *)&sbi->fault_info; -#endif - return NULL; -} - -static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, - struct f2fs_sb_info *sbi, char *buf) -{ - struct super_block *sb = sbi->sb; - - if (!sb->s_bdev->bd_part) - return snprintf(buf, PAGE_SIZE, "0\n"); - - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)(sbi->kbytes_written + - BD_PART_WRITTEN(sbi))); -} - -static ssize_t f2fs_sbi_show(struct f2fs_attr *a, - struct f2fs_sb_info *sbi, char *buf) -{ - unsigned char *ptr = NULL; - unsigned int *ui; - - ptr = __struct_ptr(sbi, a->struct_type); - if (!ptr) - return -EINVAL; - - ui = (unsigned int *)(ptr + a->offset); - - return snprintf(buf, PAGE_SIZE, "%u\n", *ui); -} - -static ssize_t f2fs_sbi_store(struct f2fs_attr *a, - struct f2fs_sb_info *sbi, - const char *buf, size_t count) -{ - unsigned char *ptr; - unsigned long t; - unsigned int *ui; - ssize_t ret; - - ptr = __struct_ptr(sbi, a->struct_type); - if (!ptr) - return -EINVAL; - - ui = (unsigned int *)(ptr + a->offset); - - ret = kstrtoul(skip_spaces(buf), 0, &t); - if (ret < 0) - return ret; -#ifdef CONFIG_F2FS_FAULT_INJECTION - if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX)) - return -EINVAL; -#endif - *ui = t; - return count; -} - -static ssize_t f2fs_attr_show(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, - s_kobj); - struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); - - return a->show ? a->show(a, sbi, buf) : 0; -} - -static ssize_t f2fs_attr_store(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t len) -{ - struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, - s_kobj); - struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); - - return a->store ? a->store(a, sbi, buf, len) : 0; -} - -static void f2fs_sb_release(struct kobject *kobj) -{ - struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, - s_kobj); - complete(&sbi->s_kobj_unregister); -} - -#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \ -static struct f2fs_attr f2fs_attr_##_name = { \ - .attr = {.name = __stringify(_name), .mode = _mode }, \ - .show = _show, \ - .store = _store, \ - .struct_type = _struct_type, \ - .offset = _offset \ -} - -#define F2FS_RW_ATTR(struct_type, struct_name, name, elname) \ - F2FS_ATTR_OFFSET(struct_type, name, 0644, \ - f2fs_sbi_show, f2fs_sbi_store, \ - offsetof(struct struct_name, elname)) - -#define F2FS_GENERAL_RO_ATTR(name) \ -static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL) - -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); -F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); -F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); -F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); -#ifdef CONFIG_F2FS_FAULT_INJECTION -F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); -F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); -#endif -F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); - -#define ATTR_LIST(name) (&f2fs_attr_##name.attr) -static struct attribute *f2fs_attrs[] = { - ATTR_LIST(gc_min_sleep_time), - ATTR_LIST(gc_max_sleep_time), - ATTR_LIST(gc_no_gc_sleep_time), - ATTR_LIST(gc_idle), - ATTR_LIST(reclaim_segments), - ATTR_LIST(max_small_discards), - ATTR_LIST(batched_trim_sections), - ATTR_LIST(ipu_policy), - ATTR_LIST(min_ipu_util), - ATTR_LIST(min_fsync_blocks), - ATTR_LIST(max_victim_search), - ATTR_LIST(dir_level), - ATTR_LIST(ram_thresh), - ATTR_LIST(ra_nid_pages), - ATTR_LIST(dirty_nats_ratio), - ATTR_LIST(cp_interval), - ATTR_LIST(idle_interval), -#ifdef CONFIG_F2FS_FAULT_INJECTION - ATTR_LIST(inject_rate), - ATTR_LIST(inject_type), -#endif - ATTR_LIST(lifetime_write_kbytes), - NULL, -}; - -static const struct sysfs_ops f2fs_attr_ops = { - .show = f2fs_attr_show, - .store = f2fs_attr_store, -}; - -static struct kobj_type f2fs_ktype = { - .default_attrs = f2fs_attrs, - .sysfs_ops = &f2fs_attr_ops, - .release = f2fs_sb_release, -}; - void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) { struct va_format vaf; @@ -345,7 +182,7 @@ void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - printk("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf); + printk_ratelimited("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf); va_end(args); } @@ -356,6 +193,104 @@ static void init_once(void *foo) inode_init_once(&fi->vfs_inode); } +#ifdef CONFIG_QUOTA +static const char * const quotatypes[] = INITQFNAMES; +#define QTYPE2NAME(t) (quotatypes[t]) +static int f2fs_set_qf_name(struct super_block *sb, int qtype, + substring_t *args) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + char *qname; + int ret = -EINVAL; + + if (sb_any_quota_loaded(sb) && !sbi->s_qf_names[qtype]) { + f2fs_msg(sb, KERN_ERR, + "Cannot change journaled " + "quota options when quota turned on"); + return -EINVAL; + } + qname = match_strdup(args); + if (!qname) { + f2fs_msg(sb, KERN_ERR, + "Not enough memory for storing quotafile name"); + return -EINVAL; + } + if (sbi->s_qf_names[qtype]) { + if (strcmp(sbi->s_qf_names[qtype], qname) == 0) + ret = 0; + else + f2fs_msg(sb, KERN_ERR, + "%s quota file already specified", + QTYPE2NAME(qtype)); + goto errout; + } + if (strchr(qname, '/')) { + f2fs_msg(sb, KERN_ERR, + "quotafile must be on filesystem root"); + goto errout; + } + sbi->s_qf_names[qtype] = qname; + set_opt(sbi, QUOTA); + return 0; +errout: + kfree(qname); + return ret; +} + +static int f2fs_clear_qf_name(struct super_block *sb, int qtype) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + if (sb_any_quota_loaded(sb) && sbi->s_qf_names[qtype]) { + f2fs_msg(sb, KERN_ERR, "Cannot change journaled quota options" + " when quota turned on"); + return -EINVAL; + } + kfree(sbi->s_qf_names[qtype]); + sbi->s_qf_names[qtype] = NULL; + return 0; +} + +static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) +{ + /* + * We do the test below only for project quotas. 'usrquota' and + * 'grpquota' mount options are allowed even without quota feature + * to support legacy quotas in quota files. + */ + if (test_opt(sbi, PRJQUOTA) && !f2fs_sb_has_project_quota(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_ERR, "Project quota feature not enabled. " + "Cannot enable project quota enforcement."); + return -1; + } + if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA] || + sbi->s_qf_names[PRJQUOTA]) { + if (test_opt(sbi, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) + clear_opt(sbi, USRQUOTA); + + if (test_opt(sbi, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) + clear_opt(sbi, GRPQUOTA); + + if (test_opt(sbi, PRJQUOTA) && sbi->s_qf_names[PRJQUOTA]) + clear_opt(sbi, PRJQUOTA); + + if (test_opt(sbi, GRPQUOTA) || test_opt(sbi, USRQUOTA) || + test_opt(sbi, PRJQUOTA)) { + f2fs_msg(sbi->sb, KERN_ERR, "old and new quota " + "format mixing"); + return -1; + } + + if (!sbi->s_jquota_fmt) { + f2fs_msg(sbi->sb, KERN_ERR, "journaled quota format " + "not specified"); + return -1; + } + } + return 0; +} +#endif + static int parse_options(struct super_block *sb, char *options) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -363,6 +298,9 @@ static int parse_options(struct super_block *sb, char *options) substring_t args[MAX_OPT_ARGS]; char *p, *name; int arg = 0; +#ifdef CONFIG_QUOTA + int ret; +#endif if (!options) return 0; @@ -412,17 +350,26 @@ static int parse_options(struct super_block *sb, char *options) q = bdev_get_queue(sb->s_bdev); if (blk_queue_discard(q)) { set_opt(sbi, DISCARD); - } else { + } else if (!f2fs_sb_mounted_blkzoned(sb)) { f2fs_msg(sb, KERN_WARNING, "mounting with \"discard\" option, but " "the device does not support discard"); } break; case Opt_nodiscard: + if (f2fs_sb_mounted_blkzoned(sb)) { + f2fs_msg(sb, KERN_WARNING, + "discard is required for zoned block devices"); + return -EINVAL; + } clear_opt(sbi, DISCARD); + break; case Opt_noheap: set_opt(sbi, NOHEAP); break; + case Opt_heap: + clear_opt(sbi, NOHEAP); + break; #ifdef CONFIG_F2FS_FS_XATTR case Opt_user_xattr: set_opt(sbi, XATTR_USER); @@ -433,6 +380,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_inline_xattr: set_opt(sbi, INLINE_XATTR); break; + case Opt_noinline_xattr: + clear_opt(sbi, INLINE_XATTR); + break; #else case Opt_user_xattr: f2fs_msg(sb, KERN_INFO, @@ -446,6 +396,10 @@ static int parse_options(struct super_block *sb, char *options) f2fs_msg(sb, KERN_INFO, "inline_xattr options not supported"); break; + case Opt_noinline_xattr: + f2fs_msg(sb, KERN_INFO, + "noinline_xattr options not supported"); + break; #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL case Opt_acl: @@ -512,6 +466,13 @@ static int parse_options(struct super_block *sb, char *options) return -ENOMEM; if (strlen(name) == 8 && !strncmp(name, "adaptive", 8)) { + if (f2fs_sb_mounted_blkzoned(sb)) { + f2fs_msg(sb, KERN_WARNING, + "adaptive mode is not allowed with " + "zoned block device feature"); + kfree(name); + return -EINVAL; + } set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); } else if (strlen(name) == 3 && !strncmp(name, "lfs", 3)) { @@ -522,11 +483,23 @@ static int parse_options(struct super_block *sb, char *options) } kfree(name); break; + case Opt_io_size_bits: + if (args->from && match_int(args, &arg)) + return -EINVAL; + if (arg > __ilog2_u32(BIO_MAX_PAGES)) { + f2fs_msg(sb, KERN_WARNING, + "Not support %d, larger than %d", + 1 << arg, BIO_MAX_PAGES); + return -EINVAL; + } + sbi->write_io_size_bits = arg; + break; case Opt_fault_injection: if (args->from && match_int(args, &arg)) return -EINVAL; #ifdef CONFIG_F2FS_FAULT_INJECTION f2fs_build_fault_attr(sbi, arg); + set_opt(sbi, FAULT_INJECTION); #else f2fs_msg(sb, KERN_INFO, "FAULT_INJECTION was not selected"); @@ -538,6 +511,81 @@ static int parse_options(struct super_block *sb, char *options) case Opt_nolazytime: sb->s_flags &= ~MS_LAZYTIME; break; +#ifdef CONFIG_QUOTA + case Opt_quota: + case Opt_usrquota: + set_opt(sbi, USRQUOTA); + break; + case Opt_grpquota: + set_opt(sbi, GRPQUOTA); + break; + case Opt_prjquota: + set_opt(sbi, PRJQUOTA); + break; + case Opt_usrjquota: + ret = f2fs_set_qf_name(sb, USRQUOTA, &args[0]); + if (ret) + return ret; + break; + case Opt_grpjquota: + ret = f2fs_set_qf_name(sb, GRPQUOTA, &args[0]); + if (ret) + return ret; + break; + case Opt_prjjquota: + ret = f2fs_set_qf_name(sb, PRJQUOTA, &args[0]); + if (ret) + return ret; + break; + case Opt_offusrjquota: + ret = f2fs_clear_qf_name(sb, USRQUOTA); + if (ret) + return ret; + break; + case Opt_offgrpjquota: + ret = f2fs_clear_qf_name(sb, GRPQUOTA); + if (ret) + return ret; + break; + case Opt_offprjjquota: + ret = f2fs_clear_qf_name(sb, PRJQUOTA); + if (ret) + return ret; + break; + case Opt_jqfmt_vfsold: + sbi->s_jquota_fmt = QFMT_VFS_OLD; + break; + case Opt_jqfmt_vfsv0: + sbi->s_jquota_fmt = QFMT_VFS_V0; + break; + case Opt_jqfmt_vfsv1: + sbi->s_jquota_fmt = QFMT_VFS_V1; + break; + case Opt_noquota: + clear_opt(sbi, QUOTA); + clear_opt(sbi, USRQUOTA); + clear_opt(sbi, GRPQUOTA); + clear_opt(sbi, PRJQUOTA); + break; +#else + case Opt_quota: + case Opt_usrquota: + case Opt_grpquota: + case Opt_prjquota: + case Opt_usrjquota: + case Opt_grpjquota: + case Opt_prjjquota: + case Opt_offusrjquota: + case Opt_offgrpjquota: + case Opt_offprjjquota: + case Opt_jqfmt_vfsold: + case Opt_jqfmt_vfsv0: + case Opt_jqfmt_vfsv1: + case Opt_noquota: + f2fs_msg(sb, KERN_INFO, + "quota operations not supported"); + break; +#endif default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -545,6 +593,17 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; } } +#ifdef CONFIG_QUOTA + if (f2fs_check_quota_options(sbi)) + return -EINVAL; +#endif + + if (F2FS_IO_SIZE_BITS(sbi) && !test_opt(sbi, LFS)) { + f2fs_msg(sb, KERN_ERR, + "Should set mode=lfs with %uKB-sized IO", + F2FS_IO_SIZE_KB(sbi)); + return -EINVAL; + } return 0; } @@ -570,14 +629,22 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) mutex_init(&fi->inmem_lock); init_rwsem(&fi->dio_rwsem[READ]); init_rwsem(&fi->dio_rwsem[WRITE]); + init_rwsem(&fi->i_mmap_sem); + init_rwsem(&fi->i_xattr_sem); +#ifdef CONFIG_QUOTA + memset(&fi->i_dquot, 0, sizeof(fi->i_dquot)); + fi->i_reserved_quota = 0; +#endif /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; + return &fi->vfs_inode; } static int f2fs_drop_inode(struct inode *inode) { + int ret; /* * This is to avoid a deadlock condition like below. * writeback_single_inode(inode) @@ -610,30 +677,33 @@ static int f2fs_drop_inode(struct inode *inode) spin_lock(&inode->i_lock); atomic_dec(&inode->i_count); } + trace_f2fs_drop_inode(inode, 0); return 0; } - - return generic_drop_inode(inode); + ret = generic_drop_inode(inode); + trace_f2fs_drop_inode(inode, ret); + return ret; } -int f2fs_inode_dirtied(struct inode *inode) +int f2fs_inode_dirtied(struct inode *inode, bool sync) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int ret = 0; spin_lock(&sbi->inode_lock[DIRTY_META]); if (is_inode_flag_set(inode, FI_DIRTY_INODE)) { - spin_unlock(&sbi->inode_lock[DIRTY_META]); - return 1; + ret = 1; + } else { + set_inode_flag(inode, FI_DIRTY_INODE); + stat_inc_dirty_inode(sbi, DIRTY_META); } - - set_inode_flag(inode, FI_DIRTY_INODE); - list_add_tail(&F2FS_I(inode)->gdirty_list, + if (sync && list_empty(&F2FS_I(inode)->gdirty_list)) { + list_add_tail(&F2FS_I(inode)->gdirty_list, &sbi->inode_list[DIRTY_META]); - inc_page_count(sbi, F2FS_DIRTY_IMETA); - stat_inc_dirty_inode(sbi, DIRTY_META); + inc_page_count(sbi, F2FS_DIRTY_IMETA); + } spin_unlock(&sbi->inode_lock[DIRTY_META]); - - return 0; + return ret; } void f2fs_inode_synced(struct inode *inode) @@ -645,10 +715,12 @@ void f2fs_inode_synced(struct inode *inode) spin_unlock(&sbi->inode_lock[DIRTY_META]); return; } - list_del_init(&F2FS_I(inode)->gdirty_list); + if (!list_empty(&F2FS_I(inode)->gdirty_list)) { + list_del_init(&F2FS_I(inode)->gdirty_list); + dec_page_count(sbi, F2FS_DIRTY_IMETA); + } clear_inode_flag(inode, FI_DIRTY_INODE); clear_inode_flag(inode, FI_AUTO_RECOVER); - dec_page_count(sbi, F2FS_DIRTY_IMETA); stat_dec_dirty_inode(F2FS_I_SB(inode), DIRTY_META); spin_unlock(&sbi->inode_lock[DIRTY_META]); } @@ -672,7 +744,7 @@ static void f2fs_dirty_inode(struct inode *inode, int flags) if (is_inode_flag_set(inode, FI_AUTO_RECOVER)) clear_inode_flag(inode, FI_AUTO_RECOVER); - f2fs_inode_dirtied(inode); + f2fs_inode_dirtied(inode, false); } static void f2fs_i_callback(struct rcu_head *head) @@ -692,18 +764,25 @@ static void destroy_percpu_info(struct f2fs_sb_info *sbi) percpu_counter_destroy(&sbi->total_valid_inode_count); } +static void destroy_device_list(struct f2fs_sb_info *sbi) +{ + int i; + + for (i = 0; i < sbi->s_ndevs; i++) { + blkdev_put(FDEV(i).bdev, FMODE_EXCL); +#ifdef CONFIG_BLK_DEV_ZONED + kfree(FDEV(i).blkz_type); +#endif + } + kfree(sbi->devs); +} + static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); + int i; - if (sbi->s_proc) { - remove_proc_entry("segment_info", sbi->s_proc); - remove_proc_entry("segment_bits", sbi->s_proc); - remove_proc_entry(sb->s_id, f2fs_proc_root); - } - kobject_del(&sbi->s_kobj); - - stop_gc_thread(sbi); + f2fs_quota_off_umount(sb); /* prevent remaining shrinker jobs */ mutex_lock(&sbi->umount_mutex); @@ -721,6 +800,16 @@ static void f2fs_put_super(struct super_block *sb) write_checkpoint(sbi, &cpc); } + /* be sure to wait for any on-going discard commands */ + f2fs_wait_discard_bios(sbi); + + if (f2fs_discard_en(sbi) && !sbi->discard_blks) { + struct cp_control cpc = { + .reason = CP_UMOUNT | CP_TRIMMED, + }; + write_checkpoint(sbi, &cpc); + } + /* write_checkpoint can update stat informaion */ f2fs_destroy_stats(sbi); @@ -729,13 +818,12 @@ static void f2fs_put_super(struct super_block *sb) * In addition, EIO will skip do checkpoint, we need this as well. */ release_ino_entry(sbi, true); - release_discard_addrs(sbi); f2fs_leave_shrinker(sbi); mutex_unlock(&sbi->umount_mutex); /* our cp_error case, we can wait for any writeback page */ - f2fs_flush_merged_bios(sbi); + f2fs_flush_merged_writes(sbi); iput(sbi->node_inode); iput(sbi->meta_inode); @@ -745,15 +833,23 @@ static void f2fs_put_super(struct super_block *sb) destroy_segment_manager(sbi); kfree(sbi->ckpt); - kobject_put(&sbi->s_kobj); - wait_for_completion(&sbi->s_kobj_unregister); + + f2fs_unregister_sysfs(sbi); sb->s_fs_info = NULL; if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->raw_super); + destroy_device_list(sbi); + mempool_destroy(sbi->write_io_dummy); +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif destroy_percpu_info(sbi); + for (i = 0; i < NR_PAGE_TYPE; i++) + kfree(sbi->write_io[i]); kfree(sbi); } @@ -764,6 +860,9 @@ int f2fs_sync_fs(struct super_block *sb, int sync) trace_f2fs_sync_fs(sb, sync); + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + return -EAGAIN; + if (sync) { struct cp_control cpc; @@ -780,13 +879,17 @@ int f2fs_sync_fs(struct super_block *sb, int sync) static int f2fs_freeze(struct super_block *sb) { - int err; - if (f2fs_readonly(sb)) return 0; - err = f2fs_sync_fs(sb, 1); - return err; + /* IO error happened before */ + if (unlikely(f2fs_cp_error(F2FS_SB(sb)))) + return -EIO; + + /* must be clean, since sync_filesystem() was already called */ + if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY)) + return -EINVAL; + return 0; } static int f2fs_unfreeze(struct super_block *sb) @@ -794,12 +897,55 @@ static int f2fs_unfreeze(struct super_block *sb) return 0; } +#ifdef CONFIG_QUOTA +static int f2fs_statfs_project(struct super_block *sb, + kprojid_t projid, struct kstatfs *buf) +{ + struct kqid qid; + struct dquot *dquot; + u64 limit; + u64 curblock; + + qid = make_kqid_projid(projid); + dquot = dqget(sb, qid); + if (IS_ERR(dquot)) + return PTR_ERR(dquot); + spin_lock(&dq_data_lock); + + limit = (dquot->dq_dqb.dqb_bsoftlimit ? + dquot->dq_dqb.dqb_bsoftlimit : + dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits; + if (limit && buf->f_blocks > limit) { + curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits; + buf->f_blocks = limit; + buf->f_bfree = buf->f_bavail = + (buf->f_blocks > curblock) ? + (buf->f_blocks - curblock) : 0; + } + + limit = dquot->dq_dqb.dqb_isoftlimit ? + dquot->dq_dqb.dqb_isoftlimit : + dquot->dq_dqb.dqb_ihardlimit; + if (limit && buf->f_files > limit) { + buf->f_files = limit; + buf->f_ffree = + (buf->f_files > dquot->dq_dqb.dqb_curinodes) ? + (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; + } + + spin_unlock(&dq_data_lock); + dqput(dquot); + return 0; +} +#endif + static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct f2fs_sb_info *sbi = F2FS_SB(sb); u64 id = huge_encode_dev(sb->s_bdev->bd_dev); block_t total_count, user_block_count, start_count, ovp_count; + u64 avail_node_count; total_count = le64_to_cpu(sbi->raw_super->block_count); user_block_count = sbi->user_block_count; @@ -810,18 +956,67 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = total_count - start_count; buf->f_bfree = user_block_count - valid_user_blocks(sbi) + ovp_count; - buf->f_bavail = user_block_count - valid_user_blocks(sbi); + buf->f_bavail = user_block_count - valid_user_blocks(sbi) - + sbi->reserved_blocks; - buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; - buf->f_ffree = buf->f_files - valid_inode_count(sbi); + avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; + + if (avail_node_count > user_block_count) { + buf->f_files = user_block_count; + buf->f_ffree = buf->f_bavail; + } else { + buf->f_files = avail_node_count; + buf->f_ffree = min(avail_node_count - valid_node_count(sbi), + buf->f_bavail); + } buf->f_namelen = F2FS_NAME_LEN; buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); +#ifdef CONFIG_QUOTA + if (is_inode_flag_set(dentry->d_inode, FI_PROJ_INHERIT) && + sb_has_quota_limits_enabled(sb, PRJQUOTA)) { + f2fs_statfs_project(sb, F2FS_I(dentry->d_inode)->i_projid, buf); + } +#endif return 0; } +static inline void f2fs_show_quota_options(struct seq_file *seq, + struct super_block *sb) +{ +#ifdef CONFIG_QUOTA + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + if (sbi->s_jquota_fmt) { + char *fmtname = ""; + + switch (sbi->s_jquota_fmt) { + case QFMT_VFS_OLD: + fmtname = "vfsold"; + break; + case QFMT_VFS_V0: + fmtname = "vfsv0"; + break; + case QFMT_VFS_V1: + fmtname = "vfsv1"; + break; + } + seq_printf(seq, ",jqfmt=%s", fmtname); + } + + if (sbi->s_qf_names[USRQUOTA]) + seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]); + + if (sbi->s_qf_names[GRPQUOTA]) + seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]); + + if (sbi->s_qf_names[PRJQUOTA]) + seq_show_option(seq, "prjjquota", sbi->s_qf_names[PRJQUOTA]); +#endif +} + static int f2fs_show_options(struct seq_file *seq, struct dentry *root) { struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb); @@ -839,7 +1034,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) if (test_opt(sbi, DISCARD)) seq_puts(seq, ",discard"); if (test_opt(sbi, NOHEAP)) - seq_puts(seq, ",no_heap_alloc"); + seq_puts(seq, ",no_heap"); + else + seq_puts(seq, ",heap"); #ifdef CONFIG_F2FS_FS_XATTR if (test_opt(sbi, XATTR_USER)) seq_puts(seq, ",user_xattr"); @@ -847,6 +1044,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",nouser_xattr"); if (test_opt(sbi, INLINE_XATTR)) seq_puts(seq, ",inline_xattr"); + else + seq_puts(seq, ",noinline_xattr"); #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL if (test_opt(sbi, POSIX_ACL)) @@ -883,89 +1082,42 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) else if (test_opt(sbi, LFS)) seq_puts(seq, "lfs"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); + if (F2FS_IO_SIZE_BITS(sbi)) + seq_printf(seq, ",io_size=%uKB", F2FS_IO_SIZE_KB(sbi)); +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (test_opt(sbi, FAULT_INJECTION)) + seq_printf(seq, ",fault_injection=%u", + sbi->fault_info.inject_rate); +#endif +#ifdef CONFIG_QUOTA + if (test_opt(sbi, QUOTA)) + seq_puts(seq, ",quota"); + if (test_opt(sbi, USRQUOTA)) + seq_puts(seq, ",usrquota"); + if (test_opt(sbi, GRPQUOTA)) + seq_puts(seq, ",grpquota"); + if (test_opt(sbi, PRJQUOTA)) + seq_puts(seq, ",prjquota"); +#endif + f2fs_show_quota_options(seq, sbi->sb); return 0; } -static int segment_info_seq_show(struct seq_file *seq, void *offset) -{ - struct super_block *sb = seq->private; - struct f2fs_sb_info *sbi = F2FS_SB(sb); - unsigned int total_segs = - le32_to_cpu(sbi->raw_super->segment_count_main); - int i; - - seq_puts(seq, "format: segment_type|valid_blocks\n" - "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); - - for (i = 0; i < total_segs; i++) { - struct seg_entry *se = get_seg_entry(sbi, i); - - if ((i % 10) == 0) - seq_printf(seq, "%-10d", i); - seq_printf(seq, "%d|%-3u", se->type, - get_valid_blocks(sbi, i, 1)); - if ((i % 10) == 9 || i == (total_segs - 1)) - seq_putc(seq, '\n'); - else - seq_putc(seq, ' '); - } - - return 0; -} - -static int segment_bits_seq_show(struct seq_file *seq, void *offset) -{ - struct super_block *sb = seq->private; - struct f2fs_sb_info *sbi = F2FS_SB(sb); - unsigned int total_segs = - le32_to_cpu(sbi->raw_super->segment_count_main); - int i, j; - - seq_puts(seq, "format: segment_type|valid_blocks|bitmaps\n" - "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); - - for (i = 0; i < total_segs; i++) { - struct seg_entry *se = get_seg_entry(sbi, i); - - seq_printf(seq, "%-10d", i); - seq_printf(seq, "%d|%-3u|", se->type, - get_valid_blocks(sbi, i, 1)); - for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++) - seq_printf(seq, " %.2x", se->cur_valid_map[j]); - seq_putc(seq, '\n'); - } - return 0; -} - -#define F2FS_PROC_FILE_DEF(_name) \ -static int _name##_open_fs(struct inode *inode, struct file *file) \ -{ \ - return single_open(file, _name##_seq_show, PDE_DATA(inode)); \ -} \ - \ -static const struct file_operations f2fs_seq_##_name##_fops = { \ - .open = _name##_open_fs, \ - .read = seq_read, \ - .llseek = seq_lseek, \ - .release = single_release, \ -}; - -F2FS_PROC_FILE_DEF(segment_info); -F2FS_PROC_FILE_DEF(segment_bits); - static void default_options(struct f2fs_sb_info *sbi) { /* init some FS parameters */ sbi->active_logs = NR_CURSEG_TYPE; set_opt(sbi, BG_GC); + set_opt(sbi, INLINE_XATTR); set_opt(sbi, INLINE_DATA); set_opt(sbi, INLINE_DENTRY); set_opt(sbi, EXTENT_CACHE); + set_opt(sbi, NOHEAP); sbi->sb->s_flags |= MS_LAZYTIME; set_opt(sbi, FLUSH_MERGE); - if (f2fs_sb_mounted_hmsmr(sbi->sb)) { + if (f2fs_sb_mounted_blkzoned(sbi->sb)) { set_opt_mode(sbi, F2FS_MOUNT_LFS); set_opt(sbi, DISCARD); } else { @@ -988,6 +1140,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) { struct f2fs_sb_info *sbi = F2FS_SB(sb); struct f2fs_mount_info org_mount_opt; + unsigned long old_sb_flags; int err, active_logs; bool need_restart_gc = false; bool need_stop_gc = false; @@ -995,14 +1148,37 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) #ifdef CONFIG_F2FS_FAULT_INJECTION struct f2fs_fault_info ffi = sbi->fault_info; #endif +#ifdef CONFIG_QUOTA + int s_jquota_fmt; + char *s_qf_names[MAXQUOTAS]; + int i, j; +#endif /* * Save the old mount options in case we * need to restore them. */ org_mount_opt = sbi->mount_opt; + old_sb_flags = sb->s_flags; active_logs = sbi->active_logs; +#ifdef CONFIG_QUOTA + s_jquota_fmt = sbi->s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) { + if (sbi->s_qf_names[i]) { + s_qf_names[i] = kstrdup(sbi->s_qf_names[i], + GFP_KERNEL); + if (!s_qf_names[i]) { + for (j = 0; j < i; j++) + kfree(s_qf_names[j]); + return -ENOMEM; + } + } else { + s_qf_names[i] = NULL; + } + } +#endif + /* recover superblocks we couldn't write due to previous RO mount */ if (!(*flags & MS_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) { err = f2fs_commit_super(sbi, false); @@ -1012,7 +1188,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) clear_sbi_flag(sbi, SBI_NEED_SB_WRITE); } - sbi->mount_opt.opt = 0; default_options(sbi); /* parse mount options */ @@ -1027,6 +1202,16 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if (f2fs_readonly(sb) && (*flags & MS_RDONLY)) goto skip; + if (!f2fs_readonly(sb) && (*flags & MS_RDONLY)) { + err = dquot_suspend(sb, -1); + if (err < 0) + goto restore_opts; + } else { + /* dquot_resume needs RW */ + sb->s_flags &= ~MS_RDONLY; + dquot_resume(sb, -1); + } + /* disallow enable/disable extent_cache dynamically */ if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) { err = -EINVAL; @@ -1067,13 +1252,19 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * or if flush_merge is not passed in mount option. */ if ((*flags & MS_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { - destroy_flush_cmd_control(sbi); - } else if (!SM_I(sbi)->cmd_control_info) { + clear_opt(sbi, FLUSH_MERGE); + destroy_flush_cmd_control(sbi, false); + } else { err = create_flush_cmd_control(sbi); if (err) goto restore_gc; } skip: +#ifdef CONFIG_QUOTA + /* Release old quota file names */ + for (i = 0; i < MAXQUOTAS; i++) + kfree(s_qf_names[i]); +#endif /* Update the POSIXACL Flag */ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); @@ -1088,21 +1279,286 @@ restore_gc: stop_gc_thread(sbi); } restore_opts: +#ifdef CONFIG_QUOTA + sbi->s_jquota_fmt = s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) { + kfree(sbi->s_qf_names[i]); + sbi->s_qf_names[i] = s_qf_names[i]; + } +#endif sbi->mount_opt = org_mount_opt; sbi->active_logs = active_logs; + sb->s_flags = old_sb_flags; #ifdef CONFIG_F2FS_FAULT_INJECTION sbi->fault_info = ffi; #endif return err; } -static struct super_operations f2fs_sops = { +#ifdef CONFIG_QUOTA +/* Read data from quotafile */ +static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + struct address_space *mapping = inode->i_mapping; + block_t blkidx = F2FS_BYTES_TO_BLK(off); + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t toread; + loff_t i_size = i_size_read(inode); + struct page *page; + char *kaddr; + + if (off > i_size) + return 0; + + if (off + len > i_size) + len = i_size - off; + toread = len; + while (toread > 0) { + tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread); +repeat: + page = read_mapping_page(mapping, blkidx, NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + + lock_page(page); + + if (unlikely(page->mapping != mapping)) { + f2fs_put_page(page, 1); + goto repeat; + } + if (unlikely(!PageUptodate(page))) { + f2fs_put_page(page, 1); + return -EIO; + } + + kaddr = kmap_atomic(page); + memcpy(data, kaddr + offset, tocopy); + kunmap_atomic(kaddr); + f2fs_put_page(page, 1); + + offset = 0; + toread -= tocopy; + data += tocopy; + blkidx++; + } + return len; +} + +/* Write to quotafile */ +static ssize_t f2fs_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + struct address_space *mapping = inode->i_mapping; + const struct address_space_operations *a_ops = mapping->a_ops; + int offset = off & (sb->s_blocksize - 1); + size_t towrite = len; + struct page *page; + char *kaddr; + int err = 0; + int tocopy; + + while (towrite > 0) { + tocopy = min_t(unsigned long, sb->s_blocksize - offset, + towrite); + + err = a_ops->write_begin(NULL, mapping, off, tocopy, 0, + &page, NULL); + if (unlikely(err)) + break; + + kaddr = kmap_atomic(page); + memcpy(kaddr + offset, data, tocopy); + kunmap_atomic(kaddr); + flush_dcache_page(page); + + a_ops->write_end(NULL, mapping, off, tocopy, tocopy, + page, NULL); + offset = 0; + towrite -= tocopy; + off += tocopy; + data += tocopy; + cond_resched(); + } + + if (len == towrite) + return 0; + inode->i_version++; + inode->i_mtime = inode->i_ctime = current_time(inode); + f2fs_mark_inode_dirty_sync(inode, false); + return len - towrite; +} + +static struct dquot **f2fs_get_dquots(struct inode *inode) +{ + return F2FS_I(inode)->i_dquot; +} + +static qsize_t *f2fs_get_reserved_space(struct inode *inode) +{ + return &F2FS_I(inode)->i_reserved_quota; +} + +static int f2fs_quota_on_mount(struct f2fs_sb_info *sbi, int type) +{ + return dquot_quota_on_mount(sbi->sb, sbi->s_qf_names[type], + sbi->s_jquota_fmt, type); +} + +void f2fs_enable_quota_files(struct f2fs_sb_info *sbi) +{ + int i, ret; + + for (i = 0; i < MAXQUOTAS; i++) { + if (sbi->s_qf_names[i]) { + ret = f2fs_quota_on_mount(sbi, i); + if (ret < 0) + f2fs_msg(sbi->sb, KERN_ERR, + "Cannot turn on journaled " + "quota: error %d", ret); + } + } +} + +static int f2fs_quota_sync(struct super_block *sb, int type) +{ + struct quota_info *dqopt = sb_dqopt(sb); + int cnt; + int ret; + + ret = dquot_writeback_dquots(sb, type); + if (ret) + return ret; + + /* + * Now when everything is written we can discard the pagecache so + * that userspace sees the changes. + */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (type != -1 && cnt != type) + continue; + if (!sb_has_quota_active(sb, cnt)) + continue; + + ret = filemap_write_and_wait(dqopt->files[cnt]->i_mapping); + if (ret) + return ret; + + inode_lock(dqopt->files[cnt]); + truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); + inode_unlock(dqopt->files[cnt]); + } + return 0; +} + +static int f2fs_quota_on(struct super_block *sb, int type, int format_id, + struct path *path) +{ + struct inode *inode; + int err; + + err = f2fs_quota_sync(sb, type); + if (err) + return err; + + err = dquot_quota_on(sb, type, format_id, path); + if (err) + return err; + + inode = d_inode(path->dentry); + + inode_lock(inode); + F2FS_I(inode)->i_flags |= FS_NOATIME_FL | FS_IMMUTABLE_FL; + inode_set_flags(inode, S_NOATIME | S_IMMUTABLE, + S_NOATIME | S_IMMUTABLE); + inode_unlock(inode); + f2fs_mark_inode_dirty_sync(inode, false); + + return 0; +} + +static int f2fs_quota_off(struct super_block *sb, int type) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + int err; + + if (!inode || !igrab(inode)) + return dquot_quota_off(sb, type); + + f2fs_quota_sync(sb, type); + + err = dquot_quota_off(sb, type); + if (err) + goto out_put; + + inode_lock(inode); + F2FS_I(inode)->i_flags &= ~(FS_NOATIME_FL | FS_IMMUTABLE_FL); + inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); + inode_unlock(inode); + f2fs_mark_inode_dirty_sync(inode, false); +out_put: + iput(inode); + return err; +} + +void f2fs_quota_off_umount(struct super_block *sb) +{ + int type; + + for (type = 0; type < MAXQUOTAS; type++) + f2fs_quota_off(sb, type); +} + +int f2fs_get_projid(struct inode *inode, kprojid_t *projid) +{ + *projid = F2FS_I(inode)->i_projid; + return 0; +} + +static const struct dquot_operations f2fs_quota_operations = { + .get_reserved_space = f2fs_get_reserved_space, + .write_dquot = dquot_commit, + .acquire_dquot = dquot_acquire, + .release_dquot = dquot_release, + .mark_dirty = dquot_mark_dquot_dirty, + .write_info = dquot_commit_info, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, + .get_projid = f2fs_get_projid, + .get_next_id = dquot_get_next_id, +}; + +static const struct quotactl_ops f2fs_quotactl_ops = { + .quota_on = f2fs_quota_on, + .quota_off = f2fs_quota_off, + .quota_sync = f2fs_quota_sync, + .get_state = dquot_get_state, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk, + .get_nextdqblk = dquot_get_next_dqblk, +}; +#else +void f2fs_quota_off_umount(struct super_block *sb) +{ +} +#endif + +static const struct super_operations f2fs_sops = { .alloc_inode = f2fs_alloc_inode, .drop_inode = f2fs_drop_inode, .destroy_inode = f2fs_destroy_inode, .write_inode = f2fs_write_inode, .dirty_inode = f2fs_dirty_inode, .show_options = f2fs_show_options, +#ifdef CONFIG_QUOTA + .quota_read = f2fs_quota_read, + .quota_write = f2fs_quota_write, + .get_dquots = f2fs_get_dquots, +#endif .evict_inode = f2fs_evict_inode, .put_super = f2fs_put_super, .sync_fs = f2fs_sync_fs, @@ -1120,12 +1576,6 @@ static int f2fs_get_context(struct inode *inode, void *ctx, size_t len) ctx, len, NULL); } -static int f2fs_key_prefix(struct inode *inode, u8 **key) -{ - *key = F2FS_I_SB(inode)->key_prefix; - return F2FS_I_SB(inode)->key_prefix_size; -} - static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len, void *fs_data) { @@ -1140,16 +1590,16 @@ static unsigned f2fs_max_namelen(struct inode *inode) inode->i_sb->s_blocksize : F2FS_NAME_LEN; } -static struct fscrypt_operations f2fs_cryptops = { +static const struct fscrypt_operations f2fs_cryptops = { + .key_prefix = "f2fs:", .get_context = f2fs_get_context, - .key_prefix = f2fs_key_prefix, .set_context = f2fs_set_context, .is_encrypted = f2fs_encrypted_inode, .empty_dir = f2fs_empty_dir, .max_namelen = f2fs_max_namelen, }; #else -static struct fscrypt_operations f2fs_cryptops = { +static const struct fscrypt_operations f2fs_cryptops = { .is_encrypted = f2fs_encrypted_inode, }; #endif @@ -1201,9 +1651,16 @@ static const struct export_operations f2fs_export_ops = { static loff_t max_file_blocks(void) { - loff_t result = (DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS); + loff_t result = 0; loff_t leaf_count = ADDRS_PER_BLOCK; + /* + * note: previously, result is equal to (DEF_ADDRS_PER_INODE - + * F2FS_INLINE_XATTR_ADDRS), but now f2fs try to reserve more + * space in inode.i_addr, it will be more safe to reassign + * result as zero. + */ + /* two direct node blocks */ result += (leaf_count * 2); @@ -1229,7 +1686,7 @@ static int __f2fs_commit_super(struct buffer_head *bh, unlock_buffer(bh); /* it's rare case, we can do fua all the time */ - return __sync_dirty_buffer(bh, WRITE_FLUSH_FUA); + return __sync_dirty_buffer(bh, REQ_SYNC | REQ_PREFLUSH | REQ_FUA); } static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi, @@ -1424,6 +1881,7 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) unsigned int total, fsmeta; struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + unsigned int ovp_segments, reserved_segments; unsigned int main_segs, blocks_per_seg; int i; @@ -1437,20 +1895,28 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) if (unlikely(fsmeta >= total)) return 1; - main_segs = le32_to_cpu(sbi->raw_super->segment_count_main); + ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); + reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count); + + if (unlikely(fsmeta < F2FS_MIN_SEGMENTS || + ovp_segments == 0 || reserved_segments == 0)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Wrong layout: check mkfs.f2fs version"); + return 1; + } + + main_segs = le32_to_cpu(raw_super->segment_count_main); blocks_per_seg = sbi->blocks_per_seg; for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs || - le16_to_cpu(ckpt->cur_node_blkoff[i]) >= blocks_per_seg) { + le16_to_cpu(ckpt->cur_node_blkoff[i]) >= blocks_per_seg) return 1; - } } for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) { if (le32_to_cpu(ckpt->cur_data_segno[i]) >= main_segs || - le16_to_cpu(ckpt->cur_data_blkoff[i]) >= blocks_per_seg) { + le16_to_cpu(ckpt->cur_data_blkoff[i]) >= blocks_per_seg) return 1; - } } if (unlikely(f2fs_cp_error(sbi))) { @@ -1463,7 +1929,7 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) static void init_sb_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = sbi->raw_super; - int i; + int i, j; sbi->log_sectors_per_block = le32_to_cpu(raw_super->log_sectors_per_block); @@ -1491,17 +1957,14 @@ static void init_sb_info(struct f2fs_sb_info *sbi) for (i = 0; i < NR_COUNT_TYPE; i++) atomic_set(&sbi->nr_pages[i], 0); + atomic_set(&sbi->wb_sync_req, 0); + INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); - mutex_init(&sbi->wio_mutex[NODE]); - mutex_init(&sbi->wio_mutex[DATA]); + for (i = 0; i < NR_PAGE_TYPE - 1; i++) + for (j = HOT; j < NR_TEMP_TYPE; j++) + mutex_init(&sbi->wio_mutex[i][j]); spin_lock_init(&sbi->cp_lock); - -#ifdef CONFIG_F2FS_FS_ENCRYPTION - memcpy(sbi->key_prefix, F2FS_KEY_DESC_PREFIX, - F2FS_KEY_DESC_PREFIX_SIZE); - sbi->key_prefix_size = F2FS_KEY_DESC_PREFIX_SIZE; -#endif } static int init_percpu_info(struct f2fs_sb_info *sbi) @@ -1516,6 +1979,71 @@ static int init_percpu_info(struct f2fs_sb_info *sbi) GFP_KERNEL); } +#ifdef CONFIG_BLK_DEV_ZONED +static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) +{ + struct block_device *bdev = FDEV(devi).bdev; + sector_t nr_sectors = bdev->bd_part->nr_sects; + sector_t sector = 0; + struct blk_zone *zones; + unsigned int i, nr_zones; + unsigned int n = 0; + int err = -EIO; + + if (!f2fs_sb_mounted_blkzoned(sbi->sb)) + return 0; + + if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != + SECTOR_TO_BLOCK(bdev_zone_sectors(bdev))) + return -EINVAL; + sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_sectors(bdev)); + if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz != + __ilog2_u32(sbi->blocks_per_blkz)) + return -EINVAL; + sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz); + FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> + sbi->log_blocks_per_blkz; + if (nr_sectors & (bdev_zone_sectors(bdev) - 1)) + FDEV(devi).nr_blkz++; + + FDEV(devi).blkz_type = kmalloc(FDEV(devi).nr_blkz, GFP_KERNEL); + if (!FDEV(devi).blkz_type) + return -ENOMEM; + +#define F2FS_REPORT_NR_ZONES 4096 + + zones = kcalloc(F2FS_REPORT_NR_ZONES, sizeof(struct blk_zone), + GFP_KERNEL); + if (!zones) + return -ENOMEM; + + /* Get block zones type */ + while (zones && sector < nr_sectors) { + + nr_zones = F2FS_REPORT_NR_ZONES; + err = blkdev_report_zones(bdev, sector, + zones, &nr_zones, + GFP_KERNEL); + if (err) + break; + if (!nr_zones) { + err = -EIO; + break; + } + + for (i = 0; i < nr_zones; i++) { + FDEV(devi).blkz_type[n] = zones[i].type; + sector += zones[i].len; + n++; + } + } + + kfree(zones); + + return err; +} +#endif + /* * Read f2fs raw super block. * Because we have two copies of super block, so read both of them @@ -1608,6 +2136,104 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) return err; } +static int f2fs_scan_devices(struct f2fs_sb_info *sbi) +{ + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + unsigned int max_devices = MAX_DEVICES; + int i; + + /* Initialize single device information */ + if (!RDEV(0).path[0]) { +#ifdef CONFIG_BLK_DEV_ZONED + if (!bdev_is_zoned(sbi->sb->s_bdev)) + return 0; + max_devices = 1; +#else + return 0; +#endif + } + + /* + * Initialize multiple devices information, or single + * zoned block device information. + */ + sbi->devs = kcalloc(max_devices, sizeof(struct f2fs_dev_info), + GFP_KERNEL); + if (!sbi->devs) + return -ENOMEM; + + for (i = 0; i < max_devices; i++) { + + if (i > 0 && !RDEV(i).path[0]) + break; + + if (max_devices == 1) { + /* Single zoned block device mount */ + FDEV(0).bdev = + blkdev_get_by_dev(sbi->sb->s_bdev->bd_dev, + sbi->sb->s_mode, sbi->sb->s_type); + } else { + /* Multi-device mount */ + memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN); + FDEV(i).total_segments = + le32_to_cpu(RDEV(i).total_segments); + if (i == 0) { + FDEV(i).start_blk = 0; + FDEV(i).end_blk = FDEV(i).start_blk + + (FDEV(i).total_segments << + sbi->log_blocks_per_seg) - 1 + + le32_to_cpu(raw_super->segment0_blkaddr); + } else { + FDEV(i).start_blk = FDEV(i - 1).end_blk + 1; + FDEV(i).end_blk = FDEV(i).start_blk + + (FDEV(i).total_segments << + sbi->log_blocks_per_seg) - 1; + } + FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, + sbi->sb->s_mode, sbi->sb->s_type); + } + if (IS_ERR(FDEV(i).bdev)) + return PTR_ERR(FDEV(i).bdev); + + /* to release errored devices */ + sbi->s_ndevs = i + 1; + +#ifdef CONFIG_BLK_DEV_ZONED + if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM && + !f2fs_sb_mounted_blkzoned(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Zoned block device feature not enabled\n"); + return -EINVAL; + } + if (bdev_zoned_model(FDEV(i).bdev) != BLK_ZONED_NONE) { + if (init_blkz_info(sbi, i)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Failed to initialize F2FS blkzone information"); + return -EINVAL; + } + if (max_devices == 1) + break; + f2fs_msg(sbi->sb, KERN_INFO, + "Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: %s)", + i, FDEV(i).path, + FDEV(i).total_segments, + FDEV(i).start_blk, FDEV(i).end_blk, + bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HA ? + "Host-aware" : "Host-managed"); + continue; + } +#endif + f2fs_msg(sbi->sb, KERN_INFO, + "Mount Device [%2d]: %20s, %8u, %8x - %8x", + i, FDEV(i).path, + FDEV(i).total_segments, + FDEV(i).start_blk, FDEV(i).end_blk); + } + f2fs_msg(sbi->sb, KERN_INFO, + "IO Block Size: %8d KB", F2FS_IO_SIZE_KB(sbi)); + return 0; +} + static int f2fs_fill_super(struct super_block *sb, void *data, int silent) { struct f2fs_sb_info *sbi; @@ -1655,6 +2281,24 @@ try_onemore: sb->s_fs_info = sbi; sbi->raw_super = raw_super; + /* precompute checksum seed for metadata */ + if (f2fs_sb_has_inode_chksum(sb)) + sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid, + sizeof(raw_super->uuid)); + + /* + * The BLKZONED feature indicates that the drive was formatted with + * zone alignment optimization. This is optional for host-aware + * devices, but mandatory for host-managed zoned block devices. + */ +#ifndef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_mounted_blkzoned(sb)) { + f2fs_msg(sb, KERN_ERR, + "Zoned block device support is not enabled\n"); + err = -EOPNOTSUPP; + goto free_sb_buf; + } +#endif default_options(sbi); /* parse mount options */ options = kstrdup((const char *)data, GFP_KERNEL); @@ -1673,6 +2317,12 @@ try_onemore: sb->s_max_links = F2FS_LINK_MAX; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); +#ifdef CONFIG_QUOTA + sb->dq_op = &f2fs_quota_operations; + sb->s_qcop = &f2fs_quotactl_ops; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; +#endif + sb->s_op = &f2fs_sops; sb->s_cop = &f2fs_cryptops; sb->s_xattr = f2fs_xattr_handlers; @@ -1681,25 +2331,41 @@ try_onemore: sb->s_time_gran = 1; sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); - memcpy(sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid)); + memcpy(&sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid)); /* init f2fs-specific super block info */ sbi->valid_super_block = valid_super_block; mutex_init(&sbi->gc_mutex); mutex_init(&sbi->cp_mutex); init_rwsem(&sbi->node_write); + init_rwsem(&sbi->node_change); /* disallow all the data/node/meta page writes */ set_sbi_flag(sbi, SBI_POR_DOING); spin_lock_init(&sbi->stat_lock); - init_rwsem(&sbi->read_io.io_rwsem); - sbi->read_io.sbi = sbi; - sbi->read_io.bio = NULL; + /* init iostat info */ + spin_lock_init(&sbi->iostat_lock); + sbi->iostat_enable = false; + for (i = 0; i < NR_PAGE_TYPE; i++) { - init_rwsem(&sbi->write_io[i].io_rwsem); - sbi->write_io[i].sbi = sbi; - sbi->write_io[i].bio = NULL; + int n = (i == META) ? 1: NR_TEMP_TYPE; + int j; + + sbi->write_io[i] = kmalloc(n * sizeof(struct f2fs_bio_info), + GFP_KERNEL); + if (!sbi->write_io[i]) { + err = -ENOMEM; + goto free_options; + } + + for (j = HOT; j < n; j++) { + init_rwsem(&sbi->write_io[i][j].io_rwsem); + sbi->write_io[i][j].sbi = sbi; + sbi->write_io[i][j].bio = NULL; + spin_lock_init(&sbi->write_io[i][j].io_lock); + INIT_LIST_HEAD(&sbi->write_io[i][j].io_list); + } } init_rwsem(&sbi->cp_rwsem); @@ -1710,12 +2376,21 @@ try_onemore: if (err) goto free_options; + if (F2FS_IO_SIZE(sbi) > 1) { + sbi->write_io_dummy = + mempool_create_page_pool(2 * (F2FS_IO_SIZE(sbi) - 1), 0); + if (!sbi->write_io_dummy) { + err = -ENOMEM; + goto free_options; + } + } + /* get an inode for meta space */ sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi)); if (IS_ERR(sbi->meta_inode)) { f2fs_msg(sb, KERN_ERR, "Failed to read F2FS meta data inode"); err = PTR_ERR(sbi->meta_inode); - goto free_options; + goto free_io_dummy; } err = get_valid_checkpoint(sbi); @@ -1724,6 +2399,13 @@ try_onemore: goto free_meta_inode; } + /* Initialize device list */ + err = f2fs_scan_devices(sbi); + if (err) { + f2fs_msg(sb, KERN_ERR, "Failed to find devices"); + goto free_devices; + } + sbi->total_valid_node_count = le32_to_cpu(sbi->ckpt->valid_node_count); percpu_counter_set(&sbi->total_valid_inode_count, @@ -1732,6 +2414,7 @@ try_onemore: sbi->total_valid_block_count = le64_to_cpu(sbi->ckpt->valid_block_count); sbi->last_valid_block_count = sbi->total_valid_block_count; + sbi->reserved_blocks = 0; for (i = 0; i < NR_INODE_TYPE; i++) { INIT_LIST_HEAD(&sbi->inode_list[i]); @@ -1779,10 +2462,9 @@ try_onemore: f2fs_join_shrinker(sbi); - /* if there are nt orphan nodes free them */ - err = recover_orphan_inodes(sbi); + err = f2fs_build_stats(sbi); if (err) - goto free_node_inode; + goto free_nm; /* read root inode and dentry */ root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); @@ -1803,26 +2485,14 @@ try_onemore: goto free_root_inode; } - err = f2fs_build_stats(sbi); + err = f2fs_register_sysfs(sbi); if (err) goto free_root_inode; - if (f2fs_proc_root) - sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); - - if (sbi->s_proc) { - proc_create_data("segment_info", S_IRUGO, sbi->s_proc, - &f2fs_seq_segment_info_fops, sb); - proc_create_data("segment_bits", S_IRUGO, sbi->s_proc, - &f2fs_seq_segment_bits_fops, sb); - } - - sbi->s_kobj.kset = f2fs_kset; - init_completion(&sbi->s_kobj_unregister); - err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL, - "%s", sb->s_id); + /* if there are nt orphan nodes free them */ + err = recover_orphan_inodes(sbi); if (err) - goto free_proc; + goto free_sysfs; /* recover fsynced data */ if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { @@ -1833,7 +2503,7 @@ try_onemore: if (bdev_read_only(sb->s_bdev) && !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { err = -EROFS; - goto free_kobj; + goto free_meta; } if (need_fsck) @@ -1847,7 +2517,7 @@ try_onemore: need_fsck = true; f2fs_msg(sb, KERN_ERR, "Cannot recover all fsync data errno=%d", err); - goto free_kobj; + goto free_meta; } } else { err = recover_fsync_data(sbi, true); @@ -1856,7 +2526,7 @@ try_onemore: err = -EINVAL; f2fs_msg(sb, KERN_ERR, "Need to recover fsync data"); - goto free_kobj; + goto free_sysfs; } } skip_recovery: @@ -1871,7 +2541,7 @@ skip_recovery: /* After POR, we can run background GC thread.*/ err = start_gc_thread(sbi); if (err) - goto free_kobj; + goto free_meta; } kfree(options); @@ -1883,22 +2553,23 @@ skip_recovery: sbi->valid_super_block ? 1 : 2, err); } + f2fs_msg(sbi->sb, KERN_NOTICE, "Mounted with checkpoint version = %llx", + cur_cp_version(F2FS_CKPT(sbi))); f2fs_update_time(sbi, CP_TIME); f2fs_update_time(sbi, REQ_TIME); return 0; -free_kobj: +free_meta: f2fs_sync_inode_meta(sbi); - kobject_del(&sbi->s_kobj); - kobject_put(&sbi->s_kobj); - wait_for_completion(&sbi->s_kobj_unregister); -free_proc: - if (sbi->s_proc) { - remove_proc_entry("segment_info", sbi->s_proc); - remove_proc_entry("segment_bits", sbi->s_proc); - remove_proc_entry(sb->s_id, f2fs_proc_root); - } - f2fs_destroy_stats(sbi); + /* + * Some dirty meta pages can be produced by recover_orphan_inodes() + * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg() + * followed by write_checkpoint() through f2fs_write_node_pages(), which + * falls into an infinite loop in sync_meta_pages(). + */ + truncate_inode_pages_final(META_MAPPING(sbi)); +free_sysfs: + f2fs_unregister_sysfs(sbi); free_root_inode: dput(sb->s_root); sb->s_root = NULL; @@ -1909,16 +2580,27 @@ free_node_inode: f2fs_leave_shrinker(sbi); iput(sbi->node_inode); mutex_unlock(&sbi->umount_mutex); + f2fs_destroy_stats(sbi); free_nm: destroy_node_manager(sbi); free_sm: destroy_segment_manager(sbi); +free_devices: + destroy_device_list(sbi); kfree(sbi->ckpt); free_meta_inode: make_bad_inode(sbi->meta_inode); iput(sbi->meta_inode); +free_io_dummy: + mempool_destroy(sbi->write_io_dummy); free_options: + for (i = 0; i < NR_PAGE_TYPE; i++) + kfree(sbi->write_io[i]); destroy_percpu_info(sbi); +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif kfree(options); free_sb_buf: kfree(raw_super); @@ -1944,8 +2626,11 @@ static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags, static void kill_f2fs_super(struct super_block *sb) { - if (sb->s_root) + if (sb->s_root) { set_sbi_flag(F2FS_SB(sb), SBI_IS_CLOSE); + stop_gc_thread(F2FS_SB(sb)); + stop_discard_thread(F2FS_SB(sb)); + } kill_block_super(sb); } @@ -1999,30 +2684,26 @@ static int __init init_f2fs_fs(void) err = create_extent_cache(); if (err) goto free_checkpoint_caches; - f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj); - if (!f2fs_kset) { - err = -ENOMEM; + err = f2fs_init_sysfs(); + if (err) goto free_extent_cache; - } err = register_shrinker(&f2fs_shrinker_info); if (err) - goto free_kset; - + goto free_sysfs; err = register_filesystem(&f2fs_fs_type); if (err) goto free_shrinker; err = f2fs_create_root_stats(); if (err) goto free_filesystem; - f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); return 0; free_filesystem: unregister_filesystem(&f2fs_fs_type); free_shrinker: unregister_shrinker(&f2fs_shrinker_info); -free_kset: - kset_unregister(f2fs_kset); +free_sysfs: + f2fs_exit_sysfs(); free_extent_cache: destroy_extent_cache(); free_checkpoint_caches: @@ -2039,11 +2720,10 @@ fail: static void __exit exit_f2fs_fs(void) { - remove_proc_entry("fs/f2fs", NULL); f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); unregister_shrinker(&f2fs_shrinker_info); - kset_unregister(f2fs_kset); + f2fs_exit_sysfs(); destroy_extent_cache(); destroy_checkpoint_caches(); destroy_segment_manager_caches(); @@ -2058,3 +2738,4 @@ module_exit(exit_f2fs_fs) MODULE_AUTHOR("Samsung Electronics's Praesto Team"); MODULE_DESCRIPTION("Flash Friendly File System"); MODULE_LICENSE("GPL"); + diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c new file mode 100644 index 000000000000..e2c258f717cd --- /dev/null +++ b/fs/f2fs/sysfs.c @@ -0,0 +1,556 @@ +/* + * f2fs sysfs interface + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * Copyright (c) 2017 Chao Yu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include + +#include "f2fs.h" +#include "segment.h" +#include "gc.h" + +static struct proc_dir_entry *f2fs_proc_root; + +/* Sysfs support for f2fs */ +enum { + GC_THREAD, /* struct f2fs_gc_thread */ + SM_INFO, /* struct f2fs_sm_info */ + DCC_INFO, /* struct discard_cmd_control */ + NM_INFO, /* struct f2fs_nm_info */ + F2FS_SBI, /* struct f2fs_sb_info */ +#ifdef CONFIG_F2FS_FAULT_INJECTION + FAULT_INFO_RATE, /* struct f2fs_fault_info */ + FAULT_INFO_TYPE, /* struct f2fs_fault_info */ +#endif + RESERVED_BLOCKS, +}; + +struct f2fs_attr { + struct attribute attr; + ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *); + ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *, + const char *, size_t); + int struct_type; + int offset; + int id; +}; + +static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) +{ + if (struct_type == GC_THREAD) + return (unsigned char *)sbi->gc_thread; + else if (struct_type == SM_INFO) + return (unsigned char *)SM_I(sbi); + else if (struct_type == DCC_INFO) + return (unsigned char *)SM_I(sbi)->dcc_info; + else if (struct_type == NM_INFO) + return (unsigned char *)NM_I(sbi); + else if (struct_type == F2FS_SBI || struct_type == RESERVED_BLOCKS) + return (unsigned char *)sbi; +#ifdef CONFIG_F2FS_FAULT_INJECTION + else if (struct_type == FAULT_INFO_RATE || + struct_type == FAULT_INFO_TYPE) + return (unsigned char *)&sbi->fault_info; +#endif + return NULL; +} + +static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->sb; + + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); + + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)(sbi->kbytes_written + + BD_PART_WRITTEN(sbi))); +} + +static ssize_t features_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->sb; + int len = 0; + + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); + + if (f2fs_sb_has_crypto(sb)) + len += snprintf(buf, PAGE_SIZE - len, "%s", + "encryption"); + if (f2fs_sb_mounted_blkzoned(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "blkzoned"); + if (f2fs_sb_has_extra_attr(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "extra_attr"); + if (f2fs_sb_has_project_quota(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "projquota"); + if (f2fs_sb_has_inode_chksum(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "inode_checksum"); + len += snprintf(buf + len, PAGE_SIZE - len, "\n"); + return len; +} + +static ssize_t f2fs_sbi_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + unsigned char *ptr = NULL; + unsigned int *ui; + + ptr = __struct_ptr(sbi, a->struct_type); + if (!ptr) + return -EINVAL; + + ui = (unsigned int *)(ptr + a->offset); + + return snprintf(buf, PAGE_SIZE, "%u\n", *ui); +} + +static ssize_t f2fs_sbi_store(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned char *ptr; + unsigned long t; + unsigned int *ui; + ssize_t ret; + + ptr = __struct_ptr(sbi, a->struct_type); + if (!ptr) + return -EINVAL; + + ui = (unsigned int *)(ptr + a->offset); + + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret < 0) + return ret; +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX)) + return -EINVAL; +#endif + if (a->struct_type == RESERVED_BLOCKS) { + spin_lock(&sbi->stat_lock); + if ((unsigned long)sbi->total_valid_block_count + t > + (unsigned long)sbi->user_block_count) { + spin_unlock(&sbi->stat_lock); + return -EINVAL; + } + *ui = t; + spin_unlock(&sbi->stat_lock); + return count; + } + + if (!strcmp(a->attr.name, "discard_granularity")) { + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + int i; + + if (t == 0 || t > MAX_PLIST_NUM) + return -EINVAL; + if (t == *ui) + return count; + + mutex_lock(&dcc->cmd_lock); + for (i = 0; i < MAX_PLIST_NUM; i++) { + if (i >= t - 1) + dcc->pend_list_tag[i] |= P_ACTIVE; + else + dcc->pend_list_tag[i] &= (~P_ACTIVE); + } + mutex_unlock(&dcc->cmd_lock); + + *ui = t; + return count; + } + + *ui = t; + + if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0) + f2fs_reset_iostat(sbi); + if (!strcmp(a->attr.name, "gc_urgent") && t == 1 && sbi->gc_thread) { + sbi->gc_thread->gc_wake = 1; + wake_up_interruptible_all(&sbi->gc_thread->gc_wait_queue_head); + wake_up_discard_thread(sbi, true); + } + + return count; +} + +static ssize_t f2fs_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->show ? a->show(a, sbi, buf) : 0; +} + +static ssize_t f2fs_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->store ? a->store(a, sbi, buf, len) : 0; +} + +static void f2fs_sb_release(struct kobject *kobj) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + complete(&sbi->s_kobj_unregister); +} + +enum feat_id { + FEAT_CRYPTO = 0, + FEAT_BLKZONED, + FEAT_ATOMIC_WRITE, + FEAT_EXTRA_ATTR, + FEAT_PROJECT_QUOTA, + FEAT_INODE_CHECKSUM, +}; + +static ssize_t f2fs_feature_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + switch (a->id) { + case FEAT_CRYPTO: + case FEAT_BLKZONED: + case FEAT_ATOMIC_WRITE: + case FEAT_EXTRA_ATTR: + case FEAT_PROJECT_QUOTA: + case FEAT_INODE_CHECKSUM: + return snprintf(buf, PAGE_SIZE, "supported\n"); + } + return 0; +} + +#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \ +static struct f2fs_attr f2fs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ + .struct_type = _struct_type, \ + .offset = _offset \ +} + +#define F2FS_RW_ATTR(struct_type, struct_name, name, elname) \ + F2FS_ATTR_OFFSET(struct_type, name, 0644, \ + f2fs_sbi_show, f2fs_sbi_store, \ + offsetof(struct struct_name, elname)) + +#define F2FS_GENERAL_RO_ATTR(name) \ +static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL) + +#define F2FS_FEATURE_RO_ATTR(_name, _id) \ +static struct f2fs_attr f2fs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = 0444 }, \ + .show = f2fs_feature_show, \ + .id = _id, \ +} + +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent_sleep_time, + urgent_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent, gc_urgent); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); +F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_hot_blocks, min_hot_blocks); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); +#ifdef CONFIG_F2FS_FAULT_INJECTION +F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); +F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); +#endif +F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); +F2FS_GENERAL_RO_ATTR(features); + +#ifdef CONFIG_F2FS_FS_ENCRYPTION +F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO); +#endif +#ifdef CONFIG_BLK_DEV_ZONED +F2FS_FEATURE_RO_ATTR(block_zoned, FEAT_BLKZONED); +#endif +F2FS_FEATURE_RO_ATTR(atomic_write, FEAT_ATOMIC_WRITE); +F2FS_FEATURE_RO_ATTR(extra_attr, FEAT_EXTRA_ATTR); +F2FS_FEATURE_RO_ATTR(project_quota, FEAT_PROJECT_QUOTA); +F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM); + +#define ATTR_LIST(name) (&f2fs_attr_##name.attr) +static struct attribute *f2fs_attrs[] = { + ATTR_LIST(gc_urgent_sleep_time), + ATTR_LIST(gc_min_sleep_time), + ATTR_LIST(gc_max_sleep_time), + ATTR_LIST(gc_no_gc_sleep_time), + ATTR_LIST(gc_idle), + ATTR_LIST(gc_urgent), + ATTR_LIST(reclaim_segments), + ATTR_LIST(max_small_discards), + ATTR_LIST(discard_granularity), + ATTR_LIST(batched_trim_sections), + ATTR_LIST(ipu_policy), + ATTR_LIST(min_ipu_util), + ATTR_LIST(min_fsync_blocks), + ATTR_LIST(min_hot_blocks), + ATTR_LIST(max_victim_search), + ATTR_LIST(dir_level), + ATTR_LIST(ram_thresh), + ATTR_LIST(ra_nid_pages), + ATTR_LIST(dirty_nats_ratio), + ATTR_LIST(cp_interval), + ATTR_LIST(idle_interval), + ATTR_LIST(iostat_enable), +#ifdef CONFIG_F2FS_FAULT_INJECTION + ATTR_LIST(inject_rate), + ATTR_LIST(inject_type), +#endif + ATTR_LIST(lifetime_write_kbytes), + ATTR_LIST(features), + ATTR_LIST(reserved_blocks), + NULL, +}; + +static struct attribute *f2fs_feat_attrs[] = { +#ifdef CONFIG_F2FS_FS_ENCRYPTION + ATTR_LIST(encryption), +#endif +#ifdef CONFIG_BLK_DEV_ZONED + ATTR_LIST(block_zoned), +#endif + ATTR_LIST(atomic_write), + ATTR_LIST(extra_attr), + ATTR_LIST(project_quota), + ATTR_LIST(inode_checksum), + NULL, +}; + +static const struct sysfs_ops f2fs_attr_ops = { + .show = f2fs_attr_show, + .store = f2fs_attr_store, +}; + +static struct kobj_type f2fs_sb_ktype = { + .default_attrs = f2fs_attrs, + .sysfs_ops = &f2fs_attr_ops, + .release = f2fs_sb_release, +}; + +static struct kobj_type f2fs_ktype = { + .sysfs_ops = &f2fs_attr_ops, +}; + +static struct kset f2fs_kset = { + .kobj = {.ktype = &f2fs_ktype}, +}; + +static struct kobj_type f2fs_feat_ktype = { + .default_attrs = f2fs_feat_attrs, + .sysfs_ops = &f2fs_attr_ops, +}; + +static struct kobject f2fs_feat = { + .kset = &f2fs_kset, +}; + +static int segment_info_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + unsigned int total_segs = + le32_to_cpu(sbi->raw_super->segment_count_main); + int i; + + seq_puts(seq, "format: segment_type|valid_blocks\n" + "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); + + for (i = 0; i < total_segs; i++) { + struct seg_entry *se = get_seg_entry(sbi, i); + + if ((i % 10) == 0) + seq_printf(seq, "%-10d", i); + seq_printf(seq, "%d|%-3u", se->type, + get_valid_blocks(sbi, i, false)); + if ((i % 10) == 9 || i == (total_segs - 1)) + seq_putc(seq, '\n'); + else + seq_putc(seq, ' '); + } + + return 0; +} + +static int segment_bits_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + unsigned int total_segs = + le32_to_cpu(sbi->raw_super->segment_count_main); + int i, j; + + seq_puts(seq, "format: segment_type|valid_blocks|bitmaps\n" + "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); + + for (i = 0; i < total_segs; i++) { + struct seg_entry *se = get_seg_entry(sbi, i); + + seq_printf(seq, "%-10d", i); + seq_printf(seq, "%d|%-3u|", se->type, + get_valid_blocks(sbi, i, false)); + for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++) + seq_printf(seq, " %.2x", se->cur_valid_map[j]); + seq_putc(seq, '\n'); + } + return 0; +} + +static int iostat_info_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + time64_t now = ktime_get_real_seconds(); + + if (!sbi->iostat_enable) + return 0; + + seq_printf(seq, "time: %-16llu\n", now); + + /* print app IOs */ + seq_printf(seq, "app buffered: %-16llu\n", + sbi->write_iostat[APP_BUFFERED_IO]); + seq_printf(seq, "app direct: %-16llu\n", + sbi->write_iostat[APP_DIRECT_IO]); + seq_printf(seq, "app mapped: %-16llu\n", + sbi->write_iostat[APP_MAPPED_IO]); + + /* print fs IOs */ + seq_printf(seq, "fs data: %-16llu\n", + sbi->write_iostat[FS_DATA_IO]); + seq_printf(seq, "fs node: %-16llu\n", + sbi->write_iostat[FS_NODE_IO]); + seq_printf(seq, "fs meta: %-16llu\n", + sbi->write_iostat[FS_META_IO]); + seq_printf(seq, "fs gc data: %-16llu\n", + sbi->write_iostat[FS_GC_DATA_IO]); + seq_printf(seq, "fs gc node: %-16llu\n", + sbi->write_iostat[FS_GC_NODE_IO]); + seq_printf(seq, "fs cp data: %-16llu\n", + sbi->write_iostat[FS_CP_DATA_IO]); + seq_printf(seq, "fs cp node: %-16llu\n", + sbi->write_iostat[FS_CP_NODE_IO]); + seq_printf(seq, "fs cp meta: %-16llu\n", + sbi->write_iostat[FS_CP_META_IO]); + seq_printf(seq, "fs discard: %-16llu\n", + sbi->write_iostat[FS_DISCARD]); + + return 0; +} + +#define F2FS_PROC_FILE_DEF(_name) \ +static int _name##_open_fs(struct inode *inode, struct file *file) \ +{ \ + return single_open(file, _name##_seq_show, PDE_DATA(inode)); \ +} \ + \ +static const struct file_operations f2fs_seq_##_name##_fops = { \ + .open = _name##_open_fs, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ +}; + +F2FS_PROC_FILE_DEF(segment_info); +F2FS_PROC_FILE_DEF(segment_bits); +F2FS_PROC_FILE_DEF(iostat_info); + +int __init f2fs_init_sysfs(void) +{ + int ret; + + kobject_set_name(&f2fs_kset.kobj, "f2fs"); + f2fs_kset.kobj.parent = fs_kobj; + ret = kset_register(&f2fs_kset); + if (ret) + return ret; + + ret = kobject_init_and_add(&f2fs_feat, &f2fs_feat_ktype, + NULL, "features"); + if (ret) + kset_unregister(&f2fs_kset); + else + f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); + return ret; +} + +void f2fs_exit_sysfs(void) +{ + kobject_put(&f2fs_feat); + kset_unregister(&f2fs_kset); + remove_proc_entry("fs/f2fs", NULL); + f2fs_proc_root = NULL; +} + +int f2fs_register_sysfs(struct f2fs_sb_info *sbi) +{ + struct super_block *sb = sbi->sb; + int err; + + sbi->s_kobj.kset = &f2fs_kset; + init_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &f2fs_sb_ktype, NULL, + "%s", sb->s_id); + if (err) + return err; + + if (f2fs_proc_root) + sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); + + if (sbi->s_proc) { + proc_create_data("segment_info", S_IRUGO, sbi->s_proc, + &f2fs_seq_segment_info_fops, sb); + proc_create_data("segment_bits", S_IRUGO, sbi->s_proc, + &f2fs_seq_segment_bits_fops, sb); + proc_create_data("iostat_info", S_IRUGO, sbi->s_proc, + &f2fs_seq_iostat_info_fops, sb); + } + return 0; +} + +void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) +{ + if (sbi->s_proc) { + remove_proc_entry("iostat_info", sbi->s_proc); + remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry("segment_bits", sbi->s_proc); + remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); + } + kobject_del(&sbi->s_kobj); +} diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c index 73b4e1d1912a..bccbbf2616d2 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -59,7 +59,7 @@ void f2fs_trace_pid(struct page *page) pid_t pid = task_pid_nr(current); void *p; - page->private = pid; + set_page_private(page, (unsigned long)pid); if (radix_tree_preload(GFP_NOFS)) return; @@ -138,7 +138,7 @@ static unsigned int gang_lookup_pids(pid_t *results, unsigned long first_index, radix_tree_for_each_slot(slot, &pids, &iter, first_index) { results[ret] = iter.index; - if (++ret == PIDVEC_SIZE) + if (++ret == max_items) break; } return ret; diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 3e1c0280f866..7c65540148f8 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -106,7 +106,7 @@ static int f2fs_xattr_advise_set(const struct xattr_handler *handler, return -EINVAL; F2FS_I(inode)->i_advise |= *(char *)value; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); return 0; } @@ -217,18 +217,123 @@ static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index, return entry; } +static struct f2fs_xattr_entry *__find_inline_xattr(void *base_addr, + void **last_addr, int index, + size_t len, const char *name) +{ + struct f2fs_xattr_entry *entry; + unsigned int inline_size = F2FS_INLINE_XATTR_ADDRS << 2; + + list_for_each_xattr(entry, base_addr) { + if ((void *)entry + sizeof(__u32) > base_addr + inline_size || + (void *)XATTR_NEXT_ENTRY(entry) + sizeof(__u32) > + base_addr + inline_size) { + *last_addr = entry; + return NULL; + } + if (entry->e_name_index != index) + continue; + if (entry->e_name_len != len) + continue; + if (!memcmp(entry->e_name, name, len)) + break; + } + return entry; +} + +static int lookup_all_xattrs(struct inode *inode, struct page *ipage, + unsigned int index, unsigned int len, + const char *name, struct f2fs_xattr_entry **xe, + void **base_addr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + void *cur_addr, *txattr_addr, *last_addr = NULL; + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + unsigned int size = xnid ? VALID_XATTR_BLOCK_SIZE : 0; + unsigned int inline_size = inline_xattr_size(inode); + int err = 0; + + if (!size && !inline_size) + return -ENODATA; + + txattr_addr = kzalloc(inline_size + size + XATTR_PADDING_SIZE, + GFP_F2FS_ZERO); + if (!txattr_addr) + return -ENOMEM; + + /* read from inline xattr */ + if (inline_size) { + struct page *page = NULL; + void *inline_addr; + + if (ipage) { + inline_addr = inline_xattr_addr(ipage); + } else { + page = get_node_page(sbi, inode->i_ino); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out; + } + inline_addr = inline_xattr_addr(page); + } + memcpy(txattr_addr, inline_addr, inline_size); + f2fs_put_page(page, 1); + + *xe = __find_inline_xattr(txattr_addr, &last_addr, + index, len, name); + if (*xe) + goto check; + } + + /* read from xattr node block */ + if (xnid) { + struct page *xpage; + void *xattr_addr; + + /* The inode already has an extended attribute block. */ + xpage = get_node_page(sbi, xnid); + if (IS_ERR(xpage)) { + err = PTR_ERR(xpage); + goto out; + } + + xattr_addr = page_address(xpage); + memcpy(txattr_addr + inline_size, xattr_addr, size); + f2fs_put_page(xpage, 1); + } + + if (last_addr) + cur_addr = XATTR_HDR(last_addr) - 1; + else + cur_addr = txattr_addr; + + *xe = __find_xattr(cur_addr, index, len, name); +check: + if (IS_XATTR_LAST_ENTRY(*xe)) { + err = -ENODATA; + goto out; + } + + *base_addr = txattr_addr; + return 0; +out: + kzfree(txattr_addr); + return err; +} + static int read_all_xattrs(struct inode *inode, struct page *ipage, void **base_addr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_xattr_header *header; - size_t size = PAGE_SIZE, inline_size = 0; + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + unsigned int size = VALID_XATTR_BLOCK_SIZE; + unsigned int inline_size = inline_xattr_size(inode); void *txattr_addr; int err; - inline_size = inline_xattr_size(inode); - - txattr_addr = kzalloc(inline_size + size, GFP_F2FS_ZERO); + txattr_addr = kzalloc(inline_size + size + XATTR_PADDING_SIZE, + GFP_F2FS_ZERO); if (!txattr_addr) return -ENOMEM; @@ -252,19 +357,19 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage, } /* read from xattr node block */ - if (F2FS_I(inode)->i_xattr_nid) { + if (xnid) { struct page *xpage; void *xattr_addr; /* The inode already has an extended attribute block. */ - xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); + xpage = get_node_page(sbi, xnid); if (IS_ERR(xpage)) { err = PTR_ERR(xpage); goto fail; } xattr_addr = page_address(xpage); - memcpy(txattr_addr + inline_size, xattr_addr, PAGE_SIZE); + memcpy(txattr_addr + inline_size, xattr_addr, size); f2fs_put_page(xpage, 1); } @@ -286,14 +391,12 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, void *txattr_addr, struct page *ipage) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - size_t inline_size = 0; + size_t inline_size = inline_xattr_size(inode); void *xattr_addr; struct page *xpage; nid_t new_nid = 0; int err; - inline_size = inline_xattr_size(inode); - if (hsize > inline_size && !F2FS_I(inode)->i_xattr_nid) if (!alloc_nid(sbi, &new_nid)) return -ENOSPC; @@ -339,7 +442,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, } else { struct dnode_of_data dn; set_new_dnode(&dn, inode, NULL, NULL, new_nid); - xpage = new_node_page(&dn, XATTR_NODE_OFFSET, ipage); + xpage = new_node_page(&dn, XATTR_NODE_OFFSET); if (IS_ERR(xpage)) { alloc_nid_failed(sbi, new_nid); return PTR_ERR(xpage); @@ -348,23 +451,20 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, } xattr_addr = page_address(xpage); - memcpy(xattr_addr, txattr_addr + inline_size, PAGE_SIZE - - sizeof(struct node_footer)); + memcpy(xattr_addr, txattr_addr + inline_size, VALID_XATTR_BLOCK_SIZE); set_page_dirty(xpage); f2fs_put_page(xpage, 1); - /* need to checkpoint during fsync */ - F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi)); return 0; } int f2fs_getxattr(struct inode *inode, int index, const char *name, void *buffer, size_t buffer_size, struct page *ipage) { - struct f2fs_xattr_entry *entry; - void *base_addr; + struct f2fs_xattr_entry *entry = NULL; int error = 0; - size_t size, len; + unsigned int size, len; + void *base_addr = NULL; if (name == NULL) return -EINVAL; @@ -373,21 +473,18 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, if (len > F2FS_NAME_LEN) return -ERANGE; - error = read_all_xattrs(inode, ipage, &base_addr); + down_read(&F2FS_I(inode)->i_xattr_sem); + error = lookup_all_xattrs(inode, ipage, index, len, name, + &entry, &base_addr); + up_read(&F2FS_I(inode)->i_xattr_sem); if (error) return error; - entry = __find_xattr(base_addr, index, len, name); - if (IS_XATTR_LAST_ENTRY(entry)) { - error = -ENODATA; - goto cleanup; - } - size = le16_to_cpu(entry->e_value_size); if (buffer && size > buffer_size) { error = -ERANGE; - goto cleanup; + goto out; } if (buffer) { @@ -395,8 +492,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, memcpy(buffer, pval, size); } error = size; - -cleanup: +out: kzfree(base_addr); return error; } @@ -409,7 +505,9 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) int error = 0; size_t rest = buffer_size; + down_read(&F2FS_I(inode)->i_xattr_sem); error = read_all_xattrs(inode, NULL, &base_addr); + up_read(&F2FS_I(inode)->i_xattr_sem); if (error) return error; @@ -445,6 +543,15 @@ cleanup: return error; } +static bool f2fs_xattr_value_same(struct f2fs_xattr_entry *entry, + const void *value, size_t size) +{ + void *pval = entry->e_name + entry->e_name_len; + + return (le16_to_cpu(entry->e_value_size) == size) && + !memcmp(pval, value, size); +} + static int __f2fs_setxattr(struct inode *inode, int index, const char *name, const void *value, size_t size, struct page *ipage, int flags) @@ -479,12 +586,17 @@ static int __f2fs_setxattr(struct inode *inode, int index, found = IS_XATTR_LAST_ENTRY(here) ? 0 : 1; - if ((flags & XATTR_REPLACE) && !found) { + if (found) { + if ((flags & XATTR_CREATE)) { + error = -EEXIST; + goto exit; + } + + if (f2fs_xattr_value_same(here, value, size)) + goto exit; + } else if ((flags & XATTR_REPLACE)) { error = -ENODATA; goto exit; - } else if ((flags & XATTR_CREATE) && found) { - error = -EEXIST; - goto exit; } last = here; @@ -554,7 +666,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (index == F2FS_XATTR_INDEX_ENCRYPTION && !strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT)) f2fs_set_encrypted_inode(inode); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (!error && S_ISDIR(inode->i_mode)) set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP); exit: @@ -578,7 +690,9 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, f2fs_lock_op(sbi); /* protect xattr_ver */ down_write(&F2FS_I(inode)->i_sem); + down_write(&F2FS_I(inode)->i_xattr_sem); err = __f2fs_setxattr(inode, index, name, value, size, ipage, flags); + up_write(&F2FS_I(inode)->i_xattr_sem); up_write(&F2FS_I(inode)->i_sem); f2fs_unlock_op(sbi); diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index f990de20cdcd..dbcd1d16e669 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -58,10 +58,10 @@ struct f2fs_xattr_entry { #define XATTR_FIRST_ENTRY(ptr) (XATTR_ENTRY(XATTR_HDR(ptr) + 1)) #define XATTR_ROUND (3) -#define XATTR_ALIGN(size) ((size + XATTR_ROUND) & ~XATTR_ROUND) +#define XATTR_ALIGN(size) (((size) + XATTR_ROUND) & ~XATTR_ROUND) #define ENTRY_SIZE(entry) (XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + \ - entry->e_name_len + le16_to_cpu(entry->e_value_size))) + (entry)->e_name_len + le16_to_cpu((entry)->e_value_size))) #define XATTR_NEXT_ENTRY(entry) ((struct f2fs_xattr_entry *)((char *)(entry) +\ ENTRY_SIZE(entry))) @@ -72,9 +72,10 @@ struct f2fs_xattr_entry { for (entry = XATTR_FIRST_ENTRY(addr);\ !IS_XATTR_LAST_ENTRY(entry);\ entry = XATTR_NEXT_ENTRY(entry)) - -#define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + PAGE_SIZE - \ - sizeof(struct node_footer) - sizeof(__u32)) +#define VALID_XATTR_BLOCK_SIZE (PAGE_SIZE - sizeof(struct node_footer)) +#define XATTR_PADDING_SIZE (sizeof(__u32)) +#define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + \ + VALID_XATTR_BLOCK_SIZE) #define MAX_VALUE_LEN(i) (MIN_OFFSET(i) - \ sizeof(struct f2fs_xattr_header) - \ diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index 5d5ddaa84b21..67f940892ef8 100644 --- a/fs/fscache/object-list.c +++ b/fs/fscache/object-list.c @@ -329,7 +329,7 @@ static void fscache_objlist_config(struct fscache_objlist_data *data) config = 0; rcu_read_lock(); - confkey = user_key_payload(key); + confkey = user_key_payload_rcu(key); buf = confkey->data; for (len = confkey->datalen - 1; len >= 0; len--) { diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index c444285bb1b1..835c163f61af 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -316,7 +316,7 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, if (ret < 0) goto out_up; - payload = user_key_payload(rkey); + payload = user_key_payload_rcu(rkey); if (IS_ERR_OR_NULL(payload)) { ret = PTR_ERR(payload); goto out_up; diff --git a/include/keys/user-type.h b/include/keys/user-type.h index c56fef40f53e..e098cbe27db5 100644 --- a/include/keys/user-type.h +++ b/include/keys/user-type.h @@ -48,9 +48,14 @@ extern void user_describe(const struct key *user, struct seq_file *m); extern long user_read(const struct key *key, char __user *buffer, size_t buflen); -static inline const struct user_key_payload *user_key_payload(const struct key *key) +static inline const struct user_key_payload *user_key_payload_rcu(const struct key *key) { - return (struct user_key_payload *)rcu_dereference_key(key); + return (struct user_key_payload *)dereference_key_rcu(key); +} + +static inline struct user_key_payload *user_key_payload_locked(const struct key *key) +{ + return (struct user_key_payload *)dereference_key_locked((struct key *)key); } #endif /* CONFIG_KEYS */ diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index e46e7d10312b..2a0c453d7235 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -32,9 +32,15 @@ /* 0, 1(node nid), 2(meta nid) are reserved node id */ #define F2FS_RESERVED_NODE_NUM 3 -#define F2FS_ROOT_INO(sbi) (sbi->root_ino_num) -#define F2FS_NODE_INO(sbi) (sbi->node_ino_num) -#define F2FS_META_INO(sbi) (sbi->meta_ino_num) +#define F2FS_ROOT_INO(sbi) ((sbi)->root_ino_num) +#define F2FS_NODE_INO(sbi) ((sbi)->node_ino_num) +#define F2FS_META_INO(sbi) ((sbi)->meta_ino_num) + +#define F2FS_IO_SIZE(sbi) (1 << (sbi)->write_io_size_bits) /* Blocks */ +#define F2FS_IO_SIZE_KB(sbi) (1 << ((sbi)->write_io_size_bits + 2)) /* KB */ +#define F2FS_IO_SIZE_BYTES(sbi) (1 << ((sbi)->write_io_size_bits + 12)) /* B */ +#define F2FS_IO_SIZE_BITS(sbi) ((sbi)->write_io_size_bits) /* power of 2 */ +#define F2FS_IO_SIZE_MASK(sbi) (F2FS_IO_SIZE(sbi) - 1) /* This flag is used by node and meta inodes, and by recovery */ #define GFP_F2FS_ZERO (GFP_NOFS | __GFP_ZERO) @@ -52,10 +58,17 @@ #define VERSION_LEN 256 #define MAX_VOLUME_NAME 512 +#define MAX_PATH_LEN 64 +#define MAX_DEVICES 8 /* * For superblock */ +struct f2fs_device { + __u8 path[MAX_PATH_LEN]; + __le32 total_segments; +} __packed; + struct f2fs_super_block { __le32 magic; /* Magic Number */ __le16 major_ver; /* Major Version */ @@ -94,12 +107,15 @@ struct f2fs_super_block { __le32 feature; /* defined features */ __u8 encryption_level; /* versioning level for encryption */ __u8 encrypt_pw_salt[16]; /* Salt used for string2key algorithm */ - __u8 reserved[871]; /* valid reserved region */ + struct f2fs_device devs[MAX_DEVICES]; /* device list */ + __u8 reserved[327]; /* valid reserved region */ } __packed; /* * For checkpoint */ +#define CP_TRIMMED_FLAG 0x00000100 +#define CP_NAT_BITS_FLAG 0x00000080 #define CP_CRC_RECOVERY_FLAG 0x00000040 #define CP_FASTBOOT_FLAG 0x00000020 #define CP_FSCK_FLAG 0x00000010 @@ -146,7 +162,7 @@ struct f2fs_checkpoint { */ #define F2FS_ORPHANS_PER_BLOCK 1020 -#define GET_ORPHAN_BLOCKS(n) ((n + F2FS_ORPHANS_PER_BLOCK - 1) / \ +#define GET_ORPHAN_BLOCKS(n) (((n) + F2FS_ORPHANS_PER_BLOCK - 1) / \ F2FS_ORPHANS_PER_BLOCK) struct f2fs_orphan_block { @@ -170,6 +186,8 @@ struct f2fs_extent { #define F2FS_NAME_LEN 255 #define F2FS_INLINE_XATTR_ADDRS 50 /* 200 bytes for inline xattrs */ #define DEF_ADDRS_PER_INODE 923 /* Address Pointers in an Inode */ +#define CUR_ADDRS_PER_INODE(inode) (DEF_ADDRS_PER_INODE - \ + get_extra_isize(inode)) #define DEF_NIDS_PER_INODE 5 /* Node IDs in an Inode */ #define ADDRS_PER_INODE(inode) addrs_per_inode(inode) #define ADDRS_PER_BLOCK 1018 /* Address Pointers in a Direct Block */ @@ -189,9 +207,7 @@ struct f2fs_extent { #define F2FS_INLINE_DENTRY 0x04 /* file inline dentry flag */ #define F2FS_DATA_EXIST 0x08 /* file inline data exist flag */ #define F2FS_INLINE_DOTS 0x10 /* file having implicit dot dentries */ - -#define MAX_INLINE_DATA (sizeof(__le32) * (DEF_ADDRS_PER_INODE - \ - F2FS_INLINE_XATTR_ADDRS - 1)) +#define F2FS_EXTRA_ATTR 0x20 /* file having extra attribute */ struct f2fs_inode { __le16 i_mode; /* file mode */ @@ -219,8 +235,16 @@ struct f2fs_inode { struct f2fs_extent i_ext; /* caching a largest extent */ - __le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */ - + union { + struct { + __le16 i_extra_isize; /* extra inode attribute size */ + __le16 i_padding; /* padding */ + __le32 i_projid; /* project id */ + __le32 i_inode_checksum;/* inode meta checksum */ + __le32 i_extra_end[0]; /* for attribute size calculation */ + }; + __le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */ + }; __le32 i_nid[DEF_NIDS_PER_INODE]; /* direct(2), indirect(2), double_indirect(1) node id */ } __packed; @@ -264,6 +288,7 @@ struct f2fs_node { * For NAT entries */ #define NAT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_nat_entry)) +#define NAT_ENTRY_BITMAP_SIZE ((NAT_ENTRY_PER_BLOCK + 7) / 8) struct f2fs_nat_entry { __u8 version; /* latest version of cached nat entry */ @@ -439,7 +464,7 @@ typedef __le32 f2fs_hash_t; #define F2FS_SLOT_LEN 8 #define F2FS_SLOT_LEN_BITS 3 -#define GET_DENTRY_SLOTS(x) ((x + F2FS_SLOT_LEN - 1) >> F2FS_SLOT_LEN_BITS) +#define GET_DENTRY_SLOTS(x) (((x) + F2FS_SLOT_LEN - 1) >> F2FS_SLOT_LEN_BITS) /* MAX level for dir lookup */ #define MAX_DIR_HASH_DEPTH 63 @@ -448,7 +473,7 @@ typedef __le32 f2fs_hash_t; #define MAX_DIR_BUCKETS (1 << ((MAX_DIR_HASH_DEPTH / 2) - 1)) /* - * space utilization of regular dentry and inline dentry + * space utilization of regular dentry and inline dentry (w/o extra reservation) * regular dentry inline dentry * bitmap 1 * 27 = 27 1 * 23 = 23 * reserved 1 * 3 = 3 1 * 7 = 7 @@ -484,24 +509,6 @@ struct f2fs_dentry_block { __u8 filename[NR_DENTRY_IN_BLOCK][F2FS_SLOT_LEN]; } __packed; -/* for inline dir */ -#define NR_INLINE_DENTRY (MAX_INLINE_DATA * BITS_PER_BYTE / \ - ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ - BITS_PER_BYTE + 1)) -#define INLINE_DENTRY_BITMAP_SIZE ((NR_INLINE_DENTRY + \ - BITS_PER_BYTE - 1) / BITS_PER_BYTE) -#define INLINE_RESERVED_SIZE (MAX_INLINE_DATA - \ - ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ - NR_INLINE_DENTRY + INLINE_DENTRY_BITMAP_SIZE)) - -/* inline directory entry structure */ -struct f2fs_inline_dentry { - __u8 dentry_bitmap[INLINE_DENTRY_BITMAP_SIZE]; - __u8 reserved[INLINE_RESERVED_SIZE]; - struct f2fs_dir_entry dentry[NR_INLINE_DENTRY]; - __u8 filename[NR_INLINE_DENTRY][F2FS_SLOT_LEN]; -} __packed; - /* file types used in inode_info->flags */ enum { F2FS_FT_UNKNOWN, @@ -517,4 +524,6 @@ enum { #define S_SHIFT 12 +#define F2FS_DEF_PROJID 0 /* default project ID */ + #endif /* _LINUX_F2FS_FS_H */ diff --git a/include/linux/fscrypt_common.h b/include/linux/fscrypt_common.h new file mode 100644 index 000000000000..4022c61f7e9b --- /dev/null +++ b/include/linux/fscrypt_common.h @@ -0,0 +1,138 @@ +/* + * fscrypt_common.h: common declarations for per-file encryption + * + * Copyright (C) 2015, Google, Inc. + * + * Written by Michael Halcrow, 2015. + * Modified by Jaegeuk Kim, 2015. + */ + +#ifndef _LINUX_FSCRYPT_COMMON_H +#define _LINUX_FSCRYPT_COMMON_H + +#include +#include +#include +#include +#include +#include +#include + +#define FS_CRYPTO_BLOCK_SIZE 16 + +struct fscrypt_info; + +struct fscrypt_ctx { + union { + struct { + struct page *bounce_page; /* Ciphertext page */ + struct page *control_page; /* Original page */ + } w; + struct { + struct bio *bio; + struct work_struct work; + } r; + struct list_head free_list; /* Free list */ + }; + u8 flags; /* Flags */ +}; + +/** + * For encrypted symlinks, the ciphertext length is stored at the beginning + * of the string in little-endian format. + */ +struct fscrypt_symlink_data { + __le16 len; + char encrypted_path[1]; +} __packed; + +struct fscrypt_str { + unsigned char *name; + u32 len; +}; + +struct fscrypt_name { + const struct qstr *usr_fname; + struct fscrypt_str disk_name; + u32 hash; + u32 minor_hash; + struct fscrypt_str crypto_buf; +}; + +#define FSTR_INIT(n, l) { .name = n, .len = l } +#define FSTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len) +#define fname_name(p) ((p)->disk_name.name) +#define fname_len(p) ((p)->disk_name.len) + +/* + * fscrypt superblock flags + */ +#define FS_CFLG_OWN_PAGES (1U << 1) + +/* + * crypto opertions for filesystems + */ +struct fscrypt_operations { + unsigned int flags; + const char *key_prefix; + int (*get_context)(struct inode *, void *, size_t); + int (*set_context)(struct inode *, const void *, size_t, void *); + int (*dummy_context)(struct inode *); + bool (*is_encrypted)(struct inode *); + bool (*empty_dir)(struct inode *); + unsigned (*max_namelen)(struct inode *); +}; + +static inline bool fscrypt_dummy_context_enabled(struct inode *inode) +{ + if (inode->i_sb->s_cop->dummy_context && + inode->i_sb->s_cop->dummy_context(inode)) + return true; + return false; +} + +static inline bool fscrypt_valid_enc_modes(u32 contents_mode, + u32 filenames_mode) +{ + if (contents_mode == FS_ENCRYPTION_MODE_AES_128_CBC && + filenames_mode == FS_ENCRYPTION_MODE_AES_128_CTS) + return true; + + if (contents_mode == FS_ENCRYPTION_MODE_AES_256_XTS && + filenames_mode == FS_ENCRYPTION_MODE_AES_256_CTS) + return true; + + return false; +} + +static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) +{ + if (str->len == 1 && str->name[0] == '.') + return true; + + if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.') + return true; + + return false; +} + +static inline struct page *fscrypt_control_page(struct page *page) +{ +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + return ((struct fscrypt_ctx *)page_private(page))->w.control_page; +#else + WARN_ON_ONCE(1); + return ERR_PTR(-EINVAL); +#endif +} + +static inline int fscrypt_has_encryption_key(const struct inode *inode) +{ +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + return (inode->i_crypt_info != NULL); +#else + return 0; +#endif +} + +#endif /* _LINUX_FSCRYPT_COMMON_H */ diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h new file mode 100644 index 000000000000..ec406aed2f2f --- /dev/null +++ b/include/linux/fscrypt_notsupp.h @@ -0,0 +1,177 @@ +/* + * fscrypt_notsupp.h + * + * This stubs out the fscrypt functions for filesystems configured without + * encryption support. + */ + +#ifndef _LINUX_FSCRYPT_NOTSUPP_H +#define _LINUX_FSCRYPT_NOTSUPP_H + +#include + +/* crypto.c */ +static inline struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, + gfp_t gfp_flags) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void fscrypt_release_ctx(struct fscrypt_ctx *ctx) +{ + return; +} + +static inline struct page *fscrypt_encrypt_page(const struct inode *inode, + struct page *page, + unsigned int len, + unsigned int offs, + u64 lblk_num, gfp_t gfp_flags) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline int fscrypt_decrypt_page(const struct inode *inode, + struct page *page, + unsigned int len, unsigned int offs, + u64 lblk_num) +{ + return -EOPNOTSUPP; +} + + +static inline void fscrypt_restore_control_page(struct page *page) +{ + return; +} + +static inline void fscrypt_set_d_op(struct dentry *dentry) +{ + return; +} + +static inline void fscrypt_set_encrypted_dentry(struct dentry *dentry) +{ + return; +} + +/* policy.c */ +static inline int fscrypt_ioctl_set_policy(struct file *filp, + const void __user *arg) +{ + return -EOPNOTSUPP; +} + +static inline int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) +{ + return -EOPNOTSUPP; +} + +static inline int fscrypt_has_permitted_context(struct inode *parent, + struct inode *child) +{ + return 0; +} + +static inline int fscrypt_inherit_context(struct inode *parent, + struct inode *child, + void *fs_data, bool preload) +{ + return -EOPNOTSUPP; +} + +/* keyinfo.c */ +static inline int fscrypt_get_encryption_info(struct inode *inode) +{ + return -EOPNOTSUPP; +} + +static inline void fscrypt_put_encryption_info(struct inode *inode, + struct fscrypt_info *ci) +{ + return; +} + + /* fname.c */ +static inline int fscrypt_setup_filename(struct inode *dir, + const struct qstr *iname, + int lookup, struct fscrypt_name *fname) +{ + if (dir->i_sb->s_cop->is_encrypted(dir)) + return -EOPNOTSUPP; + + memset(fname, 0, sizeof(struct fscrypt_name)); + fname->usr_fname = iname; + fname->disk_name.name = (unsigned char *)iname->name; + fname->disk_name.len = iname->len; + return 0; +} + +static inline void fscrypt_free_filename(struct fscrypt_name *fname) +{ + return; +} + +static inline u32 fscrypt_fname_encrypted_size(const struct inode *inode, + u32 ilen) +{ + /* never happens */ + WARN_ON(1); + return 0; +} + +static inline int fscrypt_fname_alloc_buffer(const struct inode *inode, + u32 ilen, + struct fscrypt_str *crypto_str) +{ + return -EOPNOTSUPP; +} + +static inline void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str) +{ + return; +} + +static inline int fscrypt_fname_disk_to_usr(struct inode *inode, + u32 hash, u32 minor_hash, + const struct fscrypt_str *iname, + struct fscrypt_str *oname) +{ + return -EOPNOTSUPP; +} + +static inline int fscrypt_fname_usr_to_disk(struct inode *inode, + const struct qstr *iname, + struct fscrypt_str *oname) +{ + return -EOPNOTSUPP; +} + +static inline bool fscrypt_match_name(const struct fscrypt_name *fname, + const u8 *de_name, u32 de_name_len) +{ + /* Encryption support disabled; use standard comparison */ + if (de_name_len != fname->disk_name.len) + return false; + return !memcmp(de_name, fname->disk_name.name, fname->disk_name.len); +} + +/* bio.c */ +static inline void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, + struct bio *bio) +{ + return; +} + +static inline void fscrypt_pullback_bio_page(struct page **page, bool restore) +{ + return; +} + +static inline int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, + sector_t pblk, unsigned int len) +{ + return -EOPNOTSUPP; +} + +#endif /* _LINUX_FSCRYPT_NOTSUPP_H */ diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h new file mode 100644 index 000000000000..32e2fcf13b01 --- /dev/null +++ b/include/linux/fscrypt_supp.h @@ -0,0 +1,145 @@ +/* + * fscrypt_supp.h + * + * This is included by filesystems configured with encryption support. + */ + +#ifndef _LINUX_FSCRYPT_SUPP_H +#define _LINUX_FSCRYPT_SUPP_H + +#include + +/* crypto.c */ +extern struct kmem_cache *fscrypt_info_cachep; +extern struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *, gfp_t); +extern void fscrypt_release_ctx(struct fscrypt_ctx *); +extern struct page *fscrypt_encrypt_page(const struct inode *, struct page *, + unsigned int, unsigned int, + u64, gfp_t); +extern int fscrypt_decrypt_page(const struct inode *, struct page *, unsigned int, + unsigned int, u64); +extern void fscrypt_restore_control_page(struct page *); + +extern const struct dentry_operations fscrypt_d_ops; + +static inline void fscrypt_set_d_op(struct dentry *dentry) +{ + d_set_d_op(dentry, &fscrypt_d_ops); +} + +static inline void fscrypt_set_encrypted_dentry(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + dentry->d_flags |= DCACHE_ENCRYPTED_WITH_KEY; + spin_unlock(&dentry->d_lock); +} + +/* policy.c */ +extern int fscrypt_ioctl_set_policy(struct file *, const void __user *); +extern int fscrypt_ioctl_get_policy(struct file *, void __user *); +extern int fscrypt_has_permitted_context(struct inode *, struct inode *); +extern int fscrypt_inherit_context(struct inode *, struct inode *, + void *, bool); +/* keyinfo.c */ +extern int fscrypt_get_encryption_info(struct inode *); +extern void fscrypt_put_encryption_info(struct inode *, struct fscrypt_info *); + +/* fname.c */ +extern int fscrypt_setup_filename(struct inode *, const struct qstr *, + int lookup, struct fscrypt_name *); + +static inline void fscrypt_free_filename(struct fscrypt_name *fname) +{ + kfree(fname->crypto_buf.name); +} + +extern u32 fscrypt_fname_encrypted_size(const struct inode *, u32); +extern int fscrypt_fname_alloc_buffer(const struct inode *, u32, + struct fscrypt_str *); +extern void fscrypt_fname_free_buffer(struct fscrypt_str *); +extern int fscrypt_fname_disk_to_usr(struct inode *, u32, u32, + const struct fscrypt_str *, struct fscrypt_str *); +extern int fscrypt_fname_usr_to_disk(struct inode *, const struct qstr *, + struct fscrypt_str *); + +#define FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE 32 + +/* Extracts the second-to-last ciphertext block; see explanation below */ +#define FSCRYPT_FNAME_DIGEST(name, len) \ + ((name) + round_down((len) - FS_CRYPTO_BLOCK_SIZE - 1, \ + FS_CRYPTO_BLOCK_SIZE)) + +#define FSCRYPT_FNAME_DIGEST_SIZE FS_CRYPTO_BLOCK_SIZE + +/** + * fscrypt_digested_name - alternate identifier for an on-disk filename + * + * When userspace lists an encrypted directory without access to the key, + * filenames whose ciphertext is longer than FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE + * bytes are shown in this abbreviated form (base64-encoded) rather than as the + * full ciphertext (base64-encoded). This is necessary to allow supporting + * filenames up to NAME_MAX bytes, since base64 encoding expands the length. + * + * To make it possible for filesystems to still find the correct directory entry + * despite not knowing the full on-disk name, we encode any filesystem-specific + * 'hash' and/or 'minor_hash' which the filesystem may need for its lookups, + * followed by the second-to-last ciphertext block of the filename. Due to the + * use of the CBC-CTS encryption mode, the second-to-last ciphertext block + * depends on the full plaintext. (Note that ciphertext stealing causes the + * last two blocks to appear "flipped".) This makes accidental collisions very + * unlikely: just a 1 in 2^128 chance for two filenames to collide even if they + * share the same filesystem-specific hashes. + * + * However, this scheme isn't immune to intentional collisions, which can be + * created by anyone able to create arbitrary plaintext filenames and view them + * without the key. Making the "digest" be a real cryptographic hash like + * SHA-256 over the full ciphertext would prevent this, although it would be + * less efficient and harder to implement, especially since the filesystem would + * need to calculate it for each directory entry examined during a search. + */ +struct fscrypt_digested_name { + u32 hash; + u32 minor_hash; + u8 digest[FSCRYPT_FNAME_DIGEST_SIZE]; +}; + +/** + * fscrypt_match_name() - test whether the given name matches a directory entry + * @fname: the name being searched for + * @de_name: the name from the directory entry + * @de_name_len: the length of @de_name in bytes + * + * Normally @fname->disk_name will be set, and in that case we simply compare + * that to the name stored in the directory entry. The only exception is that + * if we don't have the key for an encrypted directory and a filename in it is + * very long, then we won't have the full disk_name and we'll instead need to + * match against the fscrypt_digested_name. + * + * Return: %true if the name matches, otherwise %false. + */ +static inline bool fscrypt_match_name(const struct fscrypt_name *fname, + const u8 *de_name, u32 de_name_len) +{ + if (unlikely(!fname->disk_name.name)) { + const struct fscrypt_digested_name *n = + (const void *)fname->crypto_buf.name; + if (WARN_ON_ONCE(fname->usr_fname->name[0] != '_')) + return false; + if (de_name_len <= FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE) + return false; + return !memcmp(FSCRYPT_FNAME_DIGEST(de_name, de_name_len), + n->digest, FSCRYPT_FNAME_DIGEST_SIZE); + } + + if (de_name_len != fname->disk_name.len) + return false; + return !memcmp(de_name, fname->disk_name.name, fname->disk_name.len); +} + +/* bio.c */ +extern void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *, struct bio *); +extern void fscrypt_pullback_bio_page(struct page **, bool); +extern int fscrypt_zeroout_range(const struct inode *, pgoff_t, sector_t, + unsigned int); + +#endif /* _LINUX_FSCRYPT_SUPP_H */ diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h deleted file mode 100644 index f6dfc2950f76..000000000000 --- a/include/linux/fscrypto.h +++ /dev/null @@ -1,409 +0,0 @@ -/* - * General per-file encryption definition - * - * Copyright (C) 2015, Google, Inc. - * - * Written by Michael Halcrow, 2015. - * Modified by Jaegeuk Kim, 2015. - */ - -#ifndef _LINUX_FSCRYPTO_H -#define _LINUX_FSCRYPTO_H - -#include -#include -#include -#include -#include -#include -#include - -#define FS_KEY_DERIVATION_NONCE_SIZE 16 -#define FS_ENCRYPTION_CONTEXT_FORMAT_V1 1 - -#define FS_POLICY_FLAGS_PAD_4 0x00 -#define FS_POLICY_FLAGS_PAD_8 0x01 -#define FS_POLICY_FLAGS_PAD_16 0x02 -#define FS_POLICY_FLAGS_PAD_32 0x03 -#define FS_POLICY_FLAGS_PAD_MASK 0x03 -#define FS_POLICY_FLAGS_VALID 0x03 - -/* Encryption algorithms */ -#define FS_ENCRYPTION_MODE_INVALID 0 -#define FS_ENCRYPTION_MODE_AES_256_XTS 1 -#define FS_ENCRYPTION_MODE_AES_256_GCM 2 -#define FS_ENCRYPTION_MODE_AES_256_CBC 3 -#define FS_ENCRYPTION_MODE_AES_256_CTS 4 - -/** - * Encryption context for inode - * - * Protector format: - * 1 byte: Protector format (1 = this version) - * 1 byte: File contents encryption mode - * 1 byte: File names encryption mode - * 1 byte: Flags - * 8 bytes: Master Key descriptor - * 16 bytes: Encryption Key derivation nonce - */ -struct fscrypt_context { - u8 format; - u8 contents_encryption_mode; - u8 filenames_encryption_mode; - u8 flags; - u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; - u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE]; -} __packed; - -/* Encryption parameters */ -#define FS_XTS_TWEAK_SIZE 16 -#define FS_AES_128_ECB_KEY_SIZE 16 -#define FS_AES_256_GCM_KEY_SIZE 32 -#define FS_AES_256_CBC_KEY_SIZE 32 -#define FS_AES_256_CTS_KEY_SIZE 32 -#define FS_AES_256_XTS_KEY_SIZE 64 -#define FS_MAX_KEY_SIZE 64 - -#define FS_KEY_DESC_PREFIX "fscrypt:" -#define FS_KEY_DESC_PREFIX_SIZE 8 - -/* This is passed in from userspace into the kernel keyring */ -struct fscrypt_key { - u32 mode; - u8 raw[FS_MAX_KEY_SIZE]; - u32 size; -} __packed; - -struct fscrypt_info { - u8 ci_data_mode; - u8 ci_filename_mode; - u8 ci_flags; - struct crypto_skcipher *ci_ctfm; - u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE]; -}; - -#define FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 -#define FS_WRITE_PATH_FL 0x00000002 - -struct fscrypt_ctx { - union { - struct { - struct page *bounce_page; /* Ciphertext page */ - struct page *control_page; /* Original page */ - } w; - struct { - struct bio *bio; - struct work_struct work; - } r; - struct list_head free_list; /* Free list */ - }; - u8 flags; /* Flags */ - u8 mode; /* Encryption mode for tfm */ -}; - -struct fscrypt_completion_result { - struct completion completion; - int res; -}; - -#define DECLARE_FS_COMPLETION_RESULT(ecr) \ - struct fscrypt_completion_result ecr = { \ - COMPLETION_INITIALIZER((ecr).completion), 0 } - -#define FS_FNAME_NUM_SCATTER_ENTRIES 4 -#define FS_CRYPTO_BLOCK_SIZE 16 -#define FS_FNAME_CRYPTO_DIGEST_SIZE 32 - -/** - * For encrypted symlinks, the ciphertext length is stored at the beginning - * of the string in little-endian format. - */ -struct fscrypt_symlink_data { - __le16 len; - char encrypted_path[1]; -} __packed; - -/** - * This function is used to calculate the disk space required to - * store a filename of length l in encrypted symlink format. - */ -static inline u32 fscrypt_symlink_data_len(u32 l) -{ - if (l < FS_CRYPTO_BLOCK_SIZE) - l = FS_CRYPTO_BLOCK_SIZE; - return (l + sizeof(struct fscrypt_symlink_data) - 1); -} - -struct fscrypt_str { - unsigned char *name; - u32 len; -}; - -struct fscrypt_name { - const struct qstr *usr_fname; - struct fscrypt_str disk_name; - u32 hash; - u32 minor_hash; - struct fscrypt_str crypto_buf; -}; - -#define FSTR_INIT(n, l) { .name = n, .len = l } -#define FSTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len) -#define fname_name(p) ((p)->disk_name.name) -#define fname_len(p) ((p)->disk_name.len) - -/* - * crypto opertions for filesystems - */ -struct fscrypt_operations { - int (*get_context)(struct inode *, void *, size_t); - int (*key_prefix)(struct inode *, u8 **); - int (*prepare_context)(struct inode *); - int (*set_context)(struct inode *, const void *, size_t, void *); - int (*dummy_context)(struct inode *); - bool (*is_encrypted)(struct inode *); - bool (*empty_dir)(struct inode *); - unsigned (*max_namelen)(struct inode *); -}; - -static inline bool fscrypt_dummy_context_enabled(struct inode *inode) -{ - if (inode->i_sb->s_cop->dummy_context && - inode->i_sb->s_cop->dummy_context(inode)) - return true; - return false; -} - -static inline bool fscrypt_valid_contents_enc_mode(u32 mode) -{ - return (mode == FS_ENCRYPTION_MODE_AES_256_XTS); -} - -static inline bool fscrypt_valid_filenames_enc_mode(u32 mode) -{ - return (mode == FS_ENCRYPTION_MODE_AES_256_CTS); -} - -static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) -{ - if (str->len == 1 && str->name[0] == '.') - return true; - - if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.') - return true; - - return false; -} - -static inline struct page *fscrypt_control_page(struct page *page) -{ -#if IS_ENABLED(CONFIG_FS_ENCRYPTION) - return ((struct fscrypt_ctx *)page_private(page))->w.control_page; -#else - WARN_ON_ONCE(1); - return ERR_PTR(-EINVAL); -#endif -} - -static inline int fscrypt_has_encryption_key(struct inode *inode) -{ -#if IS_ENABLED(CONFIG_FS_ENCRYPTION) - return (inode->i_crypt_info != NULL); -#else - return 0; -#endif -} - -static inline void fscrypt_set_encrypted_dentry(struct dentry *dentry) -{ -#if IS_ENABLED(CONFIG_FS_ENCRYPTION) - spin_lock(&dentry->d_lock); - dentry->d_flags |= DCACHE_ENCRYPTED_WITH_KEY; - spin_unlock(&dentry->d_lock); -#endif -} - -#if IS_ENABLED(CONFIG_FS_ENCRYPTION) -extern const struct dentry_operations fscrypt_d_ops; -#endif - -static inline void fscrypt_set_d_op(struct dentry *dentry) -{ -#if IS_ENABLED(CONFIG_FS_ENCRYPTION) - d_set_d_op(dentry, &fscrypt_d_ops); -#endif -} - -#if IS_ENABLED(CONFIG_FS_ENCRYPTION) -/* crypto.c */ -extern struct kmem_cache *fscrypt_info_cachep; -int fscrypt_initialize(void); - -extern struct fscrypt_ctx *fscrypt_get_ctx(struct inode *, gfp_t); -extern void fscrypt_release_ctx(struct fscrypt_ctx *); -extern struct page *fscrypt_encrypt_page(struct inode *, struct page *, gfp_t); -extern int fscrypt_decrypt_page(struct page *); -extern void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *, struct bio *); -extern void fscrypt_pullback_bio_page(struct page **, bool); -extern void fscrypt_restore_control_page(struct page *); -extern int fscrypt_zeroout_range(struct inode *, pgoff_t, sector_t, - unsigned int); -/* policy.c */ -extern int fscrypt_process_policy(struct file *, const struct fscrypt_policy *); -extern int fscrypt_get_policy(struct inode *, struct fscrypt_policy *); -extern int fscrypt_has_permitted_context(struct inode *, struct inode *); -extern int fscrypt_inherit_context(struct inode *, struct inode *, - void *, bool); -/* keyinfo.c */ -extern int fscrypt_get_encryption_info(struct inode *); -extern void fscrypt_put_encryption_info(struct inode *, struct fscrypt_info *); - -/* fname.c */ -extern int fscrypt_setup_filename(struct inode *, const struct qstr *, - int lookup, struct fscrypt_name *); -extern void fscrypt_free_filename(struct fscrypt_name *); -extern u32 fscrypt_fname_encrypted_size(struct inode *, u32); -extern int fscrypt_fname_alloc_buffer(struct inode *, u32, - struct fscrypt_str *); -extern void fscrypt_fname_free_buffer(struct fscrypt_str *); -extern int fscrypt_fname_disk_to_usr(struct inode *, u32, u32, - const struct fscrypt_str *, struct fscrypt_str *); -extern int fscrypt_fname_usr_to_disk(struct inode *, const struct qstr *, - struct fscrypt_str *); -#endif - -/* crypto.c */ -static inline struct fscrypt_ctx *fscrypt_notsupp_get_ctx(struct inode *i, - gfp_t f) -{ - return ERR_PTR(-EOPNOTSUPP); -} - -static inline void fscrypt_notsupp_release_ctx(struct fscrypt_ctx *c) -{ - return; -} - -static inline struct page *fscrypt_notsupp_encrypt_page(struct inode *i, - struct page *p, gfp_t f) -{ - return ERR_PTR(-EOPNOTSUPP); -} - -static inline int fscrypt_notsupp_decrypt_page(struct page *p) -{ - return -EOPNOTSUPP; -} - -static inline void fscrypt_notsupp_decrypt_bio_pages(struct fscrypt_ctx *c, - struct bio *b) -{ - return; -} - -static inline void fscrypt_notsupp_pullback_bio_page(struct page **p, bool b) -{ - return; -} - -static inline void fscrypt_notsupp_restore_control_page(struct page *p) -{ - return; -} - -static inline int fscrypt_notsupp_zeroout_range(struct inode *i, pgoff_t p, - sector_t s, unsigned int f) -{ - return -EOPNOTSUPP; -} - -/* policy.c */ -static inline int fscrypt_notsupp_process_policy(struct file *f, - const struct fscrypt_policy *p) -{ - return -EOPNOTSUPP; -} - -static inline int fscrypt_notsupp_get_policy(struct inode *i, - struct fscrypt_policy *p) -{ - return -EOPNOTSUPP; -} - -static inline int fscrypt_notsupp_has_permitted_context(struct inode *p, - struct inode *i) -{ - return 0; -} - -static inline int fscrypt_notsupp_inherit_context(struct inode *p, - struct inode *i, void *v, bool b) -{ - return -EOPNOTSUPP; -} - -/* keyinfo.c */ -static inline int fscrypt_notsupp_get_encryption_info(struct inode *i) -{ - return -EOPNOTSUPP; -} - -static inline void fscrypt_notsupp_put_encryption_info(struct inode *i, - struct fscrypt_info *f) -{ - return; -} - - /* fname.c */ -static inline int fscrypt_notsupp_setup_filename(struct inode *dir, - const struct qstr *iname, - int lookup, struct fscrypt_name *fname) -{ - if (dir->i_sb->s_cop->is_encrypted(dir)) - return -EOPNOTSUPP; - - memset(fname, 0, sizeof(struct fscrypt_name)); - fname->usr_fname = iname; - fname->disk_name.name = (unsigned char *)iname->name; - fname->disk_name.len = iname->len; - return 0; -} - -static inline void fscrypt_notsupp_free_filename(struct fscrypt_name *fname) -{ - return; -} - -static inline u32 fscrypt_notsupp_fname_encrypted_size(struct inode *i, u32 s) -{ - /* never happens */ - WARN_ON(1); - return 0; -} - -static inline int fscrypt_notsupp_fname_alloc_buffer(struct inode *inode, - u32 ilen, struct fscrypt_str *crypto_str) -{ - return -EOPNOTSUPP; -} - -static inline void fscrypt_notsupp_fname_free_buffer(struct fscrypt_str *c) -{ - return; -} - -static inline int fscrypt_notsupp_fname_disk_to_usr(struct inode *inode, - u32 hash, u32 minor_hash, - const struct fscrypt_str *iname, - struct fscrypt_str *oname) -{ - return -EOPNOTSUPP; -} - -static inline int fscrypt_notsupp_fname_usr_to_disk(struct inode *inode, - const struct qstr *iname, - struct fscrypt_str *oname) -{ - return -EOPNOTSUPP; -} -#endif /* _LINUX_FSCRYPTO_H */ diff --git a/include/linux/key.h b/include/linux/key.h index 6a544726903e..2de88a3ebee0 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -356,7 +356,10 @@ static inline bool key_is_instantiated(const struct key *key) !test_bit(KEY_FLAG_NEGATIVE, &key->flags); } -#define rcu_dereference_key(KEY) \ +#define dereference_key_rcu(KEY) \ + (rcu_dereference((KEY)->payload.rcu_data0)) + +#define dereference_key_locked(KEY) \ (rcu_dereference_protected((KEY)->payload.rcu_data0, \ rwsem_is_locked(&((struct key *)(KEY))->sem))) diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 903a09165bb1..94fbc2776364 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -6,8 +6,8 @@ #include -#define show_dev(entry) MAJOR(entry->dev), MINOR(entry->dev) -#define show_dev_ino(entry) show_dev(entry), (unsigned long)entry->ino +#define show_dev(dev) MAJOR(dev), MINOR(dev) +#define show_dev_ino(entry) show_dev(entry->dev), (unsigned long)entry->ino TRACE_DEFINE_ENUM(NODE); TRACE_DEFINE_ENUM(DATA); @@ -15,6 +15,8 @@ TRACE_DEFINE_ENUM(META); TRACE_DEFINE_ENUM(META_FLUSH); TRACE_DEFINE_ENUM(INMEM); TRACE_DEFINE_ENUM(INMEM_DROP); +TRACE_DEFINE_ENUM(INMEM_INVALIDATE); +TRACE_DEFINE_ENUM(INMEM_REVOKE); TRACE_DEFINE_ENUM(IPU); TRACE_DEFINE_ENUM(OPU); TRACE_DEFINE_ENUM(CURSEG_HOT_DATA); @@ -42,6 +44,7 @@ TRACE_DEFINE_ENUM(CP_FASTBOOT); TRACE_DEFINE_ENUM(CP_SYNC); TRACE_DEFINE_ENUM(CP_RECOVERY); TRACE_DEFINE_ENUM(CP_DISCARD); +TRACE_DEFINE_ENUM(CP_TRIMMED); #define show_block_type(type) \ __print_symbolic(type, \ @@ -51,32 +54,40 @@ TRACE_DEFINE_ENUM(CP_DISCARD); { META_FLUSH, "META_FLUSH" }, \ { INMEM, "INMEM" }, \ { INMEM_DROP, "INMEM_DROP" }, \ + { INMEM_INVALIDATE, "INMEM_INVALIDATE" }, \ { INMEM_REVOKE, "INMEM_REVOKE" }, \ { IPU, "IN-PLACE" }, \ { OPU, "OUT-OF-PLACE" }) -#define F2FS_BIO_FLAG_MASK(t) (t & (REQ_RAHEAD | WRITE_FLUSH_FUA)) -#define F2FS_BIO_EXTRA_MASK(t) (t & (REQ_META | REQ_PRIO)) +#define F2FS_OP_FLAGS (REQ_RAHEAD | REQ_SYNC | REQ_META | REQ_PRIO | \ + REQ_PREFLUSH | REQ_FUA) +#define F2FS_BIO_FLAG_MASK(t) (t & F2FS_OP_FLAGS) -#define show_bio_type(op_flags) show_bio_op_flags(op_flags), \ - show_bio_extra(op_flags) +#define show_bio_type(op,op_flags) show_bio_op(op), \ + show_bio_op_flags(op_flags) + +#define show_bio_op(op) \ + __print_symbolic(op, \ + { REQ_OP_READ, "READ" }, \ + { REQ_OP_WRITE, "WRITE" }, \ + { REQ_OP_DISCARD, "DISCARD" }, \ + { REQ_OP_SECURE_ERASE, "SECURE_ERASE" }, \ + { REQ_OP_WRITE_SAME, "WRITE_SAME" }) #define show_bio_op_flags(flags) \ - __print_symbolic(F2FS_BIO_FLAG_MASK(flags), \ - { 0, "WRITE" }, \ - { REQ_RAHEAD, "READAHEAD" }, \ - { READ_SYNC, "READ_SYNC" }, \ - { WRITE_SYNC, "WRITE_SYNC" }, \ - { WRITE_FLUSH, "WRITE_FLUSH" }, \ - { WRITE_FUA, "WRITE_FUA" }, \ - { WRITE_FLUSH_FUA, "WRITE_FLUSH_FUA" }) + __print_flags(F2FS_BIO_FLAG_MASK(flags), "|", \ + { REQ_RAHEAD, "R" }, \ + { REQ_SYNC, "S" }, \ + { REQ_META, "M" }, \ + { REQ_PRIO, "P" }, \ + { REQ_PREFLUSH, "PF" }, \ + { REQ_FUA, "FUA" }) -#define show_bio_extra(type) \ - __print_symbolic(F2FS_BIO_EXTRA_MASK(type), \ - { REQ_META, "(M)" }, \ - { REQ_PRIO, "(P)" }, \ - { REQ_META | REQ_PRIO, "(MP)" }, \ - { 0, " \b" }) +#define show_block_temp(temp) \ + __print_symbolic(temp, \ + { HOT, "HOT" }, \ + { WARM, "WARM" }, \ + { COLD, "COLD" }) #define show_data_type(type) \ __print_symbolic(type, \ @@ -109,12 +120,14 @@ TRACE_DEFINE_ENUM(CP_DISCARD); { GC_CB, "Cost-Benefit" }) #define show_cpreason(type) \ - __print_symbolic(type, \ + __print_flags(type, "|", \ { CP_UMOUNT, "Umount" }, \ { CP_FASTBOOT, "Fastboot" }, \ { CP_SYNC, "Sync" }, \ { CP_RECOVERY, "Recovery" }, \ - { CP_DISCARD, "Discard" }) + { CP_DISCARD, "Discard" }, \ + { CP_UMOUNT, "Umount" }, \ + { CP_TRIMMED, "Trimmed" }) struct victim_sel_policy; struct f2fs_map_blocks; @@ -237,7 +250,7 @@ TRACE_EVENT(f2fs_sync_fs, ), TP_printk("dev = (%d,%d), superblock is %s, wait = %d", - show_dev(__entry), + show_dev(__entry->dev), __entry->dirty ? "dirty" : "not dirty", __entry->wait) ); @@ -307,6 +320,13 @@ DEFINE_EVENT(f2fs__inode_exit, f2fs_unlink_exit, TP_ARGS(inode, ret) ); +DEFINE_EVENT(f2fs__inode_exit, f2fs_drop_inode, + + TP_PROTO(struct inode *inode, int ret), + + TP_ARGS(inode, ret) +); + DEFINE_EVENT(f2fs__inode, f2fs_truncate, TP_PROTO(struct inode *inode), @@ -516,14 +536,14 @@ TRACE_EVENT(f2fs_map_blocks, TRACE_EVENT(f2fs_background_gc, - TP_PROTO(struct super_block *sb, long wait_ms, + TP_PROTO(struct super_block *sb, unsigned int wait_ms, unsigned int prefree, unsigned int free), TP_ARGS(sb, wait_ms, prefree, free), TP_STRUCT__entry( __field(dev_t, dev) - __field(long, wait_ms) + __field(unsigned int, wait_ms) __field(unsigned int, prefree) __field(unsigned int, free) ), @@ -535,13 +555,120 @@ TRACE_EVENT(f2fs_background_gc, __entry->free = free; ), - TP_printk("dev = (%d,%d), wait_ms = %ld, prefree = %u, free = %u", - show_dev(__entry), + TP_printk("dev = (%d,%d), wait_ms = %u, prefree = %u, free = %u", + show_dev(__entry->dev), __entry->wait_ms, __entry->prefree, __entry->free) ); +TRACE_EVENT(f2fs_gc_begin, + + TP_PROTO(struct super_block *sb, bool sync, bool background, + long long dirty_nodes, long long dirty_dents, + long long dirty_imeta, unsigned int free_sec, + unsigned int free_seg, int reserved_seg, + unsigned int prefree_seg), + + TP_ARGS(sb, sync, background, dirty_nodes, dirty_dents, dirty_imeta, + free_sec, free_seg, reserved_seg, prefree_seg), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(bool, sync) + __field(bool, background) + __field(long long, dirty_nodes) + __field(long long, dirty_dents) + __field(long long, dirty_imeta) + __field(unsigned int, free_sec) + __field(unsigned int, free_seg) + __field(int, reserved_seg) + __field(unsigned int, prefree_seg) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->sync = sync; + __entry->background = background; + __entry->dirty_nodes = dirty_nodes; + __entry->dirty_dents = dirty_dents; + __entry->dirty_imeta = dirty_imeta; + __entry->free_sec = free_sec; + __entry->free_seg = free_seg; + __entry->reserved_seg = reserved_seg; + __entry->prefree_seg = prefree_seg; + ), + + TP_printk("dev = (%d,%d), sync = %d, background = %d, nodes = %lld, " + "dents = %lld, imeta = %lld, free_sec:%u, free_seg:%u, " + "rsv_seg:%d, prefree_seg:%u", + show_dev(__entry->dev), + __entry->sync, + __entry->background, + __entry->dirty_nodes, + __entry->dirty_dents, + __entry->dirty_imeta, + __entry->free_sec, + __entry->free_seg, + __entry->reserved_seg, + __entry->prefree_seg) +); + +TRACE_EVENT(f2fs_gc_end, + + TP_PROTO(struct super_block *sb, int ret, int seg_freed, + int sec_freed, long long dirty_nodes, + long long dirty_dents, long long dirty_imeta, + unsigned int free_sec, unsigned int free_seg, + int reserved_seg, unsigned int prefree_seg), + + TP_ARGS(sb, ret, seg_freed, sec_freed, dirty_nodes, dirty_dents, + dirty_imeta, free_sec, free_seg, reserved_seg, prefree_seg), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, ret) + __field(int, seg_freed) + __field(int, sec_freed) + __field(long long, dirty_nodes) + __field(long long, dirty_dents) + __field(long long, dirty_imeta) + __field(unsigned int, free_sec) + __field(unsigned int, free_seg) + __field(int, reserved_seg) + __field(unsigned int, prefree_seg) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->ret = ret; + __entry->seg_freed = seg_freed; + __entry->sec_freed = sec_freed; + __entry->dirty_nodes = dirty_nodes; + __entry->dirty_dents = dirty_dents; + __entry->dirty_imeta = dirty_imeta; + __entry->free_sec = free_sec; + __entry->free_seg = free_seg; + __entry->reserved_seg = reserved_seg; + __entry->prefree_seg = prefree_seg; + ), + + TP_printk("dev = (%d,%d), ret = %d, seg_freed = %d, sec_freed = %d, " + "nodes = %lld, dents = %lld, imeta = %lld, free_sec:%u, " + "free_seg:%u, rsv_seg:%d, prefree_seg:%u", + show_dev(__entry->dev), + __entry->ret, + __entry->seg_freed, + __entry->sec_freed, + __entry->dirty_nodes, + __entry->dirty_dents, + __entry->dirty_imeta, + __entry->free_sec, + __entry->free_seg, + __entry->reserved_seg, + __entry->prefree_seg) +); + TRACE_EVENT(f2fs_get_victim, TP_PROTO(struct super_block *sb, int type, int gc_type, @@ -557,6 +684,7 @@ TRACE_EVENT(f2fs_get_victim, __field(int, alloc_mode) __field(int, gc_mode) __field(unsigned int, victim) + __field(unsigned int, cost) __field(unsigned int, ofs_unit) __field(unsigned int, pre_victim) __field(unsigned int, prefree) @@ -570,20 +698,23 @@ TRACE_EVENT(f2fs_get_victim, __entry->alloc_mode = p->alloc_mode; __entry->gc_mode = p->gc_mode; __entry->victim = p->min_segno; + __entry->cost = p->min_cost; __entry->ofs_unit = p->ofs_unit; __entry->pre_victim = pre_victim; __entry->prefree = prefree; __entry->free = free; ), - TP_printk("dev = (%d,%d), type = %s, policy = (%s, %s, %s), victim = %u " - "ofs_unit = %u, pre_victim_secno = %d, prefree = %u, free = %u", - show_dev(__entry), + TP_printk("dev = (%d,%d), type = %s, policy = (%s, %s, %s), " + "victim = %u, cost = %u, ofs_unit = %u, " + "pre_victim_secno = %d, prefree = %u, free = %u", + show_dev(__entry->dev), show_data_type(__entry->type), show_gc_type(__entry->gc_type), show_alloc_mode(__entry->alloc_mode), show_victim_policy(__entry->gc_mode), __entry->victim, + __entry->cost, __entry->ofs_unit, (int)__entry->pre_victim, __entry->prefree, @@ -715,7 +846,7 @@ TRACE_EVENT(f2fs_reserve_new_blocks, ), TP_printk("dev = (%d,%d), nid = %u, ofs_in_node = %u, count = %llu", - show_dev(__entry), + show_dev(__entry->dev), (unsigned int)__entry->nid, __entry->ofs_in_node, (unsigned long long)__entry->count) @@ -735,6 +866,7 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, __field(block_t, new_blkaddr) __field(int, op) __field(int, op_flags) + __field(int, temp) __field(int, type) ), @@ -746,16 +878,18 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, __entry->new_blkaddr = fio->new_blkaddr; __entry->op = fio->op; __entry->op_flags = fio->op_flags; + __entry->temp = fio->temp; __entry->type = fio->type; ), TP_printk("dev = (%d,%d), ino = %lu, page_index = 0x%lx, " - "oldaddr = 0x%llx, newaddr = 0x%llx, rw = %s%s, type = %s", + "oldaddr = 0x%llx, newaddr = 0x%llx, rw = %s(%s), type = %s_%s", show_dev_ino(__entry), (unsigned long)__entry->index, (unsigned long long)__entry->old_blkaddr, (unsigned long long)__entry->new_blkaddr, - show_bio_type(__entry->op_flags), + show_bio_type(__entry->op, __entry->op_flags), + show_block_temp(__entry->temp), show_block_type(__entry->type)) ); @@ -768,7 +902,7 @@ DEFINE_EVENT_CONDITION(f2fs__submit_page_bio, f2fs_submit_page_bio, TP_CONDITION(page->mapping) ); -DEFINE_EVENT_CONDITION(f2fs__submit_page_bio, f2fs_submit_page_mbio, +DEFINE_EVENT_CONDITION(f2fs__submit_page_bio, f2fs_submit_page_write, TP_PROTO(struct page *page, struct f2fs_io_info *fio), @@ -777,15 +911,15 @@ DEFINE_EVENT_CONDITION(f2fs__submit_page_bio, f2fs_submit_page_mbio, TP_CONDITION(page->mapping) ); -DECLARE_EVENT_CLASS(f2fs__submit_bio, +DECLARE_EVENT_CLASS(f2fs__bio, - TP_PROTO(struct super_block *sb, struct f2fs_io_info *fio, - struct bio *bio), + TP_PROTO(struct super_block *sb, int type, struct bio *bio), - TP_ARGS(sb, fio, bio), + TP_ARGS(sb, type, bio), TP_STRUCT__entry( __field(dev_t, dev) + __field(dev_t, target) __field(int, op) __field(int, op_flags) __field(int, type) @@ -795,37 +929,55 @@ DECLARE_EVENT_CLASS(f2fs__submit_bio, TP_fast_assign( __entry->dev = sb->s_dev; - __entry->op = fio->op; - __entry->op_flags = fio->op_flags; - __entry->type = fio->type; + __entry->target = bio->bi_bdev->bd_dev; + __entry->op = bio_op(bio); + __entry->op_flags = bio->bi_opf; + __entry->type = type; __entry->sector = bio->bi_iter.bi_sector; __entry->size = bio->bi_iter.bi_size; ), - TP_printk("dev = (%d,%d), rw = %s%s, %s, sector = %lld, size = %u", - show_dev(__entry), - show_bio_type(__entry->op_flags), + TP_printk("dev = (%d,%d)/(%d,%d), rw = %s(%s), %s, sector = %lld, size = %u", + show_dev(__entry->target), + show_dev(__entry->dev), + show_bio_type(__entry->op, __entry->op_flags), show_block_type(__entry->type), (unsigned long long)__entry->sector, __entry->size) ); -DEFINE_EVENT_CONDITION(f2fs__submit_bio, f2fs_submit_write_bio, +DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_prepare_write_bio, - TP_PROTO(struct super_block *sb, struct f2fs_io_info *fio, - struct bio *bio), + TP_PROTO(struct super_block *sb, int type, struct bio *bio), - TP_ARGS(sb, fio, bio), + TP_ARGS(sb, type, bio), TP_CONDITION(bio) ); -DEFINE_EVENT_CONDITION(f2fs__submit_bio, f2fs_submit_read_bio, +DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_prepare_read_bio, - TP_PROTO(struct super_block *sb, struct f2fs_io_info *fio, - struct bio *bio), + TP_PROTO(struct super_block *sb, int type, struct bio *bio), - TP_ARGS(sb, fio, bio), + TP_ARGS(sb, type, bio), + + TP_CONDITION(bio) +); + +DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_submit_read_bio, + + TP_PROTO(struct super_block *sb, int type, struct bio *bio), + + TP_ARGS(sb, type, bio), + + TP_CONDITION(bio) +); + +DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_submit_write_bio, + + TP_PROTO(struct super_block *sb, int type, struct bio *bio), + + TP_ARGS(sb, type, bio), TP_CONDITION(bio) ); @@ -1084,16 +1236,16 @@ TRACE_EVENT(f2fs_write_checkpoint, ), TP_printk("dev = (%d,%d), checkpoint for %s, state = %s", - show_dev(__entry), + show_dev(__entry->dev), show_cpreason(__entry->reason), __entry->msg) ); -TRACE_EVENT(f2fs_issue_discard, +DECLARE_EVENT_CLASS(f2fs_discard, - TP_PROTO(struct super_block *sb, block_t blkstart, block_t blklen), + TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen), - TP_ARGS(sb, blkstart, blklen), + TP_ARGS(dev, blkstart, blklen), TP_STRUCT__entry( __field(dev_t, dev) @@ -1102,40 +1254,78 @@ TRACE_EVENT(f2fs_issue_discard, ), TP_fast_assign( - __entry->dev = sb->s_dev; + __entry->dev = dev->bd_dev; __entry->blkstart = blkstart; __entry->blklen = blklen; ), TP_printk("dev = (%d,%d), blkstart = 0x%llx, blklen = 0x%llx", - show_dev(__entry), + show_dev(__entry->dev), (unsigned long long)__entry->blkstart, (unsigned long long)__entry->blklen) ); +DEFINE_EVENT(f2fs_discard, f2fs_queue_discard, + + TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen), + + TP_ARGS(dev, blkstart, blklen) +); + +DEFINE_EVENT(f2fs_discard, f2fs_issue_discard, + + TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen), + + TP_ARGS(dev, blkstart, blklen) +); + +TRACE_EVENT(f2fs_issue_reset_zone, + + TP_PROTO(struct block_device *dev, block_t blkstart), + + TP_ARGS(dev, blkstart), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(block_t, blkstart) + ), + + TP_fast_assign( + __entry->dev = dev->bd_dev; + __entry->blkstart = blkstart; + ), + + TP_printk("dev = (%d,%d), reset zone at block = 0x%llx", + show_dev(__entry->dev), + (unsigned long long)__entry->blkstart) +); + TRACE_EVENT(f2fs_issue_flush, - TP_PROTO(struct super_block *sb, unsigned int nobarrier, - unsigned int flush_merge), + TP_PROTO(struct block_device *dev, unsigned int nobarrier, + unsigned int flush_merge, int ret), - TP_ARGS(sb, nobarrier, flush_merge), + TP_ARGS(dev, nobarrier, flush_merge, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, nobarrier) __field(unsigned int, flush_merge) + __field(int, ret) ), TP_fast_assign( - __entry->dev = sb->s_dev; + __entry->dev = dev->bd_dev; __entry->nobarrier = nobarrier; __entry->flush_merge = flush_merge; + __entry->ret = ret; ), - TP_printk("dev = (%d,%d), %s %s", - show_dev(__entry), + TP_printk("dev = (%d,%d), %s %s, ret = %d", + show_dev(__entry->dev), __entry->nobarrier ? "skip (nobarrier)" : "issue", - __entry->flush_merge ? " with flush_merge" : "") + __entry->flush_merge ? " with flush_merge" : "", + __entry->ret) ); TRACE_EVENT(f2fs_lookup_extent_tree_start, @@ -1248,7 +1438,7 @@ TRACE_EVENT(f2fs_shrink_extent_tree, ), TP_printk("dev = (%d,%d), shrunk: node_cnt = %u, tree_cnt = %u", - show_dev(__entry), + show_dev(__entry->dev), __entry->node_cnt, __entry->tree_cnt) ); @@ -1295,7 +1485,7 @@ DECLARE_EVENT_CLASS(f2fs_sync_dirty_inodes, ), TP_printk("dev = (%d,%d), %s, dirty count = %lld", - show_dev(__entry), + show_dev(__entry->dev), show_file_type(__entry->type), __entry->count) ); diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 11c030782426..7683188755b1 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -257,18 +257,47 @@ struct fsxattr { /* Policy provided via an ioctl on the topmost directory */ #define FS_KEY_DESCRIPTOR_SIZE 8 +#define FS_POLICY_FLAGS_PAD_4 0x00 +#define FS_POLICY_FLAGS_PAD_8 0x01 +#define FS_POLICY_FLAGS_PAD_16 0x02 +#define FS_POLICY_FLAGS_PAD_32 0x03 +#define FS_POLICY_FLAGS_PAD_MASK 0x03 +#define FS_POLICY_FLAGS_VALID 0x03 + +/* Encryption algorithms */ +#define FS_ENCRYPTION_MODE_INVALID 0 +#define FS_ENCRYPTION_MODE_AES_256_XTS 1 +#define FS_ENCRYPTION_MODE_AES_256_GCM 2 +#define FS_ENCRYPTION_MODE_AES_256_CBC 3 +#define FS_ENCRYPTION_MODE_AES_256_CTS 4 +#define FS_ENCRYPTION_MODE_AES_128_CBC 5 +#define FS_ENCRYPTION_MODE_AES_128_CTS 6 + struct fscrypt_policy { __u8 version; __u8 contents_encryption_mode; __u8 filenames_encryption_mode; __u8 flags; __u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; -} __packed; +}; #define FS_IOC_SET_ENCRYPTION_POLICY _IOR('f', 19, struct fscrypt_policy) #define FS_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16]) #define FS_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct fscrypt_policy) +/* Parameters for passing an encryption key into the kernel keyring */ +#define FS_KEY_DESC_PREFIX "fscrypt:" +#define FS_KEY_DESC_PREFIX_SIZE 8 + +/* Structure that userspace passes to the kernel keyring */ +#define FS_MAX_KEY_SIZE 64 + +struct fscrypt_key { + __u32 mode; + __u8 raw[FS_MAX_KEY_SIZE]; + __u32 size; +}; + /* * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) * diff --git a/lib/digsig.c b/lib/digsig.c index 55b8b2f41a9e..03d7c63837ae 100644 --- a/lib/digsig.c +++ b/lib/digsig.c @@ -85,7 +85,7 @@ static int digsig_verify_rsa(struct key *key, struct pubkey_hdr *pkh; down_read(&key->sem); - ukp = user_key_payload(key); + ukp = user_key_payload_locked(key); if (ukp->datalen < sizeof(*pkh)) goto err1; diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c index ecc28cff08ab..d502c94b1a82 100644 --- a/net/dns_resolver/dns_query.c +++ b/net/dns_resolver/dns_query.c @@ -70,7 +70,7 @@ int dns_query(const char *type, const char *name, size_t namelen, const char *options, char **_result, time64_t *_expiry) { struct key *rkey; - const struct user_key_payload *upayload; + struct user_key_payload *upayload; const struct cred *saved_cred; size_t typelen, desclen; char *desc, *cp; @@ -141,7 +141,7 @@ int dns_query(const char *type, const char *name, size_t namelen, if (ret) goto put; - upayload = user_key_payload(rkey); + upayload = user_key_payload_locked(rkey); len = upayload->datalen; ret = -ENOMEM; diff --git a/security/keys/dh.c b/security/keys/dh.c index 531ed2ec132f..893af4c45038 100644 --- a/security/keys/dh.c +++ b/security/keys/dh.c @@ -55,7 +55,7 @@ static ssize_t mpi_from_key(key_serial_t keyid, size_t maxlen, MPI *mpi) if (status == 0) { const struct user_key_payload *payload; - payload = user_key_payload(key); + payload = user_key_payload_locked(key); if (maxlen == 0) { *mpi = NULL; diff --git a/security/keys/encrypted-keys/encrypted.c b/security/keys/encrypted-keys/encrypted.c index 8d9330aba77e..a62eba231899 100644 --- a/security/keys/encrypted-keys/encrypted.c +++ b/security/keys/encrypted-keys/encrypted.c @@ -314,7 +314,7 @@ static struct key *request_user_key(const char *master_desc, const u8 **master_k goto error; down_read(&ukey->sem); - upayload = user_key_payload(ukey); + upayload = user_key_payload_locked(ukey); *master_key = upayload->data; *master_keylen = upayload->datalen; error: @@ -927,7 +927,7 @@ static long encrypted_read(const struct key *key, char __user *buffer, size_t asciiblob_len; int ret; - epayload = rcu_dereference_key(key); + epayload = dereference_key_locked(key); /* returns the hex encoded iv, encrypted-data, and hmac as ascii */ asciiblob_len = epayload->datablob_len + ivsize + 1 diff --git a/security/keys/trusted.c b/security/keys/trusted.c index 90d61751ff12..2ae31c5a87de 100644 --- a/security/keys/trusted.c +++ b/security/keys/trusted.c @@ -1140,12 +1140,12 @@ out: static long trusted_read(const struct key *key, char __user *buffer, size_t buflen) { - struct trusted_key_payload *p; + const struct trusted_key_payload *p; char *ascii_buf; char *bufp; int i; - p = rcu_dereference_key(key); + p = dereference_key_locked(key); if (!p) return -EINVAL; if (!buffer || buflen <= 0) diff --git a/security/keys/user_defined.c b/security/keys/user_defined.c index 66b1840b4110..fa880f6a1924 100644 --- a/security/keys/user_defined.c +++ b/security/keys/user_defined.c @@ -107,7 +107,7 @@ int user_update(struct key *key, struct key_preparsed_payload *prep) /* attach the new data, displacing the old */ key->expiry = prep->expiry; if (!test_bit(KEY_FLAG_NEGATIVE, &key->flags)) - zap = rcu_dereference_key(key); + zap = dereference_key_locked(key); rcu_assign_keypointer(key, prep->payload.data[0]); prep->payload.data[0] = NULL; @@ -123,7 +123,7 @@ EXPORT_SYMBOL_GPL(user_update); */ void user_revoke(struct key *key) { - struct user_key_payload *upayload = key->payload.data[0]; + struct user_key_payload *upayload = user_key_payload_locked(key); /* clear the quota */ key_payload_reserve(key, 0); @@ -169,7 +169,7 @@ long user_read(const struct key *key, char __user *buffer, size_t buflen) const struct user_key_payload *upayload; long ret; - upayload = user_key_payload(key); + upayload = user_key_payload_locked(key); ret = upayload->datalen; /* we can return the data as is */ From ed0b958299be6373655210d26b5b8af555f92968 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 2 Oct 2017 02:50:16 +0800 Subject: [PATCH 364/555] UPSTREAM: f2fs: fix potential panic during fstrim As Ju Hyung Park reported: "When 'fstrim' is called for manual trim, a BUG() can be triggered randomly with this patch. I'm seeing this issue on both x86 Desktop and arm64 Android phone. On x86 Desktop, this was caused during Ubuntu boot-up. I have a cronjob installed which calls 'fstrim -v /' during boot. On arm64 Android, this was caused during GC looping with 1ms gc_min_sleep_time & gc_max_sleep_time." Root cause of this issue is that f2fs_wait_discard_bios can only be used by f2fs_put_super, because during put_super there must be no other referrers, so it can ignore discard entry's reference count when removing the entry, otherwise in other caller we will hit bug_on in __remove_discard_cmd as there may be other issuer added reference count in discard entry. Thread A Thread B - issue_discard_thread - f2fs_ioc_fitrim - f2fs_trim_fs - f2fs_wait_discard_bios - __issue_discard_cmd - __submit_discard_cmd - __wait_discard_cmd - dc->ref++ - __wait_one_discard_bio - __wait_discard_cmd - __remove_discard_cmd - f2fs_bug_on(sbi, dc->ref) Change-Id: I8fb5c8215e6222ae853e7781218d5084e1f11166 Fixes: 969d1b180d987c2be02de890d0fff0f66a0e80de Reported-by: Ju Hyung Park Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim (cherry picked from commit 638164a2718f337ea224b747cf5977ef143166a4) --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/segment.c | 6 +++--- fs/f2fs/super.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ad16c1dd416f..461e22d0a452 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2553,7 +2553,7 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new); void stop_discard_thread(struct f2fs_sb_info *sbi); -void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); +void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi, bool umount); void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); void release_discard_addrs(struct f2fs_sb_info *sbi); int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e71b37b00b4a..951ebd59643b 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1210,11 +1210,11 @@ void stop_discard_thread(struct f2fs_sb_info *sbi) } /* This comes from f2fs_put_super and f2fs_trim_fs */ -void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) +void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi, bool umount) { __issue_discard_cmd(sbi, false); __drop_discard_cmd(sbi); - __wait_discard_cmd(sbi, false); + __wait_discard_cmd(sbi, !umount); } static void mark_discard_range_all(struct f2fs_sb_info *sbi) @@ -2244,7 +2244,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) } /* It's time to issue all the filed discards */ mark_discard_range_all(sbi); - f2fs_wait_discard_bios(sbi); + f2fs_wait_discard_bios(sbi, false); out: range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); return err; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 511bcecf522a..da59996a859f 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -801,7 +801,7 @@ static void f2fs_put_super(struct super_block *sb) } /* be sure to wait for any on-going discard commands */ - f2fs_wait_discard_bios(sbi); + f2fs_wait_discard_bios(sbi, true); if (f2fs_discard_en(sbi) && !sbi->discard_blks) { struct cp_control cpc = { From 5c73594e214f2328112d6ace6098a5a99860ec36 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 13 Oct 2017 10:27:45 -0700 Subject: [PATCH 365/555] FROMLIST: f2fs: expose some sectors to user in inline data or dentry case (from https://patchwork.kernel.org/patch/10005409/) If there's some data written through inline data or dentry, we need to shouw st_blocks. This fixes reporting zero blocks even though there is small written data. Bug: 67651285 Bug: 67600404 Change-Id: I9ad5cb137eb627b9fd22740d2ab98e0221433c95 Cc: stable@vger.kernel.org Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c29d723ae337..658291ca4c01 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -664,6 +664,11 @@ int f2fs_getattr(struct vfsmount *mnt, { struct inode *inode = d_inode(dentry); generic_fillattr(inode, stat); + + /* we need to show initial sectors used for inline_data/dentries */ + if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) + stat->blocks += (stat->size + 511) >> 9; + return 0; } From c90c6bfac6dfd6a7367f718df370b6a19756efe5 Mon Sep 17 00:00:00 2001 From: Behan Webster Date: Fri, 21 Apr 2017 11:20:01 -0700 Subject: [PATCH 366/555] UPSTREAM: kbuild: Add better clang cross build support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add cross target to CC if using clang. Also add custom gcc toolchain path for fallback gcc tools. Clang will fallback to using things like ld, as, and libgcc if (respectively) one of the llvm linkers isn't available, the integrated assembler is turned off, or an appropriately cross-compiled version of compiler-rt isn't available. To this end, you can specify the path to this fallback gcc toolchain with GCC_TOOLCHAIN. Signed-off-by: Behan Webster Reviewed-by: Jan-Simon Möller Reviewed-by: Mark Charlebois Signed-off-by: Greg Hackmann Signed-off-by: Matthias Kaehlcke Signed-off-by: Masahiro Yamada (cherry picked from commit 785f11aa595bc3d4e74096cbd598ada54ecc0d81) --- Makefile | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Makefile b/Makefile index feab5f5a507c..6595969ed89b 100644 --- a/Makefile +++ b/Makefile @@ -699,6 +699,15 @@ endif KBUILD_CFLAGS += $(stackp-flag) ifeq ($(cc-name),clang) +ifneq ($(CROSS_COMPILE),) +CLANG_TARGET := -target $(notdir $(CROSS_COMPILE:%-=%)) +GCC_TOOLCHAIN := $(realpath $(dir $(shell which $(LD)))/..) +endif +ifneq ($(GCC_TOOLCHAIN),) +CLANG_GCC_TC := -gcc-toolchain $(GCC_TOOLCHAIN) +endif +KBUILD_CFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC) +KBUILD_AFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC) KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,) KBUILD_CPPFLAGS += $(call cc-option,-Wno-unknown-warning-option,) KBUILD_CFLAGS += $(call cc-disable-warning, unused-variable) From dda20e821420e8b5d190317db77a7630b8e8a294 Mon Sep 17 00:00:00 2001 From: Michael Davidson Date: Tue, 25 Apr 2017 15:47:35 -0700 Subject: [PATCH 367/555] UPSTREAM: kbuild: clang: add -no-integrated-as to KBUILD_[AC]FLAGS The Linux Kernel relies on GCC's acceptance of inline assembly as an opaque object which will not have any validation performed on the content. The current behaviour in LLVM is to perform validation of the contents by means of parsing the input if the MC layer can handle it. Disable clangs integrated assembler and use the GNU assembler instead. Wording-mostly-from: Saleem Abdulrasool Signed-off-by: Michael Davidson Signed-off-by: Matthias Kaehlcke Signed-off-by: Masahiro Yamada (cherry picked from commit a37c45cd82e62a361706b9688a984a3a63957321) --- Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile b/Makefile index 6595969ed89b..7560f0c03526 100644 --- a/Makefile +++ b/Makefile @@ -720,6 +720,8 @@ KBUILD_CFLAGS += $(call cc-disable-warning, tautological-compare) # See modpost pattern 2 KBUILD_CFLAGS += $(call cc-option, -mno-global-merge,) KBUILD_CFLAGS += $(call cc-option, -fcatch-undefined-behavior) +KBUILD_CFLAGS += $(call cc-option, -no-integrated-as) +KBUILD_AFLAGS += $(call cc-option, -no-integrated-as) else # These warnings generated too much noise in a regular build. From 67259a687d26ee48f0f758379ca410f55194f9ff Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Wed, 12 Apr 2017 12:43:52 -0700 Subject: [PATCH 368/555] UPSTREAM: kbuild: Consolidate header generation from ASM offset information Largely redundant code is used in different places to generate C headers from offset information extracted from assembly language output. Consolidate the code in Makefile.lib and use this instead. Signed-off-by: Matthias Kaehlcke Signed-off-by: Masahiro Yamada (cherry picked from commit ebf003f0cfb3705e60d40dedc3ec949176c741af) --- Kbuild | 25 ------------------------- arch/ia64/kernel/Makefile | 26 ++------------------------ scripts/Makefile.lib | 28 ++++++++++++++++++++++++++++ scripts/mod/Makefile | 28 ++-------------------------- 4 files changed, 32 insertions(+), 75 deletions(-) diff --git a/Kbuild b/Kbuild index 3d0ae152af7c..94c752762bc2 100644 --- a/Kbuild +++ b/Kbuild @@ -7,31 +7,6 @@ # 4) Check for missing system calls # 5) Generate constants.py (may need bounds.h) -# Default sed regexp - multiline due to syntax constraints -define sed-y - "/^->/{s:->#\(.*\):/* \1 */:; \ - s:^->\([^ ]*\) [\$$#]*\([-0-9]*\) \(.*\):#define \1 \2 /* \3 */:; \ - s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; \ - s:->::; p;}" -endef - -# Use filechk to avoid rebuilds when a header changes, but the resulting file -# does not -define filechk_offsets - (set -e; \ - echo "#ifndef $2"; \ - echo "#define $2"; \ - echo "/*"; \ - echo " * DO NOT MODIFY."; \ - echo " *"; \ - echo " * This file was generated by Kbuild"; \ - echo " */"; \ - echo ""; \ - sed -ne $(sed-y); \ - echo ""; \ - echo "#endif" ) -endef - ##### # 1) Generate bounds.h diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile index 3686d6abafde..9edda5466020 100644 --- a/arch/ia64/kernel/Makefile +++ b/arch/ia64/kernel/Makefile @@ -50,32 +50,10 @@ CFLAGS_traps.o += -mfixed-range=f2-f5,f16-f31 # The gate DSO image is built using a special linker script. include $(src)/Makefile.gate -# Calculate NR_IRQ = max(IA64_NATIVE_NR_IRQS, XEN_NR_IRQS, ...) based on config -define sed-y - "/^->/{s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; s:->::; p;}" -endef -quiet_cmd_nr_irqs = GEN $@ -define cmd_nr_irqs - (set -e; \ - echo "#ifndef __ASM_NR_IRQS_H__"; \ - echo "#define __ASM_NR_IRQS_H__"; \ - echo "/*"; \ - echo " * DO NOT MODIFY."; \ - echo " *"; \ - echo " * This file was generated by Kbuild"; \ - echo " *"; \ - echo " */"; \ - echo ""; \ - sed -ne $(sed-y) $<; \ - echo ""; \ - echo "#endif" ) > $@ -endef - # We use internal kbuild rules to avoid the "is up to date" message from make arch/$(SRCARCH)/kernel/nr-irqs.s: arch/$(SRCARCH)/kernel/nr-irqs.c $(Q)mkdir -p $(dir $@) $(call if_changed_dep,cc_s_c) -include/generated/nr-irqs.h: arch/$(SRCARCH)/kernel/nr-irqs.s - $(Q)mkdir -p $(dir $@) - $(call cmd,nr_irqs) +include/generated/nr-irqs.h: arch/$(SRCARCH)/kernel/nr-irqs.s FORCE + $(call filechk,offsets,__ASM_NR_IRQS_H__) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index d3d3320722cb..530fae970c0a 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -414,3 +414,31 @@ quiet_cmd_xzmisc = XZMISC $@ cmd_xzmisc = (cat $(filter-out FORCE,$^) | \ xz --check=crc32 --lzma2=dict=1MiB) > $@ || \ (rm -f $@ ; false) + +# ASM offsets +# --------------------------------------------------------------------------- + +# Default sed regexp - multiline due to syntax constraints +define sed-offsets + "/^->/{s:->#\(.*\):/* \1 */:; \ + s:^->\([^ ]*\) [\$$#]*\([-0-9]*\) \(.*\):#define \1 \2 /* \3 */:; \ + s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; \ + s:->::; p;}" +endef + +# Use filechk to avoid rebuilds when a header changes, but the resulting file +# does not +define filechk_offsets + (set -e; \ + echo "#ifndef $2"; \ + echo "#define $2"; \ + echo "/*"; \ + echo " * DO NOT MODIFY."; \ + echo " *"; \ + echo " * This file was generated by Kbuild"; \ + echo " */"; \ + echo ""; \ + sed -ne $(sed-offsets); \ + echo ""; \ + echo "#endif" ) +endef diff --git a/scripts/mod/Makefile b/scripts/mod/Makefile index 19d9bcadc0cc..b497d9764dcf 100644 --- a/scripts/mod/Makefile +++ b/scripts/mod/Makefile @@ -7,32 +7,8 @@ modpost-objs := modpost.o file2alias.o sumversion.o devicetable-offsets-file := devicetable-offsets.h -define sed-y - "/^->/{s:->#\(.*\):/* \1 */:; \ - s:^->\([^ ]*\) [\$$#]*\([-0-9]*\) \(.*\):#define \1 \2 /* \3 */:; \ - s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; \ - s:->::; p;}" -endef - -quiet_cmd_offsets = GEN $@ -define cmd_offsets - (set -e; \ - echo "#ifndef __DEVICETABLE_OFFSETS_H__"; \ - echo "#define __DEVICETABLE_OFFSETS_H__"; \ - echo "/*"; \ - echo " * DO NOT MODIFY."; \ - echo " *"; \ - echo " * This file was generated by Kbuild"; \ - echo " *"; \ - echo " */"; \ - echo ""; \ - sed -ne $(sed-y) $<; \ - echo ""; \ - echo "#endif" ) > $@ -endef - -$(obj)/$(devicetable-offsets-file): $(obj)/devicetable-offsets.s - $(call if_changed,offsets) +$(obj)/$(devicetable-offsets-file): $(obj)/devicetable-offsets.s FORCE + $(call filechk,offsets,__DEVICETABLE_OFFSETS_H__) targets += $(devicetable-offsets-file) devicetable-offsets.s From 9ba72a2322625c03ad04d5857c6655091c25851a Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 21 Apr 2017 15:21:10 +0900 Subject: [PATCH 369/555] UPSTREAM: kbuild: consolidate redundant sed script ASM offset generation This part ended up in redundant code after touched by multiple people. [1] Commit 3234282f33b2 ("x86, asm: Fix CFI macro invocations to deal with shortcomings in gas") added parentheses for defined expressions to support old gas for x86. [2] Commit a22dcdb0032c ("x86, asm: Fix ancient-GAS workaround") split the pattern into two to avoid parentheses for non-numeric expressions. [3] Commit 95a2f6f72d37 ("Partially revert patch that encloses asm-offset.h numbers in brackets") removed parentheses from numeric expressions as well because parentheses in MN10300 assembly have a special meaning (pointer access). Apparently, there is a conflict between [1] and [3]. After all, [3] took precedence, and a long time has passed since then. Now, merge the two patterns again because the first one is covered by the other. Signed-off-by: Masahiro Yamada Reviewed-by: Matthias Kaehlcke (cherry picked from commit 7dd47b95b0f54f2057d40af6e66d477e3fe95d13) --- scripts/Makefile.lib | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 530fae970c0a..c285493e89b5 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -421,7 +421,6 @@ cmd_xzmisc = (cat $(filter-out FORCE,$^) | \ # Default sed regexp - multiline due to syntax constraints define sed-offsets "/^->/{s:->#\(.*\):/* \1 */:; \ - s:^->\([^ ]*\) [\$$#]*\([-0-9]*\) \(.*\):#define \1 \2 /* \3 */:; \ s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; \ s:->::; p;}" endef From ba790f583c30575ec1705286407f213fddd03ccf Mon Sep 17 00:00:00 2001 From: Jeroen Hofstee Date: Fri, 21 Apr 2017 15:21:11 +0900 Subject: [PATCH 370/555] UPSTREAM: kbuild: fix asm-offset generation to work with clang KBuild abuses the asm statement to write to a file and clang chokes about these invalid asm statements. Hack it even more by fooling this is actual valid asm code. [masahiro: Import Jeroen's work for U-Boot: http://patchwork.ozlabs.org/patch/375026/ Tweak sed script a little to avoid garbage '#' for GCC case, like #define NR_PAGEFLAGS 23 /* __NR_PAGEFLAGS # */ ] Signed-off-by: Jeroen Hofstee Signed-off-by: Masahiro Yamada Reviewed-by: Matthias Kaehlcke Tested-by: Matthias Kaehlcke (cherry picked from commit cf0c3e68aa81f992b0301f62e341b710d385bf68) --- include/linux/kbuild.h | 6 +++--- scripts/Makefile.lib | 8 ++++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/include/linux/kbuild.h b/include/linux/kbuild.h index 22a72198c14b..4e80f3a9ad58 100644 --- a/include/linux/kbuild.h +++ b/include/linux/kbuild.h @@ -2,14 +2,14 @@ #define __LINUX_KBUILD_H #define DEFINE(sym, val) \ - asm volatile("\n->" #sym " %0 " #val : : "i" (val)) + asm volatile("\n.ascii \"->" #sym " %0 " #val "\"" : : "i" (val)) -#define BLANK() asm volatile("\n->" : : ) +#define BLANK() asm volatile("\n.ascii \"->\"" : : ) #define OFFSET(sym, str, mem) \ DEFINE(sym, offsetof(struct str, mem)) #define COMMENT(x) \ - asm volatile("\n->#" x) + asm volatile("\n.ascii \"->#" x "\"") #endif diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index c285493e89b5..9e447b8497c6 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -419,10 +419,14 @@ cmd_xzmisc = (cat $(filter-out FORCE,$^) | \ # --------------------------------------------------------------------------- # Default sed regexp - multiline due to syntax constraints +# +# Use [:space:] because LLVM's integrated assembler inserts around +# the .ascii directive whereas GCC keeps the as-is. define sed-offsets - "/^->/{s:->#\(.*\):/* \1 */:; \ + 's:^[[:space:]]*\.ascii[[:space:]]*"\(.*\)".*:\1:; \ + /^->/{s:->#\(.*\):/* \1 */:; \ s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; \ - s:->::; p;}" + s:->::; p;}' endef # Use filechk to avoid rebuilds when a header changes, but the resulting file From f9e396078be8a9ddf29db5e4c09cccf30716ef84 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 13 Apr 2017 07:25:21 +0900 Subject: [PATCH 371/555] UPSTREAM: kbuild: drop -Wno-unknown-warning-option from clang options Since commit c3f0d0bc5b01 ("kbuild, LLVMLinux: Add -Werror to cc-option to support clang"), cc-option and friends work nicely for clang. However, -Wno-unknown-warning-option makes clang happy with any unknown warning options even if -Werror is specified. Once -Wno-unknown-warning-option is added, any succeeding call of cc-disable-warning is evaluated positive, then unknown warning options are accepted. This should be dropped. Signed-off-by: Masahiro Yamada (cherry picked from commit a0ae981eba8f07dbc74bce38fd3a462b69a5bc8e) --- Makefile | 1 - scripts/Makefile.extrawarn | 1 - 2 files changed, 2 deletions(-) diff --git a/Makefile b/Makefile index 7560f0c03526..f048e2929507 100644 --- a/Makefile +++ b/Makefile @@ -709,7 +709,6 @@ endif KBUILD_CFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC) KBUILD_AFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC) KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,) -KBUILD_CPPFLAGS += $(call cc-option,-Wno-unknown-warning-option,) KBUILD_CFLAGS += $(call cc-disable-warning, unused-variable) KBUILD_CFLAGS += $(call cc-disable-warning, format-invalid-specifier) KBUILD_CFLAGS += $(call cc-disable-warning, gnu) diff --git a/scripts/Makefile.extrawarn b/scripts/Makefile.extrawarn index 7c321a603b07..fb3522fd8702 100644 --- a/scripts/Makefile.extrawarn +++ b/scripts/Makefile.extrawarn @@ -64,7 +64,6 @@ ifeq ($(cc-name),clang) KBUILD_CFLAGS += $(call cc-disable-warning, initializer-overrides) KBUILD_CFLAGS += $(call cc-disable-warning, unused-value) KBUILD_CFLAGS += $(call cc-disable-warning, format) -KBUILD_CFLAGS += $(call cc-disable-warning, unknown-warning-option) KBUILD_CFLAGS += $(call cc-disable-warning, sign-compare) KBUILD_CFLAGS += $(call cc-disable-warning, format-zero-length) KBUILD_CFLAGS += $(call cc-disable-warning, uninitialized) From 70de9b20fa2a48c5c2bc4a5c68e2a22240a07650 Mon Sep 17 00:00:00 2001 From: Mark Charlebois Date: Fri, 31 Mar 2017 22:38:13 +0200 Subject: [PATCH 372/555] UPSTREAM: kbuild, LLVMLinux: Add -Werror to cc-option to support clang MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clang will warn about unknown warnings but will not return false unless -Werror is set. GCC will return false if an unknown warning is passed. Adding -Werror make both compiler behave the same. [arnd: it turns out we need the same patch for testing whether -ffunction-sections works right with gcc. I've build tested extensively with this patch applied, so let's just merge this one now.] Signed-off-by: Mark Charlebois Signed-off-by: Behan Webster Reviewed-by: Jan-Simon Möller Signed-off-by: Arnd Bergmann Acked-by: Kees Cook Signed-off-by: Masahiro Yamada (cherry picked from commit c3f0d0bc5b01ad90c45276952802455750444b4f) --- scripts/Kbuild.include | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include index 179219845dfc..715838093ba1 100644 --- a/scripts/Kbuild.include +++ b/scripts/Kbuild.include @@ -116,12 +116,12 @@ CC_OPTION_CFLAGS = $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS)) # Usage: cflags-y += $(call cc-option,-march=winchip-c6,-march=i586) cc-option = $(call try-run,\ - $(CC) $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS) $(1) -c -x c /dev/null -o "$$TMP",$(1),$(2)) + $(CC) -Werror $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS) $(1) -c -x c /dev/null -o "$$TMP",$(1),$(2)) # cc-option-yn # Usage: flag := $(call cc-option-yn,-march=winchip-c6) cc-option-yn = $(call try-run,\ - $(CC) $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS) $(1) -c -x c /dev/null -o "$$TMP",y,n) + $(CC) -Werror $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS) $(1) -c -x c /dev/null -o "$$TMP",y,n) # cc-option-align # Prefix align with either -falign or -malign @@ -131,7 +131,7 @@ cc-option-align = $(subst -functions=0,,\ # cc-disable-warning # Usage: cflags-y += $(call cc-disable-warning,unused-but-set-variable) cc-disable-warning = $(call try-run,\ - $(CC) $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS) -W$(strip $(1)) -c -x c /dev/null -o "$$TMP",-Wno-$(strip $(1))) + $(CC) -Werror $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS) -W$(strip $(1)) -c -x c /dev/null -o "$$TMP",-Wno-$(strip $(1))) # cc-name # Expands to either gcc or clang From 5db0ac53e751f236c065323c503445ce6c9a5177 Mon Sep 17 00:00:00 2001 From: Behan Webster Date: Mon, 27 Mar 2017 18:19:09 -0700 Subject: [PATCH 373/555] UPSTREAM: kbuild: use -Oz instead of -Os when using clang This generates smaller resulting object code when compiled with clang. Signed-off-by: Behan Webster Signed-off-by: Matthias Kaehlcke Signed-off-by: Masahiro Yamada (cherry picked from commit 6748cb3c299de1ffbe56733647b01dbcc398c419) --- Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index f048e2929507..cfb3289941c7 100644 --- a/Makefile +++ b/Makefile @@ -639,7 +639,8 @@ KBUILD_CFLAGS += $(call cc-option,-fdata-sections,) endif ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE -KBUILD_CFLAGS += -Os $(call cc-disable-warning,maybe-uninitialized,) +KBUILD_CFLAGS += $(call cc-option,-Oz,-Os) +KBUILD_CFLAGS += $(call cc-disable-warning,maybe-uninitialized,) else ifdef CONFIG_PROFILE_ALL_BRANCHES KBUILD_CFLAGS += -O2 $(call cc-disable-warning,maybe-uninitialized,) From 30c02b6d50d7c1714a8df450f5eadf68a24a997a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vin=C3=ADcius=20Tinti?= Date: Mon, 24 Apr 2017 13:04:58 -0700 Subject: [PATCH 374/555] UPSTREAM: kbuild: Add support to generate LLVM assembly files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add rules to kbuild in order to generate LLVM assembly files with the .ll extension when using clang. # from c code make CC=clang kernel/pid.ll Signed-off-by: Vinícius Tinti Signed-off-by: Behan Webster Signed-off-by: Matthias Kaehlcke Signed-off-by: Masahiro Yamada (cherry picked from commit 433db3e260bc8134d4a46ddf20b3668937e12556) --- .gitignore | 1 + Makefile | 5 +++++ scripts/Makefile.build | 8 ++++++++ 3 files changed, 14 insertions(+) diff --git a/.gitignore b/.gitignore index d47ecbb26d72..4105cfbd6f26 100644 --- a/.gitignore +++ b/.gitignore @@ -33,6 +33,7 @@ *.lzo *.patch *.gcno +*.ll modules.builtin Module.symvers *.dwo diff --git a/Makefile b/Makefile index cfb3289941c7..300f01c80c11 100644 --- a/Makefile +++ b/Makefile @@ -1373,6 +1373,8 @@ help: @echo ' (default: $$(INSTALL_MOD_PATH)/lib/firmware)' @echo ' dir/ - Build all files in dir and below' @echo ' dir/file.[ois] - Build specified target only' + @echo ' dir/file.ll - Build the LLVM assembly file' + @echo ' (requires compiler support for LLVM assembly generation)' @echo ' dir/file.lst - Build specified mixed source/assembly target only' @echo ' (requires a recent binutils and recent build (System.map))' @echo ' dir/file.ko - Build module including final link' @@ -1557,6 +1559,7 @@ clean: $(clean-dirs) -o -name '*.symtypes' -o -name 'modules.order' \ -o -name modules.builtin -o -name '.tmp_*.o.*' \ -o -name '*.c.[012]*.*' \ + -o -name '*.ll' \ -o -name '*.gcno' \) -type f -print | xargs rm -f # Generate tags for editors @@ -1660,6 +1663,8 @@ endif $(Q)$(MAKE) $(build)=$(build-dir) $(target-dir)$(notdir $@) %.symtypes: %.c prepare scripts FORCE $(Q)$(MAKE) $(build)=$(build-dir) $(target-dir)$(notdir $@) +%.ll: %.c prepare scripts FORCE + $(Q)$(MAKE) $(build)=$(build-dir) $(target-dir)$(notdir $@) # Modules /: prepare scripts FORCE diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 7675d11ee65e..3e3efd37ba47 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -176,6 +176,14 @@ cmd_cc_symtypes_c = \ $(obj)/%.symtypes : $(src)/%.c FORCE $(call cmd,cc_symtypes_c) +# LLVM assembly +# Generate .ll files from .c +quiet_cmd_cc_ll_c = CC $(quiet_modtag) $@ + cmd_cc_ll_c = $(CC) $(c_flags) -emit-llvm -S -o $@ $< + +$(obj)/%.ll: $(src)/%.c FORCE + $(call if_changed_dep,cc_ll_c) + # C (.c) files # The C file is compiled and updated dependency information is generated. # (See cmd_cc_o_c + relevant part of rule_cc_o_c) From 2fe1b761fa48d8e62aec6cd4598982a5e8f215cd Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 1 Feb 2017 18:00:14 +0100 Subject: [PATCH 375/555] UPSTREAM: modules: mark __inittest/__exittest as __maybe_unused clang warns about unused inline functions by default: arch/arm/crypto/aes-cipher-glue.c:68:1: warning: unused function '__inittest' [-Wunused-function] arch/arm/crypto/aes-cipher-glue.c:69:1: warning: unused function '__exittest' [-Wunused-function] As these appear in every single module, let's just disable the warnings by marking the two functions as __maybe_unused. Signed-off-by: Arnd Bergmann Reviewed-by: Miroslav Benes Acked-by: Rusty Russell Signed-off-by: Jessica Yu (cherry picked from commit 1f318a8bafcfba9f0d623f4870c4e890fd22e659) --- include/linux/module.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/module.h b/include/linux/module.h index 0c3207d26ac0..1226280be63b 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -127,13 +127,13 @@ extern void cleanup_module(void); /* Each module must use one module_init(). */ #define module_init(initfn) \ - static inline initcall_t __inittest(void) \ + static inline initcall_t __maybe_unused __inittest(void) \ { return initfn; } \ int init_module(void) __attribute__((alias(#initfn))); /* This is only required if you want to be unloadable. */ #define module_exit(exitfn) \ - static inline exitcall_t __exittest(void) \ + static inline exitcall_t __maybe_unused __exittest(void) \ { return exitfn; } \ void cleanup_module(void) __attribute__((alias(#exitfn))); From f0f3d6753fc45096255001666fce5dfb8a376109 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 6 Jun 2017 13:36:24 -0700 Subject: [PATCH 376/555] UPSTREAM: compiler, clang: suppress warning for unused static inline functions GCC explicitly does not warn for unused static inline functions for -Wunused-function. The manual states: Warn whenever a static function is declared but not defined or a non-inline static function is unused. Clang does warn for static inline functions that are unused. It turns out that suppressing the warnings avoids potentially complex Suppress the warning for clang. Signed-off-by: David Rientjes Signed-off-by: Linus Torvalds (cherry picked from commit abb2ea7dfd82451d85ce669b811310c05ab5ca46) --- include/linux/compiler-clang.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index de179993e039..ea9126006a69 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -15,3 +15,10 @@ * with any version that can compile the kernel */ #define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) + +/* + * GCC does not warn about unused static inline functions for + * -Wunused-function. This turns out to avoid the need for complex #ifdef + * directives. Suppress the warning in clang as well. + */ +#define inline inline __attribute__((unused)) From 27836ce36ad7106bfabb7d48b6ac4053bbd29001 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 11 Jun 2017 15:51:56 -0700 Subject: [PATCH 377/555] UPSTREAM: compiler, clang: properly override 'inline' for clang Commit abb2ea7dfd82 ("compiler, clang: suppress warning for unused static inline functions") just caused more warnings due to re-defining the 'inline' macro. So undef it before re-defining it, and also add the 'notrace' attribute like the gcc version that this is overriding does. Maybe this makes clang happier. Signed-off-by: Linus Torvalds (cherry picked from commit 6d53cefb18e4646fb4bf62ccb6098fb3808486df) --- include/linux/compiler-clang.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index ea9126006a69..d614c5ea1b5e 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -21,4 +21,5 @@ * -Wunused-function. This turns out to avoid the need for complex #ifdef * directives. Suppress the warning in clang as well. */ -#define inline inline __attribute__((unused)) +#undef inline +#define inline inline __attribute__((unused)) notrace From d89051f0fe6bc8bba2d6e273c8f9e45fbbea6e2b Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Thu, 13 Apr 2017 10:26:09 -0700 Subject: [PATCH 378/555] UPSTREAM: x86/kbuild: Use cc-option to enable -falign-{jumps/loops} clang currently does not support these optimizations, only enable them when they are available. Signed-off-by: Matthias Kaehlcke Cc: Greg Hackmann Cc: Linus Torvalds Cc: Masahiro Yamada Cc: Michael Davidson Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: grundler@chromium.org Link: http://lkml.kernel.org/r/20170413172609.118122-1-mka@chromium.org Signed-off-by: Ingo Molnar (cherry picked from commit 2c4fd1ac3ff167c91272dc43c7bfd2269ef61557) --- arch/x86/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 75725dc4df7a..af0735ec2ef8 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -88,10 +88,10 @@ else KBUILD_CFLAGS += -m64 # Align jump targets to 1 byte, not the default 16 bytes: - KBUILD_CFLAGS += -falign-jumps=1 + KBUILD_CFLAGS += $(call cc-option,-falign-jumps=1) # Pack loops tightly as well: - KBUILD_CFLAGS += -falign-loops=1 + KBUILD_CFLAGS += $(call cc-option,-falign-loops=1) # Don't autogenerate traditional x87 instructions KBUILD_CFLAGS += $(call cc-option,-mno-80387) From 19e1bc3d3b1ac206402f5c03476d5355c0f00024 Mon Sep 17 00:00:00 2001 From: Michael Davidson Date: Wed, 15 Mar 2017 15:36:00 -0700 Subject: [PATCH 379/555] UPSTREAM: crypto, x86: aesni - fix token pasting for clang aes_ctrby8_avx-x86_64.S uses the C preprocessor for token pasting of character sequences that are not valid preprocessor tokens. While this is allowed when preprocessing assembler files it exposes an incompatibilty between the clang and gcc preprocessors where clang does not strip leading white space from macro parameters, leading to the CONCAT(%xmm, i) macro expansion on line 96 resulting in a token with a space character embedded in it. While this could be resolved by deleting the offending space character, the assembler is perfectly capable of doing the token pasting correctly for itself so we can just get rid of the preprocessor macros. Signed-off-by: Michael Davidson Signed-off-by: Herbert Xu (cherry picked from commit fdb2726f4e61c5e3abc052f547d5a5f6c0dc5504) --- arch/x86/crypto/aes_ctrby8_avx-x86_64.S | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S index a916c4a61165..5f6a5af9c489 100644 --- a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S +++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S @@ -65,7 +65,6 @@ #include #include -#define CONCAT(a,b) a##b #define VMOVDQ vmovdqu #define xdata0 %xmm0 @@ -92,8 +91,6 @@ #define num_bytes %r8 #define tmp %r10 -#define DDQ(i) CONCAT(ddq_add_,i) -#define XMM(i) CONCAT(%xmm, i) #define DDQ_DATA 0 #define XDATA 1 #define KEY_128 1 @@ -131,12 +128,12 @@ ddq_add_8: /* generate a unique variable for ddq_add_x */ .macro setddq n - var_ddq_add = DDQ(\n) + var_ddq_add = ddq_add_\n .endm /* generate a unique variable for xmm register */ .macro setxdata n - var_xdata = XMM(\n) + var_xdata = %xmm\n .endm /* club the numeric 'id' to the symbol 'name' */ From a79b53cc043bd76038097706bfdd21c8af161c44 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Mon, 1 May 2017 15:47:41 -0700 Subject: [PATCH 380/555] UPSTREAM: x86/mm/kaslr: Use the _ASM_MUL macro for multiplication to work around Clang incompatibility The constraint "rm" allows the compiler to put mix_const into memory. When the input operand is a memory location then MUL needs an operand size suffix, since Clang can't infer the multiplication width from the operand. Add and use the _ASM_MUL macro which determines the operand size and resolves to the NUL instruction with the corresponding suffix. This fixes the following error when building with clang: CC arch/x86/lib/kaslr.o /tmp/kaslr-dfe1ad.s: Assembler messages: /tmp/kaslr-dfe1ad.s:182: Error: no instruction mnemonic suffix given and no register operands; can't size instruction Signed-off-by: Matthias Kaehlcke Cc: Grant Grundler Cc: Greg Hackmann Cc: Kees Cook Cc: Linus Torvalds Cc: Michael Davidson Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170501224741.133938-1-mka@chromium.org Signed-off-by: Ingo Molnar (cherry picked from commit 121843eb02a6e2fa30aefab64bfe183c97230c75) --- arch/x86/include/asm/asm.h | 1 + arch/x86/lib/kaslr.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 7acb51c49fec..7a9df3beb89b 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -32,6 +32,7 @@ #define _ASM_ADD __ASM_SIZE(add) #define _ASM_SUB __ASM_SIZE(sub) #define _ASM_XADD __ASM_SIZE(xadd) +#define _ASM_MUL __ASM_SIZE(mul) #define _ASM_AX __ASM_REG(ax) #define _ASM_BX __ASM_REG(bx) diff --git a/arch/x86/lib/kaslr.c b/arch/x86/lib/kaslr.c index 121f59c6ee54..0c7fe444dcdd 100644 --- a/arch/x86/lib/kaslr.c +++ b/arch/x86/lib/kaslr.c @@ -5,6 +5,7 @@ * kernel starts. This file is included in the compressed kernel and * normally linked in the regular. */ +#include #include #include #include @@ -79,7 +80,7 @@ unsigned long kaslr_get_random_long(const char *purpose) } /* Circular multiply for better bit diffusion */ - asm("mul %3" + asm(_ASM_MUL "%3" : "=a" (random), "=d" (raw) : "a" (random), "rm" (mix_const)); random += raw; From 00eea2fac0720f51db40c6beb37aa9e4baa39802 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Wed, 21 Jun 2017 16:28:03 -0700 Subject: [PATCH 381/555] UPSTREAM: kbuild: Add __cc-option macro cc-option uses KBUILD_CFLAGS and KBUILD_CPPFLAGS when it determines whether an option is supported or not. This is fine for options used to build the kernel itself, however some components like the x86 boot code use a different set of flags. Add the new macro __cc-option which is a more generic version of cc-option with additional parameters. One parameter is the compiler with which the check should be performed, the other the compiler options to be used instead KBUILD_C*FLAGS. Refactor cc-option and hostcc-option to use __cc-option and move hostcc-option to scripts/Kbuild.include. Suggested-by: Arnd Bergmann Suggested-by: Masahiro Yamada Signed-off-by: Matthias Kaehlcke Acked-by: Arnd Bergmann Acked-by: Michal Marek Signed-off-by: Masahiro Yamada (cherry picked from commit 9f3f1fd299768782465cb32cdf0dd4528d11f26b) --- Makefile | 2 +- scripts/Kbuild.include | 14 ++++++++++++-- scripts/Makefile.host | 6 ------ 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index 300f01c80c11..b0375795e953 100644 --- a/Makefile +++ b/Makefile @@ -301,7 +301,7 @@ CONFIG_SHELL := $(shell if [ -x "$$BASH" ]; then echo $$BASH; \ HOSTCC = gcc HOSTCXX = g++ -HOSTCFLAGS = -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 -fomit-frame-pointer -std=gnu89 +HOSTCFLAGS := -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 -fomit-frame-pointer -std=gnu89 HOSTCXXFLAGS = -O2 ifeq ($(shell $(HOSTCC) -v 2>&1 | grep -c "clang version"), 1) diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include index 715838093ba1..140a0fa24f84 100644 --- a/scripts/Kbuild.include +++ b/scripts/Kbuild.include @@ -108,6 +108,11 @@ as-option = $(call try-run,\ as-instr = $(call try-run,\ printf "%b\n" "$(1)" | $(CC) $(KBUILD_AFLAGS) -c -x assembler -o "$$TMP" -,$(2),$(3)) +# __cc-option +# Usage: MY_CFLAGS += $(call __cc-option,$(CC),$(MY_CFLAGS),-march=winchip-c6,-march=i586) +__cc-option = $(call try-run,\ + $(1) -Werror $(2) $(3) -c -x c /dev/null -o "$$TMP",$(3),$(4)) + # Do not attempt to build with gcc plugins during cc-option tests. # (And this uses delayed resolution so the flags will be up to date.) CC_OPTION_CFLAGS = $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS)) @@ -115,8 +120,13 @@ CC_OPTION_CFLAGS = $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS)) # cc-option # Usage: cflags-y += $(call cc-option,-march=winchip-c6,-march=i586) -cc-option = $(call try-run,\ - $(CC) -Werror $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS) $(1) -c -x c /dev/null -o "$$TMP",$(1),$(2)) +cc-option = $(call __cc-option, $(CC),\ + $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS),$(1),$(2)) + +# hostcc-option +# Usage: cflags-y += $(call hostcc-option,-march=winchip-c6,-march=i586) +hostcc-option = $(call __cc-option, $(HOSTCC),\ + $(HOSTCFLAGS) $(HOST_EXTRACFLAGS),$(1),$(2)) # cc-option-yn # Usage: flag := $(call cc-option-yn,-march=winchip-c6) diff --git a/scripts/Makefile.host b/scripts/Makefile.host index 45b5b1aaedbd..9cfd5c84d76f 100644 --- a/scripts/Makefile.host +++ b/scripts/Makefile.host @@ -20,12 +20,6 @@ # Will compile qconf as a C++ program, and menu as a C program. # They are linked as C++ code to the executable qconf -# hostcc-option -# Usage: cflags-y += $(call hostcc-option,-march=winchip-c6,-march=i586) - -hostcc-option = $(call try-run,\ - $(HOSTCC) $(HOSTCFLAGS) $(HOST_EXTRACFLAGS) $(1) -c -x c /dev/null -o "$$TMP",$(1),$(2)) - __hostprogs := $(sort $(hostprogs-y) $(hostprogs-m)) host-cshlib := $(sort $(hostlibs-y) $(hostlibs-m)) host-cxxshlib := $(sort $(hostcxxlibs-y) $(hostcxxlibs-m)) From a736f347ef2231ac32a6065ff6066cc5da5e8aa9 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Wed, 21 Jun 2017 16:28:04 -0700 Subject: [PATCH 382/555] UPSTREAM: x86/build: Use __cc-option for boot code compiler options cc-option is used to enable compiler options for the boot code if they are available. The macro uses KBUILD_CFLAGS and KBUILD_CPPFLAGS for the check, however these flags aren't used to build the boot code, in consequence cc-option can yield wrong results. For example -mpreferred-stack-boundary=2 is never set with a 64-bit compiler, since the setting is only valid for 16 and 32-bit binaries. This is also the case for 32-bit kernel builds, because the option -m32 is added to KBUILD_CFLAGS after the assignment of REALMODE_CFLAGS. Use __cc-option instead of cc-option for the boot mode options. The macro receives the compiler options as parameter instead of using KBUILD_C*FLAGS, for the boot code we pass REALMODE_CFLAGS. Also use separate statements for the __cc-option checks instead of performing them in the initial assignment of REALMODE_CFLAGS since the variable is an input of the macro. Signed-off-by: Matthias Kaehlcke Acked-by: Ingo Molnar Signed-off-by: Masahiro Yamada (cherry picked from commit 032a2c4f65a2f81c93e161a11197ba19bc14a909) --- arch/x86/Makefile | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index af0735ec2ef8..1019233148b9 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -24,10 +24,11 @@ REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -D__KERNEL__ \ -DDISABLE_BRANCH_PROFILING \ -Wall -Wstrict-prototypes -march=i386 -mregparm=3 \ -fno-strict-aliasing -fomit-frame-pointer -fno-pic \ - -mno-mmx -mno-sse \ - $(call cc-option, -ffreestanding) \ - $(call cc-option, -fno-stack-protector) \ - $(call cc-option, -mpreferred-stack-boundary=2) + -mno-mmx -mno-sse + +REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -ffreestanding) +REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -fno-stack-protector) +REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -mpreferred-stack-boundary=2) export REALMODE_CFLAGS # BITS is used as extension for files which are available in a 32 bit From cc7bf5b9693203cbd9b64d89ae18b44dc7fb3c5f Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Wed, 21 Jun 2017 16:28:05 -0700 Subject: [PATCH 383/555] UPSTREAM: x86/build: Specify stack alignment for clang For gcc stack alignment is configured with -mpreferred-stack-boundary=N, clang has the option -mstack-alignment=N for that purpose. Use the same alignment as with gcc. If the alignment is not specified clang assumes an alignment of 16 bytes, as required by the standard ABI. However as mentioned in d9b0cde91c60 ("x86-64, gcc: Use -mpreferred-stack-boundary=3 if supported") the standard kernel entry on x86-64 leaves the stack on an 8-byte boundary, as a consequence clang will keep the stack misaligned. Signed-off-by: Matthias Kaehlcke Acked-by: Ingo Molnar Signed-off-by: Masahiro Yamada (cherry picked from commit d77698df39a512911586834d303275ea5fda74d0) Change-Id: If47392e9cb5cfbb18e06be4867a5b4903aa37301 --- arch/x86/Makefile | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 1019233148b9..7192e539a5ad 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -11,6 +11,14 @@ else KBUILD_DEFCONFIG := $(ARCH)_defconfig endif +# For gcc stack alignment is specified with -mpreferred-stack-boundary, +# clang has the option -mstack-alignment for that purpose. +ifneq ($(call cc-option, -mpreferred-stack-boundary=4),) + cc_stack_align_opt := -mpreferred-stack-boundary +else ifneq ($(call cc-option, -mstack-alignment=4),) + cc_stack_align_opt := -mstack-alignment +endif + # How to compile the 16-bit code. Note we always compile for -march=i386; # that way we can complain to the user if the CPU is insufficient. # @@ -28,7 +36,7 @@ REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -D__KERNEL__ \ REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -ffreestanding) REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -fno-stack-protector) -REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -mpreferred-stack-boundary=2) +REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align_opt)=2) export REALMODE_CFLAGS # BITS is used as extension for files which are available in a 32 bit @@ -65,8 +73,10 @@ ifeq ($(CONFIG_X86_32),y) # with nonstandard options KBUILD_CFLAGS += -fno-pic - # prevent gcc from keeping the stack 16 byte aligned - KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2) + # Align the stack to the register width instead of using the default + # alignment of 16 bytes. This reduces stack usage and the number of + # alignment instructions. + KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align_opt)=2) # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use # a lot more stack due to the lack of sharing of stacklots: @@ -100,8 +110,14 @@ else KBUILD_CFLAGS += -fno-pic - # Use -mpreferred-stack-boundary=3 if supported. - KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=3) + # By default gcc and clang use a stack alignment of 16 bytes for x86. + # However the standard kernel entry on x86-64 leaves the stack on an + # 8-byte boundary. If the compiler isn't informed about the actual + # alignment it will generate extra alignment instructions for the + # default alignment which keep the stack *mis*aligned. + # Furthermore an alignment to the register width reduces stack usage + # and the number of alignment instructions. + KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align_opt)=3) # Use -mskip-rax-setup if supported. KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup) From fc7c5de49793f9ba8ed0a936d074b3deba2f57cb Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Fri, 21 Apr 2017 14:39:30 -0700 Subject: [PATCH 384/555] UPSTREAM: kbuild: clang: Disable 'address-of-packed-member' warning clang generates plenty of these warnings in different parts of the code, to an extent that the warnings are little more than noise. Disable the 'address-of-packed-member' warning. Signed-off-by: Matthias Kaehlcke Reviewed-by: Douglas Anderson Signed-off-by: Masahiro Yamada (cherry picked from commit bfb38988c51e440fd7062ddf3157f7d8b1dd5d70) --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index b0375795e953..9fab6a0a5db0 100644 --- a/Makefile +++ b/Makefile @@ -713,6 +713,7 @@ KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,) KBUILD_CFLAGS += $(call cc-disable-warning, unused-variable) KBUILD_CFLAGS += $(call cc-disable-warning, format-invalid-specifier) KBUILD_CFLAGS += $(call cc-disable-warning, gnu) +KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) # Quiet clang warning: comparison of unsigned expression < 0 is always false KBUILD_CFLAGS += $(call cc-disable-warning, tautological-compare) # CLANG uses a _MergedGlobals as optimization, but this breaks modpost, as the From e2b2ca94b0e101b3f24d82a1934151f29b738121 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 26 Apr 2017 17:11:32 +0100 Subject: [PATCH 385/555] UPSTREAM: crypto: arm64/sha - avoid non-standard inline asm tricks Replace the inline asm which exports struct offsets as ELF symbols with proper const variables exposing the same values. This works around an issue with Clang which does not interpret the "i" (or "I") constraints in the same way as GCC. Signed-off-by: Ard Biesheuvel Tested-by: Matthias Kaehlcke Signed-off-by: Herbert Xu (cherry picked from commit f4857f4c2ee9aa4e2aacac1a845352b00197fb57) --- arch/arm64/crypto/sha1-ce-core.S | 6 ++++-- arch/arm64/crypto/sha1-ce-glue.c | 11 +++-------- arch/arm64/crypto/sha2-ce-core.S | 6 ++++-- arch/arm64/crypto/sha2-ce-glue.c | 13 +++++-------- 4 files changed, 16 insertions(+), 20 deletions(-) diff --git a/arch/arm64/crypto/sha1-ce-core.S b/arch/arm64/crypto/sha1-ce-core.S index c98e7e849f06..8550408735a0 100644 --- a/arch/arm64/crypto/sha1-ce-core.S +++ b/arch/arm64/crypto/sha1-ce-core.S @@ -82,7 +82,8 @@ ENTRY(sha1_ce_transform) ldr dgb, [x0, #16] /* load sha1_ce_state::finalize */ - ldr w4, [x0, #:lo12:sha1_ce_offsetof_finalize] + ldr_l w4, sha1_ce_offsetof_finalize, x4 + ldr w4, [x0, x4] /* load input */ 0: ld1 {v8.4s-v11.4s}, [x1], #64 @@ -132,7 +133,8 @@ CPU_LE( rev32 v11.16b, v11.16b ) * the padding is handled by the C code in that case. */ cbz x4, 3f - ldr x4, [x0, #:lo12:sha1_ce_offsetof_count] + ldr_l w4, sha1_ce_offsetof_count, x4 + ldr x4, [x0, x4] movi v9.2d, #0 mov x8, #0x80000000 movi v10.2d, #0 diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c index aefda9868627..ea319c055f5d 100644 --- a/arch/arm64/crypto/sha1-ce-glue.c +++ b/arch/arm64/crypto/sha1-ce-glue.c @@ -17,9 +17,6 @@ #include #include -#define ASM_EXPORT(sym, val) \ - asm(".globl " #sym "; .set " #sym ", %0" :: "I"(val)); - MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions"); MODULE_AUTHOR("Ard Biesheuvel "); MODULE_LICENSE("GPL v2"); @@ -32,6 +29,9 @@ struct sha1_ce_state { asmlinkage void sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src, int blocks); +const u32 sha1_ce_offsetof_count = offsetof(struct sha1_ce_state, sst.count); +const u32 sha1_ce_offsetof_finalize = offsetof(struct sha1_ce_state, finalize); + static int sha1_ce_update(struct shash_desc *desc, const u8 *data, unsigned int len) { @@ -52,11 +52,6 @@ static int sha1_ce_finup(struct shash_desc *desc, const u8 *data, struct sha1_ce_state *sctx = shash_desc_ctx(desc); bool finalize = !sctx->sst.count && !(len % SHA1_BLOCK_SIZE); - ASM_EXPORT(sha1_ce_offsetof_count, - offsetof(struct sha1_ce_state, sst.count)); - ASM_EXPORT(sha1_ce_offsetof_finalize, - offsetof(struct sha1_ce_state, finalize)); - /* * Allow the asm code to perform the finalization if there is no * partial data and the input is a round multiple of the block size. diff --git a/arch/arm64/crypto/sha2-ce-core.S b/arch/arm64/crypto/sha2-ce-core.S index 01cfee066837..679c6c002f4f 100644 --- a/arch/arm64/crypto/sha2-ce-core.S +++ b/arch/arm64/crypto/sha2-ce-core.S @@ -88,7 +88,8 @@ ENTRY(sha2_ce_transform) ld1 {dgav.4s, dgbv.4s}, [x0] /* load sha256_ce_state::finalize */ - ldr w4, [x0, #:lo12:sha256_ce_offsetof_finalize] + ldr_l w4, sha256_ce_offsetof_finalize, x4 + ldr w4, [x0, x4] /* load input */ 0: ld1 {v16.4s-v19.4s}, [x1], #64 @@ -136,7 +137,8 @@ CPU_LE( rev32 v19.16b, v19.16b ) * the padding is handled by the C code in that case. */ cbz x4, 3f - ldr x4, [x0, #:lo12:sha256_ce_offsetof_count] + ldr_l w4, sha256_ce_offsetof_count, x4 + ldr x4, [x0, x4] movi v17.2d, #0 mov x8, #0x80000000 movi v18.2d, #0 diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c index 7cd587564a41..0ed9486f75dd 100644 --- a/arch/arm64/crypto/sha2-ce-glue.c +++ b/arch/arm64/crypto/sha2-ce-glue.c @@ -17,9 +17,6 @@ #include #include -#define ASM_EXPORT(sym, val) \ - asm(".globl " #sym "; .set " #sym ", %0" :: "I"(val)); - MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash using ARMv8 Crypto Extensions"); MODULE_AUTHOR("Ard Biesheuvel "); MODULE_LICENSE("GPL v2"); @@ -32,6 +29,11 @@ struct sha256_ce_state { asmlinkage void sha2_ce_transform(struct sha256_ce_state *sst, u8 const *src, int blocks); +const u32 sha256_ce_offsetof_count = offsetof(struct sha256_ce_state, + sst.count); +const u32 sha256_ce_offsetof_finalize = offsetof(struct sha256_ce_state, + finalize); + static int sha256_ce_update(struct shash_desc *desc, const u8 *data, unsigned int len) { @@ -52,11 +54,6 @@ static int sha256_ce_finup(struct shash_desc *desc, const u8 *data, struct sha256_ce_state *sctx = shash_desc_ctx(desc); bool finalize = !sctx->sst.count && !(len % SHA256_BLOCK_SIZE); - ASM_EXPORT(sha256_ce_offsetof_count, - offsetof(struct sha256_ce_state, sst.count)); - ASM_EXPORT(sha256_ce_offsetof_finalize, - offsetof(struct sha256_ce_state, finalize)); - /* * Allow the asm code to perform the finalization if there is no * partial data and the input is a round multiple of the block size. From ff0a112436e16a2e73a9d227ae2ab52dc0362c30 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Wed, 19 Jul 2017 20:27:30 +0200 Subject: [PATCH 386/555] UPSTREAM: llist: clang: introduce member_address_is_nonnull() Currently llist_for_each_entry() and llist_for_each_entry_safe() iterate until &pos->member != NULL. But when building the kernel with Clang, the compiler assumes &pos->member cannot be NULL if the member's offset is greater than 0 (which would be equivalent to the object being non-contiguous in memory). Therefore the loop condition is always true, and the loops become infinite. To work around this, introduce the member_address_is_nonnull() macro, which casts object pointer to uintptr_t, thus letting the member pointer to be NULL. Signed-off-by: Alexander Potapenko Tested-by: Sodagudi Prasad Signed-off-by: Linus Torvalds (cherry picked from commit beaec533fc2701a28a4d667f67c9f59c6e4e0d13) --- include/linux/llist.h | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/include/linux/llist.h b/include/linux/llist.h index fd4ca0b4fe0f..ac6796138ba0 100644 --- a/include/linux/llist.h +++ b/include/linux/llist.h @@ -87,6 +87,23 @@ static inline void init_llist_head(struct llist_head *list) #define llist_entry(ptr, type, member) \ container_of(ptr, type, member) +/** + * member_address_is_nonnull - check whether the member address is not NULL + * @ptr: the object pointer (struct type * that contains the llist_node) + * @member: the name of the llist_node within the struct. + * + * This macro is conceptually the same as + * &ptr->member != NULL + * but it works around the fact that compilers can decide that taking a member + * address is never a NULL pointer. + * + * Real objects that start at a high address and have a member at NULL are + * unlikely to exist, but such pointers may be returned e.g. by the + * container_of() macro. + */ +#define member_address_is_nonnull(ptr, member) \ + ((uintptr_t)(ptr) + offsetof(typeof(*(ptr)), member) != 0) + /** * llist_for_each - iterate over some deleted entries of a lock-less list * @pos: the &struct llist_node to use as a loop cursor @@ -121,7 +138,7 @@ static inline void init_llist_head(struct llist_head *list) */ #define llist_for_each_entry(pos, node, member) \ for ((pos) = llist_entry((node), typeof(*(pos)), member); \ - &(pos)->member != NULL; \ + member_address_is_nonnull(pos, member); \ (pos) = llist_entry((pos)->member.next, typeof(*(pos)), member)) /** @@ -143,7 +160,7 @@ static inline void init_llist_head(struct llist_head *list) */ #define llist_for_each_entry_safe(pos, n, node, member) \ for (pos = llist_entry((node), typeof(*pos), member); \ - &pos->member != NULL && \ + member_address_is_nonnull(pos, member) && \ (n = llist_entry(pos->member.next, typeof(*n), member), true); \ pos = n) From 6b763a229af3363c4c7cfd8ccf3bbc33e935529e Mon Sep 17 00:00:00 2001 From: Michael Davidson Date: Mon, 24 Jul 2017 16:51:55 -0700 Subject: [PATCH 387/555] UPSTREAM: x86/boot: #undef memcpy() et al in string.c undef memcpy() and friends in boot/string.c so that the functions defined here will have the correct names, otherwise we end up up trying to redefine __builtin_memcpy() etc. Surprisingly, GCC allows this (and, helpfully, discards the __builtin_ prefix from the function name when compiling it), but clang does not. Adding these #undef's appears to preserve what I assume was the original intent of the code. (cherry picked from commit 18d5e6c34a8eda438d5ad8b3b15f42dab01bf05d) Signed-off-by: Michael Davidson Signed-off-by: Matthias Kaehlcke Acked-by: H. Peter Anvin Cc: Arnd Bergmann Cc: Bernhard.Rosenkranzer@linaro.org Cc: Greg Hackmann Cc: Kees Cook Cc: Linus Torvalds Cc: Nick Desaulniers Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170724235155.79255-1-mka@chromium.org Signed-off-by: Ingo Molnar --- arch/x86/boot/string.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index 9e240fcba784..08dfce02362c 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -16,6 +16,15 @@ #include "ctype.h" #include "string.h" +/* + * Undef these macros so that the functions that we provide + * here will have the correct names regardless of how string.h + * may have chosen to #define them. + */ +#undef memcpy +#undef memset +#undef memcmp + int memcmp(const void *s1, const void *s2, size_t len) { bool diff; From bb41d8f7424cab21d7919dc07647af6191401ea3 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 6 Jul 2017 15:35:24 -0700 Subject: [PATCH 388/555] UPSTREAM: compiler, clang: always inline when CONFIG_OPTIMIZE_INLINING is disabled The motivation for commit abb2ea7dfd82 ("compiler, clang: suppress warning for unused static inline functions") was to suppress clang's warnings about unused static inline functions. For configs without CONFIG_OPTIMIZE_INLINING enabled, such as any non-x86 architecture, `inline' in the kernel implies that __attribute__((always_inline)) is used. Some code depends on that behavior, see https://lkml.org/lkml/2017/6/13/918: net/built-in.o: In function `__xchg_mb': arch/arm64/include/asm/cmpxchg.h:99: undefined reference to `__compiletime_assert_99' arch/arm64/include/asm/cmpxchg.h:99: undefined reference to `__compiletime_assert_99 The full fix would be to identify these breakages and annotate the functions with __always_inline instead of `inline'. But since we are late in the 4.12-rc cycle, simply carry forward the forced inlining behavior and work toward moving arm64, and other architectures, toward CONFIG_OPTIMIZE_INLINING behavior. (cherry picked from commit 9a04dbcfb33b4012d0ce8c0282f1e3ca694675b1) Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1706261552200.1075@chino.kir.corp.google.com Signed-off-by: David Rientjes Reported-by: Sodagudi Prasad Tested-by: Sodagudi Prasad Tested-by: Matthias Kaehlcke Cc: Mark Rutland Cc: Will Deacon Cc: Catalin Marinas Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler-clang.h | 8 -------- include/linux/compiler-gcc.h | 18 +++++++++++------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index d614c5ea1b5e..de179993e039 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -15,11 +15,3 @@ * with any version that can compile the kernel */ #define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) - -/* - * GCC does not warn about unused static inline functions for - * -Wunused-function. This turns out to avoid the need for complex #ifdef - * directives. Suppress the warning in clang as well. - */ -#undef inline -#define inline inline __attribute__((unused)) notrace diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 928e5ca0caee..1659f98984fc 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -66,18 +66,22 @@ /* * Force always-inline if the user requests it so via the .config, - * or if gcc is too old: + * or if gcc is too old. + * GCC does not warn about unused static inline functions for + * -Wunused-function. This turns out to avoid the need for complex #ifdef + * directives. Suppress the warning in clang as well by using "unused" + * function attribute, which is redundant but not harmful for gcc. */ #if !defined(CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING) || \ !defined(CONFIG_OPTIMIZE_INLINING) || (__GNUC__ < 4) -#define inline inline __attribute__((always_inline)) notrace -#define __inline__ __inline__ __attribute__((always_inline)) notrace -#define __inline __inline __attribute__((always_inline)) notrace +#define inline inline __attribute__((always_inline,unused)) notrace +#define __inline__ __inline__ __attribute__((always_inline,unused)) notrace +#define __inline __inline __attribute__((always_inline,unused)) notrace #else /* A lot of inline functions can cause havoc with function tracing */ -#define inline inline notrace -#define __inline__ __inline__ notrace -#define __inline __inline notrace +#define inline inline __attribute__((unused)) notrace +#define __inline__ __inline__ __attribute__((unused)) notrace +#define __inline __inline __attribute__((unused)) notrace #endif #define __always_inline inline __attribute__((always_inline)) From 0e38949af667f2fd04a7959c0473a1c6b8a4a4d0 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Wed, 16 Aug 2017 17:47:40 -0700 Subject: [PATCH 389/555] UPSTREAM: x86/build: Fix stack alignment for CLang Commit: d77698df39a5 ("x86/build: Specify stack alignment for clang") intended to use the same stack alignment for clang as with gcc. The two compilers use different options to configure the stack alignment (gcc: -mpreferred-stack-boundary=n, clang: -mstack-alignment=n). The above commit assumes that the clang option uses the same parameter type as gcc, i.e. that the alignment is specified as 2^n. However clang interprets the value of this option literally to use an alignment of n, in consequence the stack remains misaligned. Change the values used with -mstack-alignment to be the actual alignment instead of a power of two. cc-option isn't used here with the typical pattern of KBUILD_CFLAGS += $(call cc-option ...). The reason is that older gcc versions don't support the -mpreferred-stack-boundary option, since cc-option doesn't verify whether the alternative option is valid it would incorrectly select the clang option -mstack-alignment.. Signed-off-by: Matthias Kaehlcke Cc: Arnd Bergmann Cc: Bernhard.Rosenkranzer@linaro.org Cc: Greg Hackmann Cc: Kees Cook Cc: Linus Torvalds Cc: Masahiro Yamada Cc: Michael Davidson Cc: Nick Desaulniers Cc: Peter Zijlstra Cc: Stephen Hines Cc: Thomas Gleixner Cc: dianders@chromium.org Link: http://lkml.kernel.org/r/20170817004740.170588-1-mka@chromium.org Signed-off-by: Ingo Molnar (cherry picked from commit 8f91869766c00622b2eaa8ee567db4f333b78c1a) Change-Id: I1a58d7fe64101b59f5264fefcce4fc593c2ebf2b --- arch/x86/Makefile | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 7192e539a5ad..1a366e49845d 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -14,9 +14,11 @@ endif # For gcc stack alignment is specified with -mpreferred-stack-boundary, # clang has the option -mstack-alignment for that purpose. ifneq ($(call cc-option, -mpreferred-stack-boundary=4),) - cc_stack_align_opt := -mpreferred-stack-boundary -else ifneq ($(call cc-option, -mstack-alignment=4),) - cc_stack_align_opt := -mstack-alignment + cc_stack_align4 := -mpreferred-stack-boundary=2 + cc_stack_align8 := -mpreferred-stack-boundary=3 +else ifneq ($(call cc-option, -mstack-alignment=16),) + cc_stack_align4 := -mstack-alignment=4 + cc_stack_align8 := -mstack-alignment=8 endif # How to compile the 16-bit code. Note we always compile for -march=i386; @@ -36,7 +38,7 @@ REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -D__KERNEL__ \ REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -ffreestanding) REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -fno-stack-protector) -REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align_opt)=2) +REALMODE_CFLAGS += $(cc_stack_align4) export REALMODE_CFLAGS # BITS is used as extension for files which are available in a 32 bit @@ -76,7 +78,7 @@ ifeq ($(CONFIG_X86_32),y) # Align the stack to the register width instead of using the default # alignment of 16 bytes. This reduces stack usage and the number of # alignment instructions. - KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align_opt)=2) + KBUILD_CFLAGS += $(cc_stack_align4) # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use # a lot more stack due to the lack of sharing of stacklots: @@ -117,7 +119,7 @@ else # default alignment which keep the stack *mis*aligned. # Furthermore an alignment to the register width reduces stack usage # and the number of alignment instructions. - KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align_opt)=3) + KBUILD_CFLAGS += $(cc_stack_align8) # Use -mskip-rax-setup if supported. KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup) From ae92169eb94faeb71c9061b14ae13b7ddac47c3c Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Thu, 17 Aug 2017 11:20:47 -0700 Subject: [PATCH 390/555] UPSTREAM: x86/build: Use cc-option to validate stack alignment parameter With the following commit: 8f91869766c0 ("x86/build: Fix stack alignment for CLang") cc-option is only used to determine the name of the stack alignment option supported by the compiler, but not to verify that the actual parameter