From 27adebe16fbcf7b4436bbcfe057113d1fecb4405 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 11 Dec 2015 11:58:05 +0000 Subject: [PATCH 001/235] ANDROID: fixup! sched/fair: jump to max OPP when crossing UP threshold Signed-off-by: Juri Lelli [AmitP: cherry-picked from android-4.4] Signed-off-by: Amit Pundir --- kernel/sched/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5d6f8bfe4991..1bbae990cb48 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3247,6 +3247,7 @@ void scheduler_tick(void) walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, walt_ktime_clock(), 0); calc_global_load_tick(rq); + sched_freq_tick(cpu); raw_spin_unlock(&rq->lock); perf_event_task_tick(); @@ -3256,8 +3257,6 @@ void scheduler_tick(void) trigger_load_balance(rq); #endif rq_last_tick_reset(rq); - - sched_freq_tick(cpu); } #ifdef CONFIG_NO_HZ_FULL From 3d89857e627ae21f5330ff331420299d66ef1b35 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 6 Dec 2016 11:50:53 +0000 Subject: [PATCH 002/235] ANDROID: sched/walt: kill {min,max}_capacity {min,max}_capacity are static variables that are only updated from __update_min_max_capacity(), but not used anywhere else. Remove them together with the function updating them. This has also the nice side effect of fixing a LOCKDEP warning related to locking all CPUs in update_min_max_capacity(), as reported by Ke Wang: [ 2.853595] c0 ============================================= [ 2.859219] c0 [ INFO: possible recursive locking detected ] [ 2.864852] c0 4.4.6+ #5 Tainted: G W [ 2.869604] c0 --------------------------------------------- [ 2.875230] c0 swapper/0/1 is trying to acquire lock: [ 2.880248] (&rq->lock){-.-.-.}, at: [] cpufreq_notifier_policy+0x2e8/0x37c [ 2.888815] c0 [ 2.888815] c0 but task is already holding lock: [ 2.895132] (&rq->lock){-.-.-.}, at: [] cpufreq_notifier_policy+0x2e8/0x37c [ 2.903700] c0 [ 2.903700] c0 other info that might help us debug this: [ 2.910710] c0 Possible unsafe locking scenario: [ 2.910710] c0 [ 2.917112] c0 CPU0 [ 2.919795] c0 ---- [ 2.922478] lock(&rq->lock); [ 2.925507] lock(&rq->lock); [ 2.928536] c0 [ 2.928536] c0 *** DEADLOCK *** [ 2.928536] c0 [ 2.935200] c0 May be due to missing lock nesting notation [ 2.935200] c0 [ 2.942471] c0 7 locks held by swapper/0/1: [ 2.946623] #0: (&dev->mutex){......}, at: [] __driver_attach+0x64/0xb8 [ 2.954931] #1: (&dev->mutex){......}, at: [] __driver_attach+0x74/0xb8 [ 2.963239] #2: (cpu_hotplug.lock){++++++}, at: [] get_online_cpus+0x48/0xa8 [ 2.971979] #3: (subsys mutex#6){+.+.+.}, at: [] subsys_interface_register+0x44/0xc0 [ 2.981411] #4: (&policy->rwsem){+.+.+.}, at: [] cpufreq_online+0x330/0x76c [ 2.990065] #5: ((cpufreq_policy_notifier_list).rwsem){.+.+..}, at: [] blocking_notifier_call_chain+0x38/0xc4 [ 3.001661] #6: (&rq->lock){-.-.-.}, at: [] cpufreq_notifier_policy+0x2e8/0x37c [ 3.010661] c0 [ 3.010661] c0 stack backtrace: [ 3.015514] c0 CPU: 0 PID: 1 Comm: swapper/0 Tainted: G W 4.4.6+ #5 [ 3.022864] c0 Hardware name: Spreadtrum SP9860g Board (DT) [ 3.028402] c0 Call trace: [ 3.031092] c0 [] dump_backtrace+0x0/0x210 [ 3.036716] c0 [] show_stack+0x20/0x28 [ 3.041994] c0 [] dump_stack+0xa8/0xe0 [ 3.047273] c0 [] __lock_acquire+0x1e0c/0x2218 [ 3.053243] c0 [] lock_acquire+0xe0/0x280 [ 3.058784] c0 [] _raw_spin_lock+0x44/0x58 [ 3.064407] c0 [] cpufreq_notifier_policy+0x2e8/0x37c [ 3.070983] c0 [] blocking_notifier_call_chain+0x78/0xc4 [ 3.077820] c0 [] cpufreq_online+0x28c/0x76c [ 3.083618] c0 [] cpufreq_add_dev+0x98/0xdc [ 3.089331] c0 [] subsys_interface_register+0x84/0xc0 [ 3.095907] c0 [] cpufreq_register_driver+0x168/0x28c [ 3.102486] c0 [] sprd_cpufreq_probe+0x134/0x19c [ 3.108629] c0 [] platform_drv_probe+0x58/0xd0 [ 3.114599] c0 [] driver_probe_device+0x1e8/0x470 [ 3.120830] c0 [] __driver_attach+0xb4/0xb8 [ 3.126541] c0 [] bus_for_each_dev+0x6c/0xac [ 3.132339] c0 [] driver_attach+0x2c/0x34 [ 3.137877] c0 [] bus_add_driver+0x210/0x298 [ 3.143676] c0 [] driver_register+0x7c/0x114 [ 3.149476] c0 [] __platform_driver_register+0x60/0x6c [ 3.156139] c0 [] sprd_cpufreq_platdrv_init+0x18/0x20 [ 3.162714] c0 [] do_one_initcall+0xd0/0x1d8 [ 3.168514] c0 [] kernel_init_freeable+0x1fc/0x29c [ 3.174834] c0 [] kernel_init+0x20/0x12c [ 3.180281] c0 [] ret_from_fork+0x10/0x40 Reported-by: Ke Wang Signed-off-by: Juri Lelli [AmitP: cherry-picked from android-4.4] Signed-off-by: Amit Pundir --- kernel/sched/walt.c | 45 +-------------------------------------------- 1 file changed, 1 insertion(+), 44 deletions(-) diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index 1b4bb233d8b6..1d5e1d5cb13f 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -62,8 +62,6 @@ static unsigned int max_possible_freq = 1; */ static unsigned int min_max_freq = 1; -static unsigned int max_capacity = 1024; -static unsigned int min_capacity = 1024; static unsigned int max_load_scale_factor = 1024; static unsigned int max_possible_capacity = 1024; @@ -869,39 +867,6 @@ void walt_fixup_busy_time(struct task_struct *p, int new_cpu) double_rq_unlock(src_rq, dest_rq); } -/* Keep track of max/min capacity possible across CPUs "currently" */ -static void __update_min_max_capacity(void) -{ - int i; - int max = 0, min = INT_MAX; - - for_each_online_cpu(i) { - if (cpu_rq(i)->capacity > max) - max = cpu_rq(i)->capacity; - if (cpu_rq(i)->capacity < min) - min = cpu_rq(i)->capacity; - } - - max_capacity = max; - min_capacity = min; -} - -static void update_min_max_capacity(void) -{ - unsigned long flags; - int i; - - local_irq_save(flags); - for_each_possible_cpu(i) - raw_spin_lock(&cpu_rq(i)->lock); - - __update_min_max_capacity(); - - for_each_possible_cpu(i) - raw_spin_unlock(&cpu_rq(i)->lock); - local_irq_restore(flags); -} - /* * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that * least efficient cpu gets capacity of 1024 @@ -984,15 +949,9 @@ static int cpufreq_notifier_policy(struct notifier_block *nb, /* Initialized to policy->max in case policy->related_cpus is empty! */ unsigned int orig_max_freq = policy->max; - if (val != CPUFREQ_NOTIFY && val != CPUFREQ_REMOVE_POLICY && - val != CPUFREQ_CREATE_POLICY) + if (val != CPUFREQ_NOTIFY) return 0; - if (val == CPUFREQ_REMOVE_POLICY || val == CPUFREQ_CREATE_POLICY) { - update_min_max_capacity(); - return 0; - } - for_each_cpu(i, policy->related_cpus) { cpumask_copy(&cpu_rq(i)->freq_domain_cpumask, policy->related_cpus); @@ -1082,8 +1041,6 @@ static int cpufreq_notifier_policy(struct notifier_block *nb, max_load_scale_factor = highest_mplsf; } - __update_min_max_capacity(); - return 0; } From 68f3fc9c64578f002265cb7b92e527af6286c262 Mon Sep 17 00:00:00 2001 From: Zhang Bo Date: Thu, 1 Jun 2017 14:21:58 +0800 Subject: [PATCH 003/235] ANDROID: cpufreq: system panic when switch interactive governor frequently When try to suspend and resume and playback video or audio at the same time, Power HAL will switch governor between conservative and interactive. Before cpufreq switch to conservative governor, it will stop interactive governor(in function: cpufreq_interactive_stop) and set "icpu->ipolicy = NULL;". If the cpufreq_interactive_speedchange_task doesn't exit and run at the same time, it try to get "policy = icpu->ipolicy->policy;". It will cause system panic. Need to check pointer validatation before read. Change-Id: I608969370738130c44b1816f073423d509a3bcd7 Signed-off-by: Zhang Bo Signed-off-by: Todd Poynor --- drivers/cpufreq/cpufreq_interactive.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/cpufreq_interactive.c b/drivers/cpufreq/cpufreq_interactive.c index d6cac0e8e7a6..576be554d83d 100644 --- a/drivers/cpufreq/cpufreq_interactive.c +++ b/drivers/cpufreq/cpufreq_interactive.c @@ -570,13 +570,15 @@ again: for_each_cpu(cpu, &tmp_mask) { struct interactive_cpu *icpu = &per_cpu(interactive_cpu, cpu); - struct cpufreq_policy *policy = icpu->ipolicy->policy; + struct cpufreq_policy *policy; if (unlikely(!down_read_trylock(&icpu->enable_sem))) continue; - if (likely(icpu->ipolicy)) + if (likely(icpu->ipolicy)) { + policy = icpu->ipolicy->policy; cpufreq_interactive_adjust_cpu(cpu, policy); + } up_read(&icpu->enable_sem); } From 98350987ca497636833d15788bf107e2e3d0cc16 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Tue, 30 May 2017 14:46:26 -0700 Subject: [PATCH 004/235] ANDROID: hid: uhid: implement refcount for open and close Fix concurrent open and close activity sending a UHID_CLOSE while some consumers still have the device open. Temporary solution for reference counts on device open and close calls, absent a facility for this in the HID core likely to appear in the future. [toddpoynor@google.com: commit text] Bug: 38448648 Signed-off-by: Todd Poynor Signed-off-by: Dmitry Torokhov Change-Id: I57413e42ec961a960a8ddc4942228df22c730d80 --- drivers/hid/uhid.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/hid/uhid.c b/drivers/hid/uhid.c index 7f8ff39ed44b..e46f65671dec 100644 --- a/drivers/hid/uhid.c +++ b/drivers/hid/uhid.c @@ -28,6 +28,8 @@ #define UHID_NAME "uhid" #define UHID_BUFSIZE 32 +static DEFINE_MUTEX(uhid_open_mutex); + struct uhid_device { struct mutex devlock; bool running; @@ -142,15 +144,26 @@ static void uhid_hid_stop(struct hid_device *hid) static int uhid_hid_open(struct hid_device *hid) { struct uhid_device *uhid = hid->driver_data; + int retval = 0; - return uhid_queue_event(uhid, UHID_OPEN); + mutex_lock(&uhid_open_mutex); + if (!hid->open++) { + retval = uhid_queue_event(uhid, UHID_OPEN); + if (retval) + hid->open--; + } + mutex_unlock(&uhid_open_mutex); + return retval; } static void uhid_hid_close(struct hid_device *hid) { struct uhid_device *uhid = hid->driver_data; - uhid_queue_event(uhid, UHID_CLOSE); + mutex_lock(&uhid_open_mutex); + if (!--hid->open) + uhid_queue_event(uhid, UHID_CLOSE); + mutex_unlock(&uhid_open_mutex); } static int uhid_hid_parse(struct hid_device *hid) From 6f115f5b6c294ed9d3cc25086da2ce420224f791 Mon Sep 17 00:00:00 2001 From: Ganesh Mahendran Date: Wed, 24 May 2017 10:28:27 +0800 Subject: [PATCH 005/235] ANDROID: Kconfig: add depends for UID_SYS_STATS uid_io depends on TASK_XACCT and TASK_IO_ACCOUNTING. So add depends in Kconfig before compiling code. Change-Id: Ie6bf57ec7c2eceffadf4da0fc2aca001ce10c36e Signed-off-by: Ganesh Mahendran --- drivers/misc/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 52ca7a3a8b6b..8db69003f37c 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -768,7 +768,7 @@ config PANEL_BOOT_MESSAGE config UID_SYS_STATS bool "Per-UID statistics" - depends on PROFILING + depends on PROFILING && TASK_XACCT && TASK_IO_ACCOUNTING help Per UID based cpu time statistics exported to /proc/uid_cputime Per UID based io statistics exported to /proc/uid_io From a2f057876a9ced710d0ff066fdc663d2bfeb9fb7 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Thu, 11 May 2017 16:15:15 -0700 Subject: [PATCH 006/235] FROMLIST: f2fs: sanity check checkpoint segno and blkoff Make sure segno and blkoff read from raw image are valid. (url https://sourceforge.net/p/linux-f2fs/mailman/message/35835945) Signed-off-by: Jin Qian Bug: 36588520 Change-Id: Iba66ab97d3d0870ea48b5ef192d9075f225a934a --- fs/f2fs/super.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 7e0c002c12e9..b81998e9e91d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1424,6 +1424,8 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) unsigned int total, fsmeta; struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + unsigned int main_segs, blocks_per_seg; + int i; total = le32_to_cpu(raw_super->segment_count); fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); @@ -1435,6 +1437,22 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) if (unlikely(fsmeta >= total)) return 1; + main_segs = le32_to_cpu(sbi->raw_super->segment_count_main); + blocks_per_seg = sbi->blocks_per_seg; + + for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { + if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs || + le16_to_cpu(ckpt->cur_node_blkoff[i]) >= blocks_per_seg) { + return 1; + } + } + for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) { + if (le32_to_cpu(ckpt->cur_data_segno[i]) >= main_segs || + le16_to_cpu(ckpt->cur_data_blkoff[i]) >= blocks_per_seg) { + return 1; + } + } + if (unlikely(f2fs_cp_error(sbi))) { f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); return 1; From 18bf691e44b0bdb0fea1eff3324c701f69c777c1 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Thu, 1 Jun 2017 11:18:30 -0700 Subject: [PATCH 007/235] FROMLIST: f2fs: sanity check size of nat and sit cache Make sure number of entires doesn't exceed max journal size. Cc: stable@vger.kernel.org Signed-off-by: Jin Qian Signed-off-by: Jaegeuk Kim (url https://sourceforge.net/p/linux-f2fs/mailman/message/35872032) Bug: 36819470 Change-Id: I94ad46eb3586cfcdd5f477a6caa2244b7d05da56 --- fs/f2fs/segment.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a7943f861d68..74a2b444406d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1805,6 +1805,8 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) static int restore_curseg_summaries(struct f2fs_sb_info *sbi) { + struct f2fs_journal *sit_j = CURSEG_I(sbi, CURSEG_COLD_DATA)->journal; + struct f2fs_journal *nat_j = CURSEG_I(sbi, CURSEG_HOT_DATA)->journal; int type = CURSEG_HOT_DATA; int err; @@ -1831,6 +1833,11 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) return err; } + /* sanity check for summary blocks */ + if (nats_in_cursum(nat_j) > NAT_JOURNAL_ENTRIES || + sits_in_cursum(sit_j) > SIT_JOURNAL_ENTRIES) + return -EINVAL; + return 0; } From 8f3ac4c1ba7da22794943ade0e51863c098cbbb0 Mon Sep 17 00:00:00 2001 From: David Zeuthen Date: Fri, 19 May 2017 17:20:18 -0400 Subject: [PATCH 008/235] ANDROID: Update init/do_mounts_dm.c to the latest ChromiumOS version. This is needed for AVB. Bug: None Test: Compiles. Change-Id: I45b5d435652ab66ec07420ab17f2c7889f7e4d95 Signed-off-by: David Zeuthen --- drivers/md/dm.h | 2 - include/linux/device-mapper.h | 7 + init/do_mounts_dm.c | 562 ++++++++++++++++++---------------- 3 files changed, 310 insertions(+), 261 deletions(-) diff --git a/drivers/md/dm.h b/drivers/md/dm.h index f0aad08b9654..ed25f30a7550 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -80,8 +80,6 @@ void dm_set_md_type(struct mapped_device *md, unsigned type); unsigned dm_get_md_type(struct mapped_device *md); struct target_type *dm_get_immutable_target_type(struct mapped_device *md); -int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t); - /* * To check the return value from dm_table_find_target(). */ diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 0e1e050bc574..cf86f528e615 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -440,6 +440,13 @@ union map_info *dm_get_rq_mapinfo(struct request *rq); struct queue_limits *dm_get_queue_limits(struct mapped_device *md); +void dm_lock_md_type(struct mapped_device *md); +void dm_unlock_md_type(struct mapped_device *md); +void dm_set_md_type(struct mapped_device *md, unsigned type); +unsigned dm_get_md_type(struct mapped_device *md); +int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t); +unsigned dm_table_get_type(struct dm_table *t); + /* * Geometry functions. */ diff --git a/init/do_mounts_dm.c b/init/do_mounts_dm.c index a557c5ee00a7..af84b01ccfbc 100644 --- a/init/do_mounts_dm.c +++ b/init/do_mounts_dm.c @@ -5,13 +5,17 @@ * * This file is released under the GPL. */ +#include +#include #include #include #include +#include #include "do_mounts.h" -#include "../drivers/md/dm.h" +#define DM_MAX_DEVICES 256 +#define DM_MAX_TARGETS 256 #define DM_MAX_NAME 32 #define DM_MAX_UUID 129 #define DM_NO_UUID "none" @@ -19,14 +23,47 @@ #define DM_MSG_PREFIX "init" /* Separators used for parsing the dm= argument. */ -#define DM_FIELD_SEP ' ' -#define DM_LINE_SEP ',' +#define DM_FIELD_SEP " " +#define DM_LINE_SEP "," +#define DM_ANY_SEP DM_FIELD_SEP DM_LINE_SEP /* * When the device-mapper and any targets are compiled into the kernel - * (not a module), one target may be created and used as the root device at - * boot time with the parameters given with the boot line dm=... - * The code for that is here. + * (not a module), one or more device-mappers may be created and used + * as the root device at boot time with the parameters given with the + * boot line dm=... + * + * Multiple device-mappers can be stacked specifing the number of + * devices. A device can have multiple targets if the the number of + * targets is specified. + * + * TODO(taysom:defect 32847) + * In the future, the field will be mandatory. + * + * ::= [] + + * ::= "," + + * ::= [] + * ::= "," + * ::= "ro" | "rw" + * ::= xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx | "none" + * ::= "verity" | "bootcache" | ... + * + * Example: + * 2 vboot none ro 1, + * 0 1768000 bootcache + * device=aa55b119-2a47-8c45-946a-5ac57765011f+1 + * signature=76e9be054b15884a9fa85973e9cb274c93afadb6 + * cache_start=1768000 max_blocks=100000 size_limit=23 max_trace=20000, + * vroot none ro 1, + * 0 1740800 verity payload=254:0 hashtree=254:0 hashstart=1740800 alg=sha1 + * root_hexdigest=76e9be054b15884a9fa85973e9cb274c93afadb6 + * salt=5b3549d54d6c7a3837b9b81ed72e49463a64c03680c47835bef94d768e5646fe + * + * Notes: + * 1. uuid is a label for the device and we set it to "none". + * 2. The field will be optional initially and assumed to be 1. + * Once all the scripts that set these fields have been set, it will + * be made mandatory. */ struct dm_setup_target { @@ -38,381 +75,388 @@ struct dm_setup_target { struct dm_setup_target *next; }; -static struct { +struct dm_device { int minor; int ro; char name[DM_MAX_NAME]; char uuid[DM_MAX_UUID]; - char *targets; + unsigned long num_targets; struct dm_setup_target *target; int target_count; + struct dm_device *next; +}; + +struct dm_option { + char *start; + char *next; + size_t len; + char delim; +}; + +static struct { + unsigned long num_devices; + char *str; } dm_setup_args __initdata; static __initdata int dm_early_setup; -static size_t __init get_dm_option(char *str, char **next, char sep) +static int __init get_dm_option(struct dm_option *opt, const char *accept) { - size_t len = 0; - char *endp = NULL; + char *str = opt->next; + char *endp; if (!str) return 0; - endp = strchr(str, sep); + str = skip_spaces(str); + opt->start = str; + endp = strpbrk(str, accept); if (!endp) { /* act like strchrnul */ - len = strlen(str); - endp = str + len; + opt->len = strlen(str); + endp = str + opt->len; } else { - len = endp - str; + opt->len = endp - str; } - - if (endp == str) - return 0; - - if (!next) - return len; - + opt->delim = *endp; if (*endp == 0) { /* Don't advance past the nul. */ - *next = endp; + opt->next = endp; } else { - *next = endp + 1; + opt->next = endp + 1; } - return len; + return opt->len != 0; } -static int __init dm_setup_args_init(void) +static int __init dm_setup_cleanup(struct dm_device *devices) { - dm_setup_args.minor = 0; - dm_setup_args.ro = 0; - dm_setup_args.target = NULL; - dm_setup_args.target_count = 0; + struct dm_device *dev = devices; + + while (dev) { + struct dm_device *old_dev = dev; + struct dm_setup_target *target = dev->target; + while (target) { + struct dm_setup_target *old_target = target; + kfree(target->type); + kfree(target->params); + target = target->next; + kfree(old_target); + dev->target_count--; + } + BUG_ON(dev->target_count); + dev = dev->next; + kfree(old_dev); + } return 0; } -static int __init dm_setup_cleanup(void) +static char * __init dm_parse_device(struct dm_device *dev, char *str) { - struct dm_setup_target *target = dm_setup_args.target; - struct dm_setup_target *old_target = NULL; - while (target) { - kfree(target->type); - kfree(target->params); - old_target = target; - target = target->next; - kfree(old_target); - dm_setup_args.target_count--; - } - BUG_ON(dm_setup_args.target_count); - return 0; -} - -static char * __init dm_setup_parse_device_args(char *str) -{ - char *next = NULL; - size_t len = 0; + struct dm_option opt; + size_t len; /* Grab the logical name of the device to be exported to udev */ - len = get_dm_option(str, &next, DM_FIELD_SEP); - if (!len) { + opt.next = str; + if (!get_dm_option(&opt, DM_FIELD_SEP)) { DMERR("failed to parse device name"); goto parse_fail; } - len = min(len + 1, sizeof(dm_setup_args.name)); - strlcpy(dm_setup_args.name, str, len); /* includes nul */ - str = skip_spaces(next); + len = min(opt.len + 1, sizeof(dev->name)); + strlcpy(dev->name, opt.start, len); /* includes nul */ /* Grab the UUID value or "none" */ - len = get_dm_option(str, &next, DM_FIELD_SEP); - if (!len) { + if (!get_dm_option(&opt, DM_FIELD_SEP)) { DMERR("failed to parse device uuid"); goto parse_fail; } - len = min(len + 1, sizeof(dm_setup_args.uuid)); - strlcpy(dm_setup_args.uuid, str, len); - str = skip_spaces(next); + len = min(opt.len + 1, sizeof(dev->uuid)); + strlcpy(dev->uuid, opt.start, len); /* Determine if the table/device will be read only or read-write */ - if (!strncmp("ro,", str, 3)) { - dm_setup_args.ro = 1; - } else if (!strncmp("rw,", str, 3)) { - dm_setup_args.ro = 0; + get_dm_option(&opt, DM_ANY_SEP); + if (!strncmp("ro", opt.start, opt.len)) { + dev->ro = 1; + } else if (!strncmp("rw", opt.start, opt.len)) { + dev->ro = 0; } else { DMERR("failed to parse table mode"); goto parse_fail; } - str = skip_spaces(str + 3); - return str; + /* Optional number field */ + /* XXX: The field will be mandatory in the next round */ + if (opt.delim == DM_FIELD_SEP[0]) { + if (!get_dm_option(&opt, DM_LINE_SEP)) + return NULL; + dev->num_targets = simple_strtoul(opt.start, NULL, 10); + } else { + dev->num_targets = 1; + } + if (dev->num_targets > DM_MAX_TARGETS) { + DMERR("too many targets %lu > %d", + dev->num_targets, DM_MAX_TARGETS); + } + return opt.next; parse_fail: return NULL; } -static void __init dm_substitute_devices(char *str, size_t str_len) +static char * __init dm_parse_targets(struct dm_device *dev, char *str) { - char *candidate = str; - char *candidate_end = str; - char old_char; - size_t len = 0; - dev_t dev; - - if (str_len < 3) - return; - - while (str && *str) { - candidate = strchr(str, '/'); - if (!candidate) - break; - - /* Avoid embedded slashes */ - if (candidate != str && *(candidate - 1) != DM_FIELD_SEP) { - str = strchr(candidate, DM_FIELD_SEP); - continue; - } - - len = get_dm_option(candidate, &candidate_end, DM_FIELD_SEP); - str = skip_spaces(candidate_end); - if (len < 3 || len > 37) /* name_to_dev_t max; maj:mix min */ - continue; - - /* Temporarily terminate with a nul */ - if (*candidate_end) - candidate_end--; - old_char = *candidate_end; - *candidate_end = '\0'; - - DMDEBUG("converting candidate device '%s' to dev_t", candidate); - /* Use the boot-time specific device naming */ - dev = name_to_dev_t(candidate); - *candidate_end = old_char; - - DMDEBUG(" -> %u", dev); - /* No suitable replacement found */ - if (!dev) - continue; - - /* Rewrite the /dev/path as a major:minor */ - len = snprintf(candidate, len, "%u:%u", MAJOR(dev), MINOR(dev)); - if (!len) { - DMERR("error substituting device major/minor."); - break; - } - candidate += len; - /* Pad out with spaces (fixing our nul) */ - while (candidate < candidate_end) - *(candidate++) = DM_FIELD_SEP; - } -} - -static int __init dm_setup_parse_targets(char *str) -{ - char *next = NULL; - size_t len = 0; - struct dm_setup_target **target = NULL; + struct dm_option opt; + struct dm_setup_target **target = &dev->target; + unsigned long num_targets = dev->num_targets; + unsigned long i; /* Targets are defined as per the table format but with a * comma as a newline separator. */ - target = &dm_setup_args.target; - while (str && *str) { + opt.next = str; + for (i = 0; i < num_targets; i++) { *target = kzalloc(sizeof(struct dm_setup_target), GFP_KERNEL); if (!*target) { - DMERR("failed to allocate memory for target %d", - dm_setup_args.target_count); + DMERR("failed to allocate memory for target %s<%ld>", + dev->name, i); goto parse_fail; } - dm_setup_args.target_count++; + dev->target_count++; - (*target)->begin = simple_strtoull(str, &next, 10); - if (!next || *next != DM_FIELD_SEP) { - DMERR("failed to parse starting sector for target %d", - dm_setup_args.target_count - 1); + if (!get_dm_option(&opt, DM_FIELD_SEP)) { + DMERR("failed to parse starting sector" + " for target %s<%ld>", dev->name, i); goto parse_fail; } - str = skip_spaces(next + 1); + (*target)->begin = simple_strtoull(opt.start, NULL, 10); - (*target)->length = simple_strtoull(str, &next, 10); - if (!next || *next != DM_FIELD_SEP) { - DMERR("failed to parse length for target %d", - dm_setup_args.target_count - 1); + if (!get_dm_option(&opt, DM_FIELD_SEP)) { + DMERR("failed to parse length for target %s<%ld>", + dev->name, i); goto parse_fail; } - str = skip_spaces(next + 1); + (*target)->length = simple_strtoull(opt.start, NULL, 10); - len = get_dm_option(str, &next, DM_FIELD_SEP); - if (!len || - !((*target)->type = kstrndup(str, len, GFP_KERNEL))) { - DMERR("failed to parse type for target %d", - dm_setup_args.target_count - 1); + if (get_dm_option(&opt, DM_FIELD_SEP)) + (*target)->type = kstrndup(opt.start, opt.len, + GFP_KERNEL); + if (!((*target)->type)) { + DMERR("failed to parse type for target %s<%ld>", + dev->name, i); goto parse_fail; } - str = skip_spaces(next); - - len = get_dm_option(str, &next, DM_LINE_SEP); - if (!len || - !((*target)->params = kstrndup(str, len, GFP_KERNEL))) { - DMERR("failed to parse params for target %d", - dm_setup_args.target_count - 1); + if (get_dm_option(&opt, DM_LINE_SEP)) + (*target)->params = kstrndup(opt.start, opt.len, + GFP_KERNEL); + if (!((*target)->params)) { + DMERR("failed to parse params for target %s<%ld>", + dev->name, i); goto parse_fail; } - str = skip_spaces(next); - - /* Before moving on, walk through the copied target and - * attempt to replace all /dev/xxx with the major:minor number. - * It may not be possible to resolve them traditionally at - * boot-time. */ - dm_substitute_devices((*target)->params, len); - target = &((*target)->next); } - DMDEBUG("parsed %d targets", dm_setup_args.target_count); + DMDEBUG("parsed %d targets", dev->target_count); - return 0; + return opt.next; parse_fail: - return 1; + return NULL; +} + +static struct dm_device * __init dm_parse_args(void) +{ + struct dm_device *devices = NULL; + struct dm_device **tail = &devices; + struct dm_device *dev; + char *str = dm_setup_args.str; + unsigned long num_devices = dm_setup_args.num_devices; + unsigned long i; + + if (!str) + return NULL; + for (i = 0; i < num_devices; i++) { + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) { + DMERR("failed to allocated memory for dev"); + goto error; + } + *tail = dev; + tail = &dev->next; + /* + * devices are given minor numbers 0 - n-1 + * in the order they are found in the arg + * string. + */ + dev->minor = i; + str = dm_parse_device(dev, str); + if (!str) /* NULL indicates error in parsing, bail */ + goto error; + + str = dm_parse_targets(dev, str); + if (!str) + goto error; + } + return devices; +error: + dm_setup_cleanup(devices); + return NULL; } /* * Parse the command-line parameters given our kernel, but do not * actually try to invoke the DM device now; that is handled by - * dm_setup_drive after the low-level disk drivers have initialised. - * dm format is as follows: - * dm="name uuid fmode,[table line 1],[table line 2],..." - * May be used with root=/dev/dm-0 as it always uses the first dm minor. + * dm_setup_drives after the low-level disk drivers have initialised. + * dm format is described at the top of the file. + * + * Because dm minor numbers are assigned in assending order starting with 0, + * You can assume the first device is /dev/dm-0, the next device is /dev/dm-1, + * and so forth. */ - static int __init dm_setup(char *str) { - dm_setup_args_init(); + struct dm_option opt; + unsigned long num_devices; - str = dm_setup_parse_device_args(str); if (!str) { DMDEBUG("str is NULL"); goto parse_fail; } - - /* Target parsing is delayed until we have dynamic memory */ - dm_setup_args.targets = str; - - printk(KERN_INFO "dm: will configure '%s' on dm-%d\n", - dm_setup_args.name, dm_setup_args.minor); - + opt.next = str; + if (!get_dm_option(&opt, DM_FIELD_SEP)) + goto parse_fail; + if (isdigit(opt.start[0])) { /* XXX: Optional number field */ + num_devices = simple_strtoul(opt.start, NULL, 10); + str = opt.next; + } else { + num_devices = 1; + /* Don't advance str */ + } + if (num_devices > DM_MAX_DEVICES) { + DMDEBUG("too many devices %lu > %d", + num_devices, DM_MAX_DEVICES); + } + dm_setup_args.str = str; + dm_setup_args.num_devices = num_devices; + DMINFO("will configure %lu devices", num_devices); dm_early_setup = 1; return 1; parse_fail: - printk(KERN_WARNING "dm: Invalid arguments supplied to dm=.\n"); + DMWARN("Invalid arguments supplied to dm=."); return 0; } - -static void __init dm_setup_drive(void) +static void __init dm_setup_drives(void) { struct mapped_device *md = NULL; struct dm_table *table = NULL; struct dm_setup_target *target; - char *uuid = dm_setup_args.uuid; + struct dm_device *dev; + char *uuid; fmode_t fmode = FMODE_READ; + struct dm_device *devices; - /* Finish parsing the targets. */ - if (dm_setup_parse_targets(dm_setup_args.targets)) - goto parse_fail; + devices = dm_parse_args(); - if (dm_create(dm_setup_args.minor, &md)) { - DMDEBUG("failed to create the device"); - goto dm_create_fail; - } - DMDEBUG("created device '%s'", dm_device_name(md)); - - /* In addition to flagging the table below, the disk must be - * set explicitly ro/rw. */ - set_disk_ro(dm_disk(md), dm_setup_args.ro); - - if (!dm_setup_args.ro) - fmode |= FMODE_WRITE; - if (dm_table_create(&table, fmode, dm_setup_args.target_count, md)) { - DMDEBUG("failed to create the table"); - goto dm_table_create_fail; - } - - dm_lock_md_type(md); - target = dm_setup_args.target; - while (target) { - DMINFO("adding target '%llu %llu %s %s'", - (unsigned long long) target->begin, - (unsigned long long) target->length, target->type, - target->params); - if (dm_table_add_target(table, target->type, target->begin, - target->length, target->params)) { - DMDEBUG("failed to add the target to the table"); - goto add_target_fail; + for (dev = devices; dev; dev = dev->next) { + if (dm_create(dev->minor, &md)) { + DMDEBUG("failed to create the device"); + goto dm_create_fail; } - target = target->next; - } + DMDEBUG("created device '%s'", dm_device_name(md)); - if (dm_table_complete(table)) { - DMDEBUG("failed to complete the table"); - goto table_complete_fail; - } + /* + * In addition to flagging the table below, the disk must be + * set explicitly ro/rw. + */ + set_disk_ro(dm_disk(md), dev->ro); - if (dm_get_md_type(md) == DM_TYPE_NONE) { + if (!dev->ro) + fmode |= FMODE_WRITE; + if (dm_table_create(&table, fmode, dev->target_count, md)) { + DMDEBUG("failed to create the table"); + goto dm_table_create_fail; + } + + dm_lock_md_type(md); + + for (target = dev->target; target; target = target->next) { + DMINFO("adding target '%llu %llu %s %s'", + (unsigned long long) target->begin, + (unsigned long long) target->length, + target->type, target->params); + if (dm_table_add_target(table, target->type, + target->begin, + target->length, + target->params)) { + DMDEBUG("failed to add the target" + " to the table"); + goto add_target_fail; + } + } + if (dm_table_complete(table)) { + DMDEBUG("failed to complete the table"); + goto table_complete_fail; + } + + /* Suspend the device so that we can bind it to the table. */ + if (dm_suspend(md, 0)) { + DMDEBUG("failed to suspend the device pre-bind"); + goto suspend_fail; + } + + /* Initial table load: acquire type of table. */ dm_set_md_type(md, dm_table_get_type(table)); + + /* Setup md->queue to reflect md's type. */ if (dm_setup_md_queue(md, table)) { DMWARN("unable to set up device queue for new table."); goto setup_md_queue_fail; } - } else if (dm_get_md_type(md) != dm_table_get_type(table)) { - DMWARN("can't change device type after initial table load."); - goto setup_md_queue_fail; - } - /* Suspend the device so that we can bind it to the table. */ - if (dm_suspend(md, 0)) { - DMDEBUG("failed to suspend the device pre-bind"); - goto suspend_fail; + /* + * Bind the table to the device. This is the only way + * to associate md->map with the table and set the disk + * capacity directly. + */ + if (dm_swap_table(md, table)) { /* should return NULL. */ + DMDEBUG("failed to bind the device to the table"); + goto table_bind_fail; + } + + /* Finally, resume and the device should be ready. */ + if (dm_resume(md)) { + DMDEBUG("failed to resume the device"); + goto resume_fail; + } + + /* Export the dm device via the ioctl interface */ + if (!strcmp(DM_NO_UUID, dev->uuid)) + uuid = NULL; + if (dm_ioctl_export(md, dev->name, uuid)) { + DMDEBUG("failed to export device with given" + " name and uuid"); + goto export_fail; + } + + dm_unlock_md_type(md); + + DMINFO("dm-%d is ready", dev->minor); } - - /* Bind the table to the device. This is the only way to associate - * md->map with the table and set the disk capacity directly. */ - if (dm_swap_table(md, table)) { /* should return NULL. */ - DMDEBUG("failed to bind the device to the table"); - goto table_bind_fail; - } - - /* Finally, resume and the device should be ready. */ - if (dm_resume(md)) { - DMDEBUG("failed to resume the device"); - goto resume_fail; - } - - /* Export the dm device via the ioctl interface */ - if (!strcmp(DM_NO_UUID, dm_setup_args.uuid)) - uuid = NULL; - if (dm_ioctl_export(md, dm_setup_args.name, uuid)) { - DMDEBUG("failed to export device with given name and uuid"); - goto export_fail; - } - printk(KERN_INFO "dm: dm-%d is ready\n", dm_setup_args.minor); - - dm_unlock_md_type(md); - dm_setup_cleanup(); + dm_setup_cleanup(devices); return; export_fail: resume_fail: table_bind_fail: -suspend_fail: setup_md_queue_fail: +suspend_fail: table_complete_fail: add_target_fail: dm_unlock_md_type(md); dm_table_create_fail: dm_put(md); dm_create_fail: - dm_setup_cleanup(); -parse_fail: - printk(KERN_WARNING "dm: starting dm-%d (%s) failed\n", - dm_setup_args.minor, dm_setup_args.name); + DMWARN("starting dm-%d (%s) failed", + dev->minor, dev->name); + dm_setup_cleanup(devices); } __setup("dm=", dm_setup); @@ -421,6 +465,6 @@ void __init dm_run_setup(void) { if (!dm_early_setup) return; - printk(KERN_INFO "dm: attempting early device configuration.\n"); - dm_setup_drive(); + DMINFO("attempting early device configuration."); + dm_setup_drives(); } From 9e3c6555594531637581a7f6cdb4a5ad6bd9071f Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Fri, 2 Jun 2017 17:04:59 -0700 Subject: [PATCH 009/235] FROMLIST: [net-next,v2,1/2] bpf: Allow CGROUP_SKB eBPF program to access sk_buff This allows cgroup eBPF program to classify packet based on their protocol or other detail information. Currently program need CAP_NET_ADMIN privilege to attach a cgroup eBPF program, and A process with CAP_NET_ADMIN can already see all packets on the system, for example, by creating an iptables rules that causes the packet to be passed to userspace via NFLOG. (url: http://patchwork.ozlabs.org/patch/769459/) Signed-off-by: Chenbo Feng Bug: 30950746 Change-Id: I11bef84ce26cf8b8f1b89483c32a7fcdd61ae926 --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 44c17f47d94c..fe158bd01dc6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2164,6 +2164,7 @@ static bool may_access_skb(enum bpf_prog_type type) case BPF_PROG_TYPE_SOCKET_FILTER: case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: + case BPF_PROG_TYPE_CGROUP_SKB: return true; default: return false; From b9aad9765754fac26434ec6e99e7c8b943da63e3 Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Fri, 2 Jun 2017 17:24:31 -0700 Subject: [PATCH 010/235] FROMLIST: [net-next,v2,2/2] bpf: Remove the capability check for cgroup skb eBPF program Currently loading a cgroup skb eBPF program require a CAP_SYS_ADMIN capability while attaching the program to a cgroup only requires the user have CAP_NET_ADMIN privilege. We can escape the capability check when load the program just like socket filter program to make the capability requirement consistent. Change since v1: Change the code style in order to be compliant with checkpatch.pl preference (url: http://patchwork.ozlabs.org/patch/769460/) Signed-off-by: Chenbo Feng Bug: 30950746 Change-Id: Ibe51235127d6f9349b8f563ad31effc061b278ed --- kernel/bpf/syscall.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9ab04796bce0..5e668da0feb3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -747,7 +747,9 @@ static int bpf_prog_load(union bpf_attr *attr) attr->kern_version != LINUX_VERSION_CODE) return -EINVAL; - if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN)) + if (type != BPF_PROG_TYPE_SOCKET_FILTER && + type != BPF_PROG_TYPE_CGROUP_SKB && + !capable(CAP_SYS_ADMIN)) return -EPERM; /* plain bpf_prog allocation */ From 71773989febf5c3ac9b424843973080ab3e86f0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Mon, 7 Nov 2016 16:16:15 -0500 Subject: [PATCH 011/235] UPSTREAM: drm/ttm: fix ttm_bo_wait MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit reservation_object_wait_timeout_rcu() should enable signaling even with a zero timeout, but ttm_bo_wait() can also be called from atomic context and then it is not a good idea to do this. Reviewed-by: Alex Deucher Signed-off-by: Christian König Reviewed-by: Gustavo Padovan Signed-off-by: Alex Deucher Signed-off-by: Sumit Semwal [sumits: fix checkpatch warnings] Link: http://patchwork.freedesktop.org/patch/msgid/1478553376-18575-3-git-send-email-alexander.deucher@amd.com (cherry picked from commit 98a6dd909bbd247ce10f36ce709906bc5e9dfeb0) Change-Id: I142f7a853c9f6b16567531d7c332ba9f99325ec2 Signed-off-by: Marissa Wall --- drivers/gpu/drm/ttm/ttm_bo.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index 35cc16f9fec9..c18fc31bf51a 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -1602,7 +1602,14 @@ EXPORT_SYMBOL(ttm_bo_unmap_virtual); int ttm_bo_wait(struct ttm_buffer_object *bo, bool interruptible, bool no_wait) { - long timeout = no_wait ? 0 : 15 * HZ; + long timeout = 15 * HZ; + + if (no_wait) { + if (reservation_object_test_signaled_rcu(bo->resv, true)) + return 0; + else + return -EBUSY; + } timeout = reservation_object_wait_timeout_rcu(bo->resv, true, interruptible, timeout); From 7827566a5ff2c7ad1451981adf57d28d8ff46ca4 Mon Sep 17 00:00:00 2001 From: Roberto Pereira Date: Mon, 5 Jun 2017 15:30:30 -0700 Subject: [PATCH 012/235] android: base-cfg: disable CONFIG_NFSD and CONFIG_NFS_FS Signed-off-by: Roberto Pereira Bug:37753761 Change-Id: I686c6ff59a1754b78cf11d302621920c9a0580de --- kernel/configs/android-base.config | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index bc615c62810b..fb6017e1a869 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config @@ -3,6 +3,8 @@ # CONFIG_DEVMEM is not set # CONFIG_FHANDLE is not set # CONFIG_INET_LRO is not set +# CONFIG_NFSD is not set +# CONFIG_NFS_FS is not set # CONFIG_OABI_COMPAT is not set # CONFIG_SYSVIPC is not set # CONFIG_USELIB is not set From 4eed44029507acc666ac7afe9c6a8ea0abf857b7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 9 May 2017 06:29:19 -0700 Subject: [PATCH 013/235] dccp/tcp: do not inherit mc_list from parent [ Upstream commit 657831ffc38e30092a2d5f03d385d710eb88b09a ] syzkaller found a way to trigger double frees from ip_mc_drop_socket() It turns out that leave a copy of parent mc_list at accept() time, which is very bad. Very similar to commit 8b485ce69876 ("tcp: do not inherit fastopen_req from parent") Initial report from Pray3r, completed by Andrey one. Thanks a lot to them ! Signed-off-by: Eric Dumazet Reported-by: Pray3r Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/inet_connection_sock.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 61a9deec2993..cf3d5674846a 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -665,6 +665,8 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, /* listeners have SOCK_RCU_FREE, not the children */ sock_reset_flag(newsk, SOCK_RCU_FREE); + inet_sk(newsk)->mc_list = NULL; + newsk->sk_mark = inet_rsk(req)->ir_mark; atomic64_set(&newsk->sk_cookie, atomic64_read(&inet_rsk(req)->ir_cookie)); From 8f1f08be397469edf297e2686efc5c70190880d0 Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Tue, 9 May 2017 18:27:33 +0800 Subject: [PATCH 014/235] driver: vrf: Fix one possible use-after-free issue [ Upstream commit 1a4a5bf52a4adb477adb075e5afce925824ad132 ] The current codes only deal with the case that the skb is dropped, it may meet one use-after-free issue when NF_HOOK returns 0 that means the skb is stolen by one netfilter rule or hook. When one netfilter rule or hook stoles the skb and return NF_STOLEN, it means the skb is taken by the rule, and other modules should not touch this skb ever. Maybe the skb is queued or freed directly by the rule. Now uses the nf_hook instead of NF_HOOK to get the result of netfilter, and check the return value of nf_hook. Only when its value equals 1, it means the skb could go ahead. Or reset the skb as NULL. BTW, because vrf_rcv_finish is empty function, so needn't invoke it even though nf_hook returns 1. But we need to modify vrf_rcv_finish to deal with the NF_STOLEN case. There are two cases when skb is stolen. 1. The skb is stolen and freed directly. There is nothing we need to do, and vrf_rcv_finish isn't invoked. 2. The skb is queued and reinjected again. The vrf_rcv_finish would be invoked as okfn, so need to free the skb in it. Signed-off-by: Gao Feng Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/vrf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 80ef4865cc8b..ee02605a0f89 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -850,6 +850,7 @@ static u32 vrf_fib_table(const struct net_device *dev) static int vrf_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { + kfree_skb(skb); return 0; } @@ -859,7 +860,7 @@ static struct sk_buff *vrf_rcv_nfhook(u8 pf, unsigned int hook, { struct net *net = dev_net(dev); - if (NF_HOOK(pf, hook, net, NULL, skb, dev, NULL, vrf_rcv_finish) < 0) + if (nf_hook(pf, hook, net, NULL, skb, dev, NULL, vrf_rcv_finish) != 1) skb = NULL; /* kfree_skb(skb) handled by nf code */ return skb; From 4bd8f5e38e5a1612ce4373068b518b14d3e38ec8 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 9 May 2017 16:59:54 -0700 Subject: [PATCH 015/235] ipv6/dccp: do not inherit ipv6_mc_list from parent [ Upstream commit 83eaddab4378db256d00d295bda6ca997cd13a52 ] Like commit 657831ffc38e ("dccp/tcp: do not inherit mc_list from parent") we should clear ipv6_mc_list etc. for IPv6 sockets too. Cc: Eric Dumazet Signed-off-by: Cong Wang Acked-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/dccp/ipv6.c | 6 ++++++ net/ipv6/tcp_ipv6.c | 2 ++ 2 files changed, 8 insertions(+) diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 237d62c493e3..2ac9d2a1aaab 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -426,6 +426,9 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, newsk->sk_backlog_rcv = dccp_v4_do_rcv; newnp->pktoptions = NULL; newnp->opt = NULL; + newnp->ipv6_mc_list = NULL; + newnp->ipv6_ac_list = NULL; + newnp->ipv6_fl_list = NULL; newnp->mcast_oif = inet6_iif(skb); newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; @@ -490,6 +493,9 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, /* Clone RX bits */ newnp->rxopt.all = np->rxopt.all; + newnp->ipv6_mc_list = NULL; + newnp->ipv6_ac_list = NULL; + newnp->ipv6_fl_list = NULL; newnp->pktoptions = NULL; newnp->opt = NULL; newnp->mcast_oif = inet6_iif(skb); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index b2e61a0e8d0a..aef9b28067f4 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1046,6 +1046,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * newtp->af_specific = &tcp_sock_ipv6_mapped_specific; #endif + newnp->ipv6_mc_list = NULL; newnp->ipv6_ac_list = NULL; newnp->ipv6_fl_list = NULL; newnp->pktoptions = NULL; @@ -1115,6 +1116,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * First: no IPv4 options. */ newinet->inet_opt = NULL; + newnp->ipv6_mc_list = NULL; newnp->ipv6_ac_list = NULL; newnp->ipv6_fl_list = NULL; From 25c1a1e4d891fbe49bf9c1da6ba0cf5e1f21231b Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Wed, 10 May 2017 19:07:51 +0200 Subject: [PATCH 016/235] s390/qeth: handle sysfs error during initialization [ Upstream commit 9111e7880ccf419548c7b0887df020b08eadb075 ] When setting up the device from within the layer discipline's probe routine, creating the layer-specific sysfs attributes can fail. Report this error back to the caller, and handle it by releasing the layer discipline. Signed-off-by: Ursula Braun [jwi: updated commit msg, moved an OSN change to a subsequent patch] Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/s390/net/qeth_core_main.c | 4 +++- drivers/s390/net/qeth_core_sys.c | 2 ++ drivers/s390/net/qeth_l2_main.c | 5 ++++- drivers/s390/net/qeth_l3_main.c | 5 ++++- 4 files changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index 20cf29613043..4bf1f60c407d 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -5663,8 +5663,10 @@ static int qeth_core_set_online(struct ccwgroup_device *gdev) if (rc) goto err; rc = card->discipline->setup(card->gdev); - if (rc) + if (rc) { + qeth_core_free_discipline(card); goto err; + } } rc = card->discipline->set_online(gdev); err: diff --git a/drivers/s390/net/qeth_core_sys.c b/drivers/s390/net/qeth_core_sys.c index 75b29fd2fcf4..412ff61891ac 100644 --- a/drivers/s390/net/qeth_core_sys.c +++ b/drivers/s390/net/qeth_core_sys.c @@ -426,6 +426,8 @@ static ssize_t qeth_dev_layer2_store(struct device *dev, goto out; rc = card->discipline->setup(card->gdev); + if (rc) + qeth_core_free_discipline(card); out: mutex_unlock(&card->discipline_mutex); return rc ? rc : count; diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c index bb27058fa9f0..73260a43b675 100644 --- a/drivers/s390/net/qeth_l2_main.c +++ b/drivers/s390/net/qeth_l2_main.c @@ -1024,8 +1024,11 @@ static int qeth_l2_stop(struct net_device *dev) static int qeth_l2_probe_device(struct ccwgroup_device *gdev) { struct qeth_card *card = dev_get_drvdata(&gdev->dev); + int rc; - qeth_l2_create_device_attributes(&gdev->dev); + rc = qeth_l2_create_device_attributes(&gdev->dev); + if (rc) + return rc; INIT_LIST_HEAD(&card->vid_list); hash_init(card->mac_htable); card->options.layer2 = 1; diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index 272d9e7419be..93fa3316ec55 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -3157,8 +3157,11 @@ static int qeth_l3_setup_netdev(struct qeth_card *card) static int qeth_l3_probe_device(struct ccwgroup_device *gdev) { struct qeth_card *card = dev_get_drvdata(&gdev->dev); + int rc; - qeth_l3_create_device_attributes(&gdev->dev); + rc = qeth_l3_create_device_attributes(&gdev->dev); + if (rc) + return rc; card->options.layer2 = 0; card->info.hwtrap = 0; return 0; From b68c2e387a232a9fc09ccbed792649481ca226f4 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Wed, 10 May 2017 19:07:52 +0200 Subject: [PATCH 017/235] s390/qeth: unbreak OSM and OSN support [ Upstream commit 2d2ebb3ed0c6acfb014f98e427298673a5d07b82 ] commit b4d72c08b358 ("qeth: bridgeport support - basic control") broke the support for OSM and OSN devices as follows: As OSM and OSN are L2 only, qeth_core_probe_device() does an early setup by loading the l2 discipline and calling qeth_l2_probe_device(). In this context, adding the l2-specific bridgeport sysfs attributes via qeth_l2_create_device_attributes() hits a BUG_ON in fs/sysfs/group.c, since the basic sysfs infrastructure for the device hasn't been established yet. Note that OSN actually has its own unique sysfs attributes (qeth_osn_devtype), so the additional attributes shouldn't be created at all. For OSM, add a new qeth_l2_devtype that contains all the common and l2-specific sysfs attributes. When qeth_core_probe_device() does early setup for OSM or OSN, assign the corresponding devtype so that the ccwgroup probe code creates the full set of sysfs attributes. This allows us to skip qeth_l2_create_device_attributes() in case of an early setup. Any device that can't do early setup will initially have only the generic sysfs attributes, and when it's probed later qeth_l2_probe_device() adds the l2-specific attributes. If an early-setup device is removed (by calling ccwgroup_ungroup()), device_unregister() will - using the devtype - delete the l2-specific attributes before qeth_l2_remove_device() is called. So make sure to not remove them twice. What complicates the issue is that qeth_l2_probe_device() and qeth_l2_remove_device() is also called on a device when its layer2 attribute changes (ie. its layer mode is switched). For early-setup devices this wouldn't work properly - we wouldn't remove the l2-specific attributes when switching to L3. But switching the layer mode doesn't actually make any sense; we already decided that the device can only operate in L2! So just refuse to switch the layer mode on such devices. Note that OSN doesn't have a layer2 attribute, so we only need to special-case OSM. Based on an initial patch by Ursula Braun. Fixes: b4d72c08b358 ("qeth: bridgeport support - basic control") Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/s390/net/qeth_core.h | 4 ++++ drivers/s390/net/qeth_core_main.c | 17 +++++++++-------- drivers/s390/net/qeth_core_sys.c | 22 ++++++++++++++-------- drivers/s390/net/qeth_l2.h | 2 ++ drivers/s390/net/qeth_l2_main.c | 17 +++++++++++++---- drivers/s390/net/qeth_l2_sys.c | 8 ++++++++ drivers/s390/net/qeth_l3_main.c | 1 + 7 files changed, 51 insertions(+), 20 deletions(-) diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h index 6d4b68c483f3..f3756ca6f349 100644 --- a/drivers/s390/net/qeth_core.h +++ b/drivers/s390/net/qeth_core.h @@ -718,6 +718,7 @@ enum qeth_discipline_id { }; struct qeth_discipline { + const struct device_type *devtype; void (*start_poll)(struct ccw_device *, int, unsigned long); qdio_handler_t *input_handler; qdio_handler_t *output_handler; @@ -893,6 +894,9 @@ extern struct qeth_discipline qeth_l2_discipline; extern struct qeth_discipline qeth_l3_discipline; extern const struct attribute_group *qeth_generic_attr_groups[]; extern const struct attribute_group *qeth_osn_attr_groups[]; +extern const struct attribute_group qeth_device_attr_group; +extern const struct attribute_group qeth_device_blkt_group; +extern const struct device_type qeth_generic_devtype; extern struct workqueue_struct *qeth_wq; int qeth_card_hw_is_reachable(struct qeth_card *); diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index 4bf1f60c407d..e8c48309ebe9 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -5462,10 +5462,12 @@ void qeth_core_free_discipline(struct qeth_card *card) card->discipline = NULL; } -static const struct device_type qeth_generic_devtype = { +const struct device_type qeth_generic_devtype = { .name = "qeth_generic", .groups = qeth_generic_attr_groups, }; +EXPORT_SYMBOL_GPL(qeth_generic_devtype); + static const struct device_type qeth_osn_devtype = { .name = "qeth_osn", .groups = qeth_osn_attr_groups, @@ -5591,23 +5593,22 @@ static int qeth_core_probe_device(struct ccwgroup_device *gdev) goto err_card; } - if (card->info.type == QETH_CARD_TYPE_OSN) - gdev->dev.type = &qeth_osn_devtype; - else - gdev->dev.type = &qeth_generic_devtype; - switch (card->info.type) { case QETH_CARD_TYPE_OSN: case QETH_CARD_TYPE_OSM: rc = qeth_core_load_discipline(card, QETH_DISCIPLINE_LAYER2); if (rc) goto err_card; + + gdev->dev.type = (card->info.type != QETH_CARD_TYPE_OSN) + ? card->discipline->devtype + : &qeth_osn_devtype; rc = card->discipline->setup(card->gdev); if (rc) goto err_disc; - case QETH_CARD_TYPE_OSD: - case QETH_CARD_TYPE_OSX: + break; default: + gdev->dev.type = &qeth_generic_devtype; break; } diff --git a/drivers/s390/net/qeth_core_sys.c b/drivers/s390/net/qeth_core_sys.c index 412ff61891ac..db6a285d41e0 100644 --- a/drivers/s390/net/qeth_core_sys.c +++ b/drivers/s390/net/qeth_core_sys.c @@ -413,12 +413,16 @@ static ssize_t qeth_dev_layer2_store(struct device *dev, if (card->options.layer2 == newdis) goto out; - else { - card->info.mac_bits = 0; - if (card->discipline) { - card->discipline->remove(card->gdev); - qeth_core_free_discipline(card); - } + if (card->info.type == QETH_CARD_TYPE_OSM) { + /* fixed layer, can't switch */ + rc = -EOPNOTSUPP; + goto out; + } + + card->info.mac_bits = 0; + if (card->discipline) { + card->discipline->remove(card->gdev); + qeth_core_free_discipline(card); } rc = qeth_core_load_discipline(card, newdis); @@ -705,10 +709,11 @@ static struct attribute *qeth_blkt_device_attrs[] = { &dev_attr_inter_jumbo.attr, NULL, }; -static struct attribute_group qeth_device_blkt_group = { +const struct attribute_group qeth_device_blkt_group = { .name = "blkt", .attrs = qeth_blkt_device_attrs, }; +EXPORT_SYMBOL_GPL(qeth_device_blkt_group); static struct attribute *qeth_device_attrs[] = { &dev_attr_state.attr, @@ -728,9 +733,10 @@ static struct attribute *qeth_device_attrs[] = { &dev_attr_switch_attrs.attr, NULL, }; -static struct attribute_group qeth_device_attr_group = { +const struct attribute_group qeth_device_attr_group = { .attrs = qeth_device_attrs, }; +EXPORT_SYMBOL_GPL(qeth_device_attr_group); const struct attribute_group *qeth_generic_attr_groups[] = { &qeth_device_attr_group, diff --git a/drivers/s390/net/qeth_l2.h b/drivers/s390/net/qeth_l2.h index 29d9fb3890ad..0d59f9a45ea9 100644 --- a/drivers/s390/net/qeth_l2.h +++ b/drivers/s390/net/qeth_l2.h @@ -8,6 +8,8 @@ #include "qeth_core.h" +extern const struct attribute_group *qeth_l2_attr_groups[]; + int qeth_l2_create_device_attributes(struct device *); void qeth_l2_remove_device_attributes(struct device *); void qeth_l2_setup_bridgeport_attrs(struct qeth_card *card); diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c index 73260a43b675..cdb03403b7a2 100644 --- a/drivers/s390/net/qeth_l2_main.c +++ b/drivers/s390/net/qeth_l2_main.c @@ -1021,14 +1021,21 @@ static int qeth_l2_stop(struct net_device *dev) return 0; } +static const struct device_type qeth_l2_devtype = { + .name = "qeth_layer2", + .groups = qeth_l2_attr_groups, +}; + static int qeth_l2_probe_device(struct ccwgroup_device *gdev) { struct qeth_card *card = dev_get_drvdata(&gdev->dev); int rc; - rc = qeth_l2_create_device_attributes(&gdev->dev); - if (rc) - return rc; + if (gdev->dev.type == &qeth_generic_devtype) { + rc = qeth_l2_create_device_attributes(&gdev->dev); + if (rc) + return rc; + } INIT_LIST_HEAD(&card->vid_list); hash_init(card->mac_htable); card->options.layer2 = 1; @@ -1040,7 +1047,8 @@ static void qeth_l2_remove_device(struct ccwgroup_device *cgdev) { struct qeth_card *card = dev_get_drvdata(&cgdev->dev); - qeth_l2_remove_device_attributes(&cgdev->dev); + if (cgdev->dev.type == &qeth_generic_devtype) + qeth_l2_remove_device_attributes(&cgdev->dev); qeth_set_allowed_threads(card, 0, 1); wait_event(card->wait_q, qeth_threads_running(card, 0xffffffff) == 0); @@ -1437,6 +1445,7 @@ static int qeth_l2_control_event(struct qeth_card *card, } struct qeth_discipline qeth_l2_discipline = { + .devtype = &qeth_l2_devtype, .start_poll = qeth_qdio_start_poll, .input_handler = (qdio_handler_t *) qeth_qdio_input_handler, .output_handler = (qdio_handler_t *) qeth_qdio_output_handler, diff --git a/drivers/s390/net/qeth_l2_sys.c b/drivers/s390/net/qeth_l2_sys.c index 692db49e3d2a..a48ed9e7e168 100644 --- a/drivers/s390/net/qeth_l2_sys.c +++ b/drivers/s390/net/qeth_l2_sys.c @@ -272,3 +272,11 @@ void qeth_l2_setup_bridgeport_attrs(struct qeth_card *card) } else qeth_bridgeport_an_set(card, 0); } + +const struct attribute_group *qeth_l2_attr_groups[] = { + &qeth_device_attr_group, + &qeth_device_blkt_group, + /* l2 specific, see l2_{create,remove}_device_attributes(): */ + &qeth_l2_bridgeport_attr_group, + NULL, +}; diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index 93fa3316ec55..ad4ea581d906 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -3453,6 +3453,7 @@ static int qeth_l3_control_event(struct qeth_card *card, } struct qeth_discipline qeth_l3_discipline = { + .devtype = &qeth_generic_devtype, .start_poll = qeth_qdio_start_poll, .input_handler = (qdio_handler_t *) qeth_qdio_input_handler, .output_handler = (qdio_handler_t *) qeth_qdio_output_handler, From 96a81eb6ad5ab062dea32c6e764518effa5891e1 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Wed, 10 May 2017 19:07:53 +0200 Subject: [PATCH 018/235] s390/qeth: avoid null pointer dereference on OSN [ Upstream commit 25e2c341e7818a394da9abc403716278ee646014 ] Access card->dev only after checking whether's its valid. Signed-off-by: Julian Wiedmann Reviewed-by: Ursula Braun Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/s390/net/qeth_l2_main.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c index cdb03403b7a2..5d010aa89852 100644 --- a/drivers/s390/net/qeth_l2_main.c +++ b/drivers/s390/net/qeth_l2_main.c @@ -1106,7 +1106,6 @@ static int qeth_l2_setup_netdev(struct qeth_card *card) case QETH_CARD_TYPE_OSN: card->dev = alloc_netdev(0, "osn%d", NET_NAME_UNKNOWN, ether_setup); - card->dev->flags |= IFF_NOARP; break; default: card->dev = alloc_etherdev(0); @@ -1119,9 +1118,12 @@ static int qeth_l2_setup_netdev(struct qeth_card *card) card->dev->watchdog_timeo = QETH_TX_TIMEOUT; card->dev->mtu = card->info.initial_mtu; card->dev->netdev_ops = &qeth_l2_netdev_ops; - card->dev->ethtool_ops = - (card->info.type != QETH_CARD_TYPE_OSN) ? - &qeth_l2_ethtool_ops : &qeth_l2_osn_ops; + if (card->info.type == QETH_CARD_TYPE_OSN) { + card->dev->ethtool_ops = &qeth_l2_osn_ops; + card->dev->flags |= IFF_NOARP; + } else { + card->dev->ethtool_ops = &qeth_l2_ethtool_ops; + } card->dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER; if (card->info.type == QETH_CARD_TYPE_OSD && !card->info.guestlan) { card->dev->hw_features = NETIF_F_SG; From c1f3f197d65018134e65f8f4df62562b2d0a7a65 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Wed, 10 May 2017 19:07:54 +0200 Subject: [PATCH 019/235] s390/qeth: add missing hash table initializations [ Upstream commit ebccc7397e4a49ff64c8f44a54895de9d32fe742 ] commit 5f78e29ceebf ("qeth: optimize IP handling in rx_mode callback") added new hash tables, but missed to initialize them. Fixes: 5f78e29ceebf ("qeth: optimize IP handling in rx_mode callback") Signed-off-by: Ursula Braun Reviewed-by: Julian Wiedmann Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/s390/net/qeth_l3_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index ad4ea581d906..171be5ec2ece 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -3162,6 +3162,8 @@ static int qeth_l3_probe_device(struct ccwgroup_device *gdev) rc = qeth_l3_create_device_attributes(&gdev->dev); if (rc) return rc; + hash_init(card->ip_htable); + hash_init(card->ip_mc_htable); card->options.layer2 = 0; card->info.hwtrap = 0; return 0; From 21e3113298f97ec95622c0359be62145ffd055c8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 11 May 2017 01:53:15 +0200 Subject: [PATCH 020/235] bpf, arm64: fix faulty emission of map access in tail calls [ Upstream commit d8b54110ee944de522ccd3531191f39986ec20f9 ] Shubham was recently asking on netdev why in arm64 JIT we don't multiply the index for accessing the tail call map by 8. That led me into testing out arm64 JIT wrt tail calls and it turned out I got a NULL pointer dereference on the tail call. The buggy access is at: prog = array->ptrs[index]; if (prog == NULL) goto out; [...] 00000060: d2800e0a mov x10, #0x70 // #112 00000064: f86a682a ldr x10, [x1,x10] 00000068: f862694b ldr x11, [x10,x2] 0000006c: b40000ab cbz x11, 0x00000080 [...] The code triggering the crash is f862694b. x1 at the time contains the address of the bpf array, x10 offsetof(struct bpf_array, ptrs). Meaning, above we load the pointer to the program at map slot 0 into x10. x10 can then be NULL if the slot is not occupied, which we later on try to access with a user given offset in x2 that is the map index. Fix this by emitting the following instead: [...] 00000060: d2800e0a mov x10, #0x70 // #112 00000064: 8b0a002a add x10, x1, x10 00000068: d37df04b lsl x11, x2, #3 0000006c: f86b694b ldr x11, [x10,x11] 00000070: b40000ab cbz x11, 0x00000084 [...] This basically adds the offset to ptrs to the base address of the bpf array we got and we later on access the map with an index * 8 offset relative to that. The tail call map itself is basically one large area with meta data at the head followed by the array of prog pointers. This makes tail calls working again, tested on Cavium ThunderX ARMv8. Fixes: ddb55992b04d ("arm64: bpf: implement bpf_tail_call() helper") Reported-by: Shubham Bansal Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- arch/arm64/net/bpf_jit_comp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 9c4b57a7b265..d8199e12fb6e 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -252,8 +252,9 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx) */ off = offsetof(struct bpf_array, ptrs); emit_a64_mov_i64(tmp, off, ctx); - emit(A64_LDR64(tmp, r2, tmp), ctx); - emit(A64_LDR64(prg, tmp, r3), ctx); + emit(A64_ADD(1, tmp, r2, tmp), ctx); + emit(A64_LSL(1, prg, r3, 3), ctx); + emit(A64_LDR64(prg, tmp, prg), ctx); emit(A64_CBZ(1, prg, jmp_offset), ctx); /* goto *(prog->bpf_func + prologue_size); */ From 5d165daafc4438b89670b2e77ef8f8df906f6308 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 May 2017 15:24:41 -0700 Subject: [PATCH 021/235] netem: fix skb_orphan_partial() [ Upstream commit f6ba8d33cfbb46df569972e64dbb5bb7e929bfd9 ] I should have known that lowering skb->truesize was dangerous :/ In case packets are not leaving the host via a standard Ethernet device, but looped back to local sockets, bad things can happen, as reported by Michael Madsen ( https://bugzilla.kernel.org/show_bug.cgi?id=195713 ) So instead of tweaking skb->truesize, lets change skb->destructor and keep a reference on the owner socket via its sk_refcnt. Fixes: f2f872f9272a ("netem: Introduce skb_orphan_partial() helper") Signed-off-by: Eric Dumazet Reported-by: Michael Madsen Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/sock.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index 470a2043b846..03dcfc581fb4 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1687,28 +1687,24 @@ EXPORT_SYMBOL(skb_set_owner_w); * delay queue. We want to allow the owner socket to send more * packets, as if they were already TX completed by a typical driver. * But we also want to keep skb->sk set because some packet schedulers - * rely on it (sch_fq for example). So we set skb->truesize to a small - * amount (1) and decrease sk_wmem_alloc accordingly. + * rely on it (sch_fq for example). */ void skb_orphan_partial(struct sk_buff *skb) { - /* If this skb is a TCP pure ACK or already went here, - * we have nothing to do. 2 is already a very small truesize. - */ - if (skb->truesize <= 2) + if (skb_is_tcp_pure_ack(skb)) return; - /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc, - * so we do not completely orphan skb, but transfert all - * accounted bytes but one, to avoid unexpected reorders. - */ if (skb->destructor == sock_wfree #ifdef CONFIG_INET || skb->destructor == tcp_wfree #endif ) { - atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc); - skb->truesize = 1; + struct sock *sk = skb->sk; + + if (atomic_inc_not_zero(&sk->sk_refcnt)) { + atomic_sub(skb->truesize, &sk->sk_wmem_alloc); + skb->destructor = sock_efree; + } } else { skb_orphan(skb); } From a5db124dc2a4819aefb71f4bcb1a825c7818734c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 May 2017 13:27:53 -0700 Subject: [PATCH 022/235] net: fix compile error in skb_orphan_partial() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 9142e9007f2d7ab58a587a1e1d921b0064a339aa ] If CONFIG_INET is not set, net/core/sock.c can not compile : net/core/sock.c: In function ‘skb_orphan_partial’: net/core/sock.c:1810:2: error: implicit declaration of function ‘skb_is_tcp_pure_ack’ [-Werror=implicit-function-declaration] if (skb_is_tcp_pure_ack(skb)) ^ Fix this by always including Fixes: f6ba8d33cfbb ("netem: fix skb_orphan_partial()") Signed-off-by: Eric Dumazet Reported-by: Paul Gortmaker Reported-by: Randy Dunlap Reported-by: Stephen Rothwell Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/sock.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index 03dcfc581fb4..1989b3dd6d17 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -138,10 +138,7 @@ #include -#ifdef CONFIG_INET #include -#endif - #include static DEFINE_MUTEX(proto_list_mutex); From 8d625242e86b2e9e05b2cfd82e87aee7b351a0a7 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 10 May 2017 17:01:27 -0700 Subject: [PATCH 023/235] tcp: avoid fragmenting peculiar skbs in SACK [ Upstream commit b451e5d24ba6687c6f0e7319c727a709a1846c06 ] This patch fixes a bug in splitting an SKB during SACK processing. Specifically if an skb contains multiple packets and is only partially sacked in the higher sequences, tcp_match_sack_to_skb() splits the skb and marks the second fragment as SACKed. The current code further attempts rounding up the first fragment to MSS boundaries. But it misses a boundary condition when the rounded-up fragment size (pkt_len) is exactly skb size. Spliting such an skb is pointless and causses a kernel warning and aborts the SACK processing. This patch universally checks such over-split before calling tcp_fragment to prevent these unnecessary warnings. Fixes: adb92db857ee ("tcp: Make SACK code to split only at mss boundaries") Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_input.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7727ffeaaf9d..22335d812727 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1177,13 +1177,14 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, */ if (pkt_len > mss) { unsigned int new_len = (pkt_len / mss) * mss; - if (!in_sack && new_len < pkt_len) { + if (!in_sack && new_len < pkt_len) new_len += mss; - if (new_len >= skb->len) - return 0; - } pkt_len = new_len; } + + if (pkt_len >= skb->len && !in_sack) + return 0; + err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC); if (err < 0) return err; From eb7f6d6989adecd24e8ed050513d4b5d3d4c9ece Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 12 May 2017 14:39:52 +0800 Subject: [PATCH 024/235] sctp: fix src address selection if using secondary addresses for ipv6 [ Upstream commit dbc2b5e9a09e9a6664679a667ff81cff6e5f2641 ] Commit 0ca50d12fe46 ("sctp: fix src address selection if using secondary addresses") has fixed a src address selection issue when using secondary addresses for ipv4. Now sctp ipv6 also has the similar issue. When using a secondary address, sctp_v6_get_dst tries to choose the saddr which has the most same bits with the daddr by sctp_v6_addr_match_len. It may make some cases not work as expected. hostA: [1] fd21:356b:459a:cf10::11 (eth1) [2] fd21:356b:459a:cf20::11 (eth2) hostB: [a] fd21:356b:459a:cf30::2 (eth1) [b] fd21:356b:459a:cf40::2 (eth2) route from hostA to hostB: fd21:356b:459a:cf30::/64 dev eth1 metric 1024 mtu 1500 The expected path should be: fd21:356b:459a:cf10::11 <-> fd21:356b:459a:cf30::2 But addr[2] matches addr[a] more bits than addr[1] does, according to sctp_v6_addr_match_len. It causes the path to be: fd21:356b:459a:cf20::11 <-> fd21:356b:459a:cf30::2 This patch is to fix it with the same way as Marcelo's fix for sctp ipv4. As no ip_dev_find for ipv6, this patch is to use ipv6_chk_addr to check if the saddr is in a dev instead. Note that for backwards compatibility, it will still do the addr_match_len check here when no optimal is found. Reported-by: Patrick Talbert Signed-off-by: Xin Long Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sctp/ipv6.c | 46 +++++++++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 6a2532dd31c4..b9cbdf57e8d2 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -240,12 +240,10 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, struct sctp_bind_addr *bp; struct ipv6_pinfo *np = inet6_sk(sk); struct sctp_sockaddr_entry *laddr; - union sctp_addr *baddr = NULL; union sctp_addr *daddr = &t->ipaddr; union sctp_addr dst_saddr; struct in6_addr *final_p, final; __u8 matchlen = 0; - __u8 bmatchlen; sctp_scope_t scope; memset(fl6, 0, sizeof(struct flowi6)); @@ -312,23 +310,37 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, */ rcu_read_lock(); list_for_each_entry_rcu(laddr, &bp->address_list, list) { - if (!laddr->valid) + struct dst_entry *bdst; + __u8 bmatchlen; + + if (!laddr->valid || + laddr->state != SCTP_ADDR_SRC || + laddr->a.sa.sa_family != AF_INET6 || + scope > sctp_scope(&laddr->a)) continue; - if ((laddr->state == SCTP_ADDR_SRC) && - (laddr->a.sa.sa_family == AF_INET6) && - (scope <= sctp_scope(&laddr->a))) { - bmatchlen = sctp_v6_addr_match_len(daddr, &laddr->a); - if (!baddr || (matchlen < bmatchlen)) { - baddr = &laddr->a; - matchlen = bmatchlen; - } - } - } - if (baddr) { - fl6->saddr = baddr->v6.sin6_addr; - fl6->fl6_sport = baddr->v6.sin6_port; + + fl6->saddr = laddr->a.v6.sin6_addr; + fl6->fl6_sport = laddr->a.v6.sin6_port; final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); - dst = ip6_dst_lookup_flow(sk, fl6, final_p); + bdst = ip6_dst_lookup_flow(sk, fl6, final_p); + + if (!IS_ERR(bdst) && + ipv6_chk_addr(dev_net(bdst->dev), + &laddr->a.v6.sin6_addr, bdst->dev, 1)) { + if (!IS_ERR_OR_NULL(dst)) + dst_release(dst); + dst = bdst; + break; + } + + bmatchlen = sctp_v6_addr_match_len(daddr, &laddr->a); + if (matchlen > bmatchlen) + continue; + + if (!IS_ERR_OR_NULL(dst)) + dst_release(dst); + dst = bdst; + matchlen = bmatchlen; } rcu_read_unlock(); From 5e7d9f0b3f729a64b99e58047f7bb0ff36acb759 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 17 May 2017 07:16:40 -0700 Subject: [PATCH 025/235] sctp: do not inherit ipv6_{mc|ac|fl}_list from parent [ Upstream commit fdcee2cbb8438702ea1b328fb6e0ac5e9a40c7f8 ] SCTP needs fixes similar to 83eaddab4378 ("ipv6/dccp: do not inherit ipv6_mc_list from parent"), otherwise bad things can happen. Signed-off-by: Eric Dumazet Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sctp/ipv6.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index b9cbdf57e8d2..0c090600f377 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -678,6 +678,9 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk, newnp = inet6_sk(newsk); memcpy(newnp, np, sizeof(struct ipv6_pinfo)); + newnp->ipv6_mc_list = NULL; + newnp->ipv6_ac_list = NULL; + newnp->ipv6_fl_list = NULL; rcu_read_lock(); opt = rcu_dereference(np->opt); From f79d3307c0356b63711d53d999158bdad667004b Mon Sep 17 00:00:00 2001 From: Douglas Caetano dos Santos Date: Fri, 12 May 2017 15:19:15 -0300 Subject: [PATCH 026/235] net/packet: fix missing net_device reference release [ Upstream commit d19b183cdc1fa3d70d6abe2a4c369e748cd7ebb8 ] When using a TX ring buffer, if an error occurs processing a control message (e.g. invalid message), the net_device reference is not released. Fixes c14ac9451c348 ("sock: enable timestamping using control messages") Signed-off-by: Douglas Caetano dos Santos Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/packet/af_packet.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index cb76ff3088e9..6a563e6e24de 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2652,13 +2652,6 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex); } - sockc.tsflags = po->sk.sk_tsflags; - if (msg->msg_controllen) { - err = sock_cmsg_send(&po->sk, msg, &sockc); - if (unlikely(err)) - goto out; - } - err = -ENXIO; if (unlikely(dev == NULL)) goto out; @@ -2666,6 +2659,13 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) if (unlikely(!(dev->flags & IFF_UP))) goto out_put; + sockc.tsflags = po->sk.sk_tsflags; + if (msg->msg_controllen) { + err = sock_cmsg_send(&po->sk, msg, &sockc); + if (unlikely(err)) + goto out_put; + } + if (po->sk.sk_socket->type == SOCK_RAW) reserve = dev->hard_header_len; size_max = po->tx_ring.frame_size From 1594973b8e0700f187d367f95810ce3244a57ce4 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Mon, 3 Apr 2017 15:11:22 +0300 Subject: [PATCH 027/235] net/mlx5e: Use the correct pause values for ethtool advertising [ Upstream commit b383b544f2666d67446b951a9a97af239dafed5d ] Query the operational pause from firmware (PFCC register) instead of always passing zeros. Fixes: 665bc53969d7 ("net/mlx5e: Use new ethtool get/set link ksettings API") Signed-off-by: Gal Pressman Cc: kernel-team@fb.com Signed-off-by: Saeed Mahameed Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 126cfeb7e0ec..6a9155f785ee 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -806,6 +806,8 @@ static int mlx5e_get_link_ksettings(struct net_device *netdev, struct mlx5e_priv *priv = netdev_priv(netdev); struct mlx5_core_dev *mdev = priv->mdev; u32 out[MLX5_ST_SZ_DW(ptys_reg)] = {0}; + u32 rx_pause = 0; + u32 tx_pause = 0; u32 eth_proto_cap; u32 eth_proto_admin; u32 eth_proto_lp; @@ -828,11 +830,13 @@ static int mlx5e_get_link_ksettings(struct net_device *netdev, an_disable_admin = MLX5_GET(ptys_reg, out, an_disable_admin); an_status = MLX5_GET(ptys_reg, out, an_status); + mlx5_query_port_pause(mdev, &rx_pause, &tx_pause); + ethtool_link_ksettings_zero_link_mode(link_ksettings, supported); ethtool_link_ksettings_zero_link_mode(link_ksettings, advertising); get_supported(eth_proto_cap, link_ksettings); - get_advertising(eth_proto_admin, 0, 0, link_ksettings); + get_advertising(eth_proto_admin, tx_pause, rx_pause, link_ksettings); get_speed_duplex(netdev, eth_proto_oper, link_ksettings); eth_proto_oper = eth_proto_oper ? eth_proto_oper : eth_proto_cap; From ac3735bf97f0693f2cae8e835c704be2f2bf0ad5 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 19 Apr 2017 14:35:15 +0300 Subject: [PATCH 028/235] net/mlx5e: Fix ethtool pause support and advertise reporting [ Upstream commit e3c19503712d6360239b19c14cded56dd63c40d7 ] Pause bit should set when RX pause is on, not TX pause. Also, setting Asym_Pause is incorrect, and should be turned off. Fixes: 665bc53969d7 ("net/mlx5e: Use new ethtool get/set link ksettings API") Signed-off-by: Gal Pressman Cc: kernel-team@fb.com Signed-off-by: Saeed Mahameed Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 6a9155f785ee..3744e2f79ecf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -751,7 +751,6 @@ static void get_supported(u32 eth_proto_cap, ptys2ethtool_supported_port(link_ksettings, eth_proto_cap); ptys2ethtool_supported_link(supported, eth_proto_cap); ethtool_link_ksettings_add_link_mode(link_ksettings, supported, Pause); - ethtool_link_ksettings_add_link_mode(link_ksettings, supported, Asym_Pause); } static void get_advertising(u32 eth_proto_cap, u8 tx_pause, @@ -761,7 +760,7 @@ static void get_advertising(u32 eth_proto_cap, u8 tx_pause, unsigned long *advertising = link_ksettings->link_modes.advertising; ptys2ethtool_adver_link(advertising, eth_proto_cap); - if (tx_pause) + if (rx_pause) ethtool_link_ksettings_add_link_mode(link_ksettings, advertising, Pause); if (tx_pause ^ rx_pause) ethtool_link_ksettings_add_link_mode(link_ksettings, advertising, Asym_Pause); From 0174b07408f28669fddb2c1eed3806115f45a758 Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Mon, 15 May 2017 17:05:47 -0400 Subject: [PATCH 029/235] tcp: eliminate negative reordering in tcp_clean_rtx_queue [ Upstream commit bafbb9c73241760023d8981191ddd30bb1c6dbac ] tcp_ack() can call tcp_fragment() which may dededuct the value tp->fackets_out when MSS changes. When prior_fackets is larger than tp->fackets_out, tcp_clean_rtx_queue() can invoke tcp_update_reordering() with negative values. This results in absurd tp->reodering values higher than sysctl_tcp_max_reordering. Note that tcp_update_reordering indeeds sets tp->reordering to min(sysctl_tcp_max_reordering, metric), but because the comparison is signed, a negative metric always wins. Fixes: c7caf8d3ed7a ("[TCP]: Fix reord detection due to snd_una covered holes") Reported-by: Rebecca Isaacs Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 22335d812727..01336aa5f973 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3233,7 +3233,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, int delta; /* Non-retransmitted hole got filled? That's reordering */ - if (reord < prior_fackets) + if (reord < prior_fackets && reord <= tp->fackets_out) tcp_update_reordering(sk, tp->fackets_out - reord, 0); delta = tcp_is_fack(tp) ? pkts_acked : From 68647616fd537b3f5dee698da49e2ce8c2c306e8 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 15 May 2017 23:19:17 -0700 Subject: [PATCH 030/235] net: Improve handling of failures on link and route dumps [ Upstream commit f6c5775ff0bfa62b072face6bf1d40f659f194b2 ] In general, rtnetlink dumps do not anticipate failure to dump a single object (e.g., link or route) on a single pass. As both route and link objects have grown via more attributes, that is no longer a given. netlink dumps can handle a failure if the dump function returns an error; specifically, netlink_dump adds the return code to the response if it is <= 0 so userspace is notified of the failure. The missing piece is the rtnetlink dump functions returning the error. Fix route and link dump functions to return the errors if no object is added to an skb (detected by skb->len != 0). IPv6 route dumps (rt6_dump_route) already return the error; this patch updates IPv4 and link dumps. Other dump functions may need to be ajusted as well. Reported-by: Jan Moskyto Matejka Signed-off-by: David Ahern Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/rtnetlink.c | 36 ++++++++++++++++++++++++------------ net/ipv4/fib_frontend.c | 15 +++++++++++---- net/ipv4/fib_trie.c | 26 ++++++++++++++------------ 3 files changed, 49 insertions(+), 28 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index b490af67c6fa..1d9160794e55 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1617,13 +1617,13 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) cb->nlh->nlmsg_seq, 0, flags, ext_filter_mask); - /* If we ran out of room on the first message, - * we're in trouble - */ - WARN_ON((err == -EMSGSIZE) && (skb->len == 0)); - if (err < 0) - goto out; + if (err < 0) { + if (likely(skb->len)) + goto out; + + goto out_err; + } nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: @@ -1631,10 +1631,12 @@ cont: } } out: + err = skb->len; +out_err: cb->args[1] = idx; cb->args[0] = h; - return skb->len; + return err; } int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len) @@ -3413,8 +3415,12 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) err = br_dev->netdev_ops->ndo_bridge_getlink( skb, portid, seq, dev, filter_mask, NLM_F_MULTI); - if (err < 0 && err != -EOPNOTSUPP) - break; + if (err < 0 && err != -EOPNOTSUPP) { + if (likely(skb->len)) + break; + + goto out_err; + } } idx++; } @@ -3425,16 +3431,22 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) seq, dev, filter_mask, NLM_F_MULTI); - if (err < 0 && err != -EOPNOTSUPP) - break; + if (err < 0 && err != -EOPNOTSUPP) { + if (likely(skb->len)) + break; + + goto out_err; + } } idx++; } } + err = skb->len; +out_err: rcu_read_unlock(); cb->args[0] = idx; - return skb->len; + return err; } static inline size_t bridge_nlmsg_size(void) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 6789e48b7085..3d92534c4450 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -758,7 +758,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) unsigned int e = 0, s_e; struct fib_table *tb; struct hlist_head *head; - int dumped = 0; + int dumped = 0, err; if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) && ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED) @@ -778,20 +778,27 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) if (dumped) memset(&cb->args[2], 0, sizeof(cb->args) - 2 * sizeof(cb->args[0])); - if (fib_table_dump(tb, skb, cb) < 0) - goto out; + err = fib_table_dump(tb, skb, cb); + if (err < 0) { + if (likely(skb->len)) + goto out; + + goto out_err; + } dumped = 1; next: e++; } } out: + err = skb->len; +out_err: rcu_read_unlock(); cb->args[1] = e; cb->args[0] = h; - return skb->len; + return err; } /* Prepare and feed intra-kernel routing request. diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index e3665bf7a7f3..ef40bb659a7a 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1932,6 +1932,8 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, /* rcu_read_lock is hold by caller */ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { + int err; + if (i < s_i) { i++; continue; @@ -1942,17 +1944,14 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, continue; } - if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWROUTE, - tb->tb_id, - fa->fa_type, - xkey, - KEYLENGTH - fa->fa_slen, - fa->fa_tos, - fa->fa_info, NLM_F_MULTI) < 0) { + err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, RTM_NEWROUTE, + tb->tb_id, fa->fa_type, + xkey, KEYLENGTH - fa->fa_slen, + fa->fa_tos, fa->fa_info, NLM_F_MULTI); + if (err < 0) { cb->args[4] = i; - return -1; + return err; } i++; } @@ -1974,10 +1973,13 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, t_key key = cb->args[3]; while ((l = leaf_walk_rcu(&tp, key)) != NULL) { - if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) { + int err; + + err = fn_trie_dump_leaf(l, tb, skb, cb); + if (err < 0) { cb->args[3] = key; cb->args[2] = count; - return -1; + return err; } ++count; From a2c845e51a820549a6df5a1e8907ee754422119e Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Tue, 16 May 2017 14:36:23 -0400 Subject: [PATCH 031/235] ipv6: Prevent overrun when parsing v6 header options [ Upstream commit 2423496af35d94a87156b063ea5cedffc10a70a1 ] The KASAN warning repoted below was discovered with a syzkaller program. The reproducer is basically: int s = socket(AF_INET6, SOCK_RAW, NEXTHDR_HOP); send(s, &one_byte_of_data, 1, MSG_MORE); send(s, &more_than_mtu_bytes_data, 2000, 0); The socket() call sets the nexthdr field of the v6 header to NEXTHDR_HOP, the first send call primes the payload with a non zero byte of data, and the second send call triggers the fragmentation path. The fragmentation code tries to parse the header options in order to figure out where to insert the fragment option. Since nexthdr points to an invalid option, the calculation of the size of the network header can made to be much larger than the linear section of the skb and data is read outside of it. This fix makes ip6_find_1stfrag return an error if it detects running out-of-bounds. [ 42.361487] ================================================================== [ 42.364412] BUG: KASAN: slab-out-of-bounds in ip6_fragment+0x11c8/0x3730 [ 42.365471] Read of size 840 at addr ffff88000969e798 by task ip6_fragment-oo/3789 [ 42.366469] [ 42.366696] CPU: 1 PID: 3789 Comm: ip6_fragment-oo Not tainted 4.11.0+ #41 [ 42.367628] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.1-1ubuntu1 04/01/2014 [ 42.368824] Call Trace: [ 42.369183] dump_stack+0xb3/0x10b [ 42.369664] print_address_description+0x73/0x290 [ 42.370325] kasan_report+0x252/0x370 [ 42.370839] ? ip6_fragment+0x11c8/0x3730 [ 42.371396] check_memory_region+0x13c/0x1a0 [ 42.371978] memcpy+0x23/0x50 [ 42.372395] ip6_fragment+0x11c8/0x3730 [ 42.372920] ? nf_ct_expect_unregister_notifier+0x110/0x110 [ 42.373681] ? ip6_copy_metadata+0x7f0/0x7f0 [ 42.374263] ? ip6_forward+0x2e30/0x2e30 [ 42.374803] ip6_finish_output+0x584/0x990 [ 42.375350] ip6_output+0x1b7/0x690 [ 42.375836] ? ip6_finish_output+0x990/0x990 [ 42.376411] ? ip6_fragment+0x3730/0x3730 [ 42.376968] ip6_local_out+0x95/0x160 [ 42.377471] ip6_send_skb+0xa1/0x330 [ 42.377969] ip6_push_pending_frames+0xb3/0xe0 [ 42.378589] rawv6_sendmsg+0x2051/0x2db0 [ 42.379129] ? rawv6_bind+0x8b0/0x8b0 [ 42.379633] ? _copy_from_user+0x84/0xe0 [ 42.380193] ? debug_check_no_locks_freed+0x290/0x290 [ 42.380878] ? ___sys_sendmsg+0x162/0x930 [ 42.381427] ? rcu_read_lock_sched_held+0xa3/0x120 [ 42.382074] ? sock_has_perm+0x1f6/0x290 [ 42.382614] ? ___sys_sendmsg+0x167/0x930 [ 42.383173] ? lock_downgrade+0x660/0x660 [ 42.383727] inet_sendmsg+0x123/0x500 [ 42.384226] ? inet_sendmsg+0x123/0x500 [ 42.384748] ? inet_recvmsg+0x540/0x540 [ 42.385263] sock_sendmsg+0xca/0x110 [ 42.385758] SYSC_sendto+0x217/0x380 [ 42.386249] ? SYSC_connect+0x310/0x310 [ 42.386783] ? __might_fault+0x110/0x1d0 [ 42.387324] ? lock_downgrade+0x660/0x660 [ 42.387880] ? __fget_light+0xa1/0x1f0 [ 42.388403] ? __fdget+0x18/0x20 [ 42.388851] ? sock_common_setsockopt+0x95/0xd0 [ 42.389472] ? SyS_setsockopt+0x17f/0x260 [ 42.390021] ? entry_SYSCALL_64_fastpath+0x5/0xbe [ 42.390650] SyS_sendto+0x40/0x50 [ 42.391103] entry_SYSCALL_64_fastpath+0x1f/0xbe [ 42.391731] RIP: 0033:0x7fbbb711e383 [ 42.392217] RSP: 002b:00007ffff4d34f28 EFLAGS: 00000246 ORIG_RAX: 000000000000002c [ 42.393235] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fbbb711e383 [ 42.394195] RDX: 0000000000001000 RSI: 00007ffff4d34f60 RDI: 0000000000000003 [ 42.395145] RBP: 0000000000000046 R08: 00007ffff4d34f40 R09: 0000000000000018 [ 42.396056] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000400aad [ 42.396598] R13: 0000000000000066 R14: 00007ffff4d34ee0 R15: 00007fbbb717af00 [ 42.397257] [ 42.397411] Allocated by task 3789: [ 42.397702] save_stack_trace+0x16/0x20 [ 42.398005] save_stack+0x46/0xd0 [ 42.398267] kasan_kmalloc+0xad/0xe0 [ 42.398548] kasan_slab_alloc+0x12/0x20 [ 42.398848] __kmalloc_node_track_caller+0xcb/0x380 [ 42.399224] __kmalloc_reserve.isra.32+0x41/0xe0 [ 42.399654] __alloc_skb+0xf8/0x580 [ 42.400003] sock_wmalloc+0xab/0xf0 [ 42.400346] __ip6_append_data.isra.41+0x2472/0x33d0 [ 42.400813] ip6_append_data+0x1a8/0x2f0 [ 42.401122] rawv6_sendmsg+0x11ee/0x2db0 [ 42.401505] inet_sendmsg+0x123/0x500 [ 42.401860] sock_sendmsg+0xca/0x110 [ 42.402209] ___sys_sendmsg+0x7cb/0x930 [ 42.402582] __sys_sendmsg+0xd9/0x190 [ 42.402941] SyS_sendmsg+0x2d/0x50 [ 42.403273] entry_SYSCALL_64_fastpath+0x1f/0xbe [ 42.403718] [ 42.403871] Freed by task 1794: [ 42.404146] save_stack_trace+0x16/0x20 [ 42.404515] save_stack+0x46/0xd0 [ 42.404827] kasan_slab_free+0x72/0xc0 [ 42.405167] kfree+0xe8/0x2b0 [ 42.405462] skb_free_head+0x74/0xb0 [ 42.405806] skb_release_data+0x30e/0x3a0 [ 42.406198] skb_release_all+0x4a/0x60 [ 42.406563] consume_skb+0x113/0x2e0 [ 42.406910] skb_free_datagram+0x1a/0xe0 [ 42.407288] netlink_recvmsg+0x60d/0xe40 [ 42.407667] sock_recvmsg+0xd7/0x110 [ 42.408022] ___sys_recvmsg+0x25c/0x580 [ 42.408395] __sys_recvmsg+0xd6/0x190 [ 42.408753] SyS_recvmsg+0x2d/0x50 [ 42.409086] entry_SYSCALL_64_fastpath+0x1f/0xbe [ 42.409513] [ 42.409665] The buggy address belongs to the object at ffff88000969e780 [ 42.409665] which belongs to the cache kmalloc-512 of size 512 [ 42.410846] The buggy address is located 24 bytes inside of [ 42.410846] 512-byte region [ffff88000969e780, ffff88000969e980) [ 42.411941] The buggy address belongs to the page: [ 42.412405] page:ffffea000025a780 count:1 mapcount:0 mapping: (null) index:0x0 compound_mapcount: 0 [ 42.413298] flags: 0x100000000008100(slab|head) [ 42.413729] raw: 0100000000008100 0000000000000000 0000000000000000 00000001800c000c [ 42.414387] raw: ffffea00002a9500 0000000900000007 ffff88000c401280 0000000000000000 [ 42.415074] page dumped because: kasan: bad access detected [ 42.415604] [ 42.415757] Memory state around the buggy address: [ 42.416222] ffff88000969e880: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 42.416904] ffff88000969e900: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 42.417591] >ffff88000969e980: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 42.418273] ^ [ 42.418588] ffff88000969ea00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 42.419273] ffff88000969ea80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 42.419882] ================================================================== Reported-by: Andrey Konovalov Signed-off-by: Craig Gallek Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/ip6_offload.c | 2 ++ net/ipv6/ip6_output.c | 4 ++++ net/ipv6/output_core.c | 14 ++++++++------ net/ipv6/udp_offload.c | 2 ++ 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 33b04ec2744a..9881a87696bc 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -117,6 +117,8 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, if (udpfrag) { unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr); + if (unfrag_ip6hlen < 0) + return ERR_PTR(unfrag_ip6hlen); fptr = (struct frag_hdr *)((u8 *)ipv6h + unfrag_ip6hlen); fptr->frag_off = htons(offset); if (skb->next) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index e27b8fdba5d2..6e0bb77fdc9f 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -587,6 +587,10 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, u8 *prevhdr, nexthdr = 0; hlen = ip6_find_1stfragopt(skb, &prevhdr); + if (hlen < 0) { + err = hlen; + goto fail; + } nexthdr = *prevhdr; mtu = ip6_skb_dst_mtu(skb); diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index cd4252346a32..e9065b8d3af8 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -79,14 +79,13 @@ EXPORT_SYMBOL(ipv6_select_ident); int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) { u16 offset = sizeof(struct ipv6hdr); - struct ipv6_opt_hdr *exthdr = - (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1); unsigned int packet_len = skb_tail_pointer(skb) - skb_network_header(skb); int found_rhdr = 0; *nexthdr = &ipv6_hdr(skb)->nexthdr; - while (offset + 1 <= packet_len) { + while (offset <= packet_len) { + struct ipv6_opt_hdr *exthdr; switch (**nexthdr) { @@ -107,13 +106,16 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) return offset; } - offset += ipv6_optlen(exthdr); - *nexthdr = &exthdr->nexthdr; + if (offset + sizeof(struct ipv6_opt_hdr) > packet_len) + return -EINVAL; + exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) + offset); + offset += ipv6_optlen(exthdr); + *nexthdr = &exthdr->nexthdr; } - return offset; + return -EINVAL; } EXPORT_SYMBOL(ip6_find_1stfragopt); diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index ac858c480f2f..b348cff47395 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -91,6 +91,8 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, * bytes to insert fragment header. */ unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr); + if (unfrag_ip6hlen < 0) + return ERR_PTR(unfrag_ip6hlen); nexthdr = *prevhdr; *prevhdr = NEXTHDR_FRAGMENT; unfrag_len = (skb_network_header(skb) - skb_mac_header(skb)) + From 3fa202ef74c83f9c44057b370dccc0981e55d62b Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 17 May 2017 22:54:11 -0400 Subject: [PATCH 032/235] ipv6: Check ip6_find_1stfragopt() return value properly. [ Upstream commit 7dd7eb9513bd02184d45f000ab69d78cb1fa1531 ] Do not use unsigned variables to see if it returns a negative error or not. Fixes: 2423496af35d ("ipv6: Prevent overrun when parsing v6 header options") Reported-by: Julia Lawall Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/ip6_offload.c | 9 ++++----- net/ipv6/ip6_output.c | 7 +++---- net/ipv6/udp_offload.c | 8 +++++--- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 9881a87696bc..013086b248e2 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -63,7 +63,6 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, const struct net_offload *ops; int proto; struct frag_hdr *fptr; - unsigned int unfrag_ip6hlen; unsigned int payload_len; u8 *prevhdr; int offset = 0; @@ -116,10 +115,10 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, skb->network_header = (u8 *)ipv6h - skb->head; if (udpfrag) { - unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr); - if (unfrag_ip6hlen < 0) - return ERR_PTR(unfrag_ip6hlen); - fptr = (struct frag_hdr *)((u8 *)ipv6h + unfrag_ip6hlen); + int err = ip6_find_1stfragopt(skb, &prevhdr); + if (err < 0) + return ERR_PTR(err); + fptr = (struct frag_hdr *)((u8 *)ipv6h + err); fptr->frag_off = htons(offset); if (skb->next) fptr->frag_off |= htons(IP6_MF); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 6e0bb77fdc9f..b1e8ee569f65 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -586,11 +586,10 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int ptr, offset = 0, err = 0; u8 *prevhdr, nexthdr = 0; - hlen = ip6_find_1stfragopt(skb, &prevhdr); - if (hlen < 0) { - err = hlen; + err = ip6_find_1stfragopt(skb, &prevhdr); + if (err < 0) goto fail; - } + hlen = err; nexthdr = *prevhdr; mtu = ip6_skb_dst_mtu(skb); diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index b348cff47395..a2267f80febb 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -29,6 +29,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, u8 frag_hdr_sz = sizeof(struct frag_hdr); __wsum csum; int tnl_hlen; + int err; mss = skb_shinfo(skb)->gso_size; if (unlikely(skb->len <= mss)) @@ -90,9 +91,10 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, /* Find the unfragmentable header and shift it left by frag_hdr_sz * bytes to insert fragment header. */ - unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr); - if (unfrag_ip6hlen < 0) - return ERR_PTR(unfrag_ip6hlen); + err = ip6_find_1stfragopt(skb, &prevhdr); + if (err < 0) + return ERR_PTR(err); + unfrag_ip6hlen = err; nexthdr = *prevhdr; *prevhdr = NEXTHDR_FRAGMENT; unfrag_len = (skb_network_header(skb) - skb_mac_header(skb)) + From 2ea4221eb4aec92db6c9a1b579248f52374f8725 Mon Sep 17 00:00:00 2001 From: Tobias Jungel Date: Wed, 17 May 2017 09:29:12 +0200 Subject: [PATCH 033/235] bridge: netlink: check vlan_default_pvid range [ Upstream commit a285860211bf257b0e6d522dac6006794be348af ] Currently it is allowed to set the default pvid of a bridge to a value above VLAN_VID_MASK (0xfff). This patch adds a check to br_validate and returns -EINVAL in case the pvid is out of bounds. Reproduce by calling: [root@test ~]# ip l a type bridge [root@test ~]# ip l a type dummy [root@test ~]# ip l s bridge0 type bridge vlan_filtering 1 [root@test ~]# ip l s bridge0 type bridge vlan_default_pvid 9999 [root@test ~]# ip l s dummy0 master bridge0 [root@test ~]# bridge vlan port vlan ids bridge0 9999 PVID Egress Untagged dummy0 9999 PVID Egress Untagged Fixes: 0f963b7592ef ("bridge: netlink: add support for default_pvid") Acked-by: Nikolay Aleksandrov Signed-off-by: Tobias Jungel Acked-by: Sabrina Dubroca Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/bridge/br_netlink.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 04741064a173..7625ec8458de 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -776,6 +776,13 @@ static int br_validate(struct nlattr *tb[], struct nlattr *data[]) return -EPROTONOSUPPORT; } } + + if (data[IFLA_BR_VLAN_DEFAULT_PVID]) { + __u16 defpvid = nla_get_u16(data[IFLA_BR_VLAN_DEFAULT_PVID]); + + if (defpvid >= VLAN_VID_MASK) + return -EINVAL; + } #endif return 0; From 0d10ebbc835f22cc27750c78f5dfe02651bec31f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Wed, 17 May 2017 16:31:41 +0200 Subject: [PATCH 034/235] qmi_wwan: add another Lenovo EM74xx device ID MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 486181bcb3248e2f1977f4e69387a898234a4e1e ] In their infinite wisdom, and never ending quest for end user frustration, Lenovo has decided to use a new USB device ID for the wwan modules in their 2017 laptops. The actual hardware is still the Sierra Wireless EM7455 or EM7430, depending on region. Signed-off-by: Bjørn Mork Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/usb/qmi_wwan.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 34d997ca1b27..2f260c63c383 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -897,6 +897,8 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x1199, 0x9071, 10)}, /* Sierra Wireless MC74xx */ {QMI_FIXED_INTF(0x1199, 0x9079, 8)}, /* Sierra Wireless EM74xx */ {QMI_FIXED_INTF(0x1199, 0x9079, 10)}, /* Sierra Wireless EM74xx */ + {QMI_FIXED_INTF(0x1199, 0x907b, 8)}, /* Sierra Wireless EM74xx */ + {QMI_FIXED_INTF(0x1199, 0x907b, 10)}, /* Sierra Wireless EM74xx */ {QMI_FIXED_INTF(0x1bbb, 0x011e, 4)}, /* Telekom Speedstick LTE II (Alcatel One Touch L100V LTE) */ {QMI_FIXED_INTF(0x1bbb, 0x0203, 2)}, /* Alcatel L800MA */ {QMI_FIXED_INTF(0x2357, 0x0201, 4)}, /* TP-LINK HSUPA Modem MA180 */ From ee72e7e5c2b472ff9321c800554d1538fc09b8b4 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 19 May 2017 22:20:29 +0800 Subject: [PATCH 035/235] bridge: start hello_timer when enabling KERNEL_STP in br_stp_start [ Upstream commit 6d18c732b95c0a9d35e9f978b4438bba15412284 ] Since commit 76b91c32dd86 ("bridge: stp: when using userspace stp stop kernel hello and hold timers"), bridge would not start hello_timer if stp_enabled is not KERNEL_STP when br_dev_open. The problem is even if users set stp_enabled with KERNEL_STP later, the timer will still not be started. It causes that KERNEL_STP can not really work. Users have to re-ifup the bridge to avoid this. This patch is to fix it by starting br->hello_timer when enabling KERNEL_STP in br_stp_start. As an improvement, it's also to start hello_timer again only when br->stp_enabled is KERNEL_STP in br_hello_timer_expired, there is no reason to start the timer again when it's NO_STP. Fixes: 76b91c32dd86 ("bridge: stp: when using userspace stp stop kernel hello and hold timers") Reported-by: Haidong Li Signed-off-by: Xin Long Acked-by: Nikolay Aleksandrov Reviewed-by: Ivan Vecera Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/bridge/br_stp_if.c | 1 + net/bridge/br_stp_timer.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index d8ad73b38de2..5a782f543aff 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -185,6 +185,7 @@ static void br_stp_start(struct net_bridge *br) br_debug(br, "using kernel STP\n"); /* To start timers on any ports left in blocking */ + mod_timer(&br->hello_timer, jiffies + br->hello_time); br_port_state_selection(br); } diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c index da058b85aa22..15826fd52af5 100644 --- a/net/bridge/br_stp_timer.c +++ b/net/bridge/br_stp_timer.c @@ -40,7 +40,7 @@ static void br_hello_timer_expired(unsigned long arg) if (br->dev->flags & IFF_UP) { br_config_bpdu_generation(br); - if (br->stp_enabled != BR_USER_STP) + if (br->stp_enabled == BR_KERNEL_STP) mod_timer(&br->hello_timer, round_jiffies(jiffies + br->hello_time)); } From 304b41014acbdc5fa5126c86bac31dc41a245f9f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 May 2017 14:17:48 -0700 Subject: [PATCH 036/235] ipv6: fix out of bound writes in __ip6_append_data() [ Upstream commit 232cd35d0804cc241eb887bb8d4d9b3b9881c64a ] Andrey Konovalov and idaifish@gmail.com reported crashes caused by one skb shared_info being overwritten from __ip6_append_data() Andrey program lead to following state : copy -4200 datalen 2000 fraglen 2040 maxfraglen 2040 alloclen 2048 transhdrlen 0 offset 0 fraggap 6200 The skb_copy_and_csum_bits(skb_prev, maxfraglen, data + transhdrlen, fraggap, 0); is overwriting skb->head and skb_shared_info Since we apparently detect this rare condition too late, move the code earlier to even avoid allocating skb and risking crashes. Once again, many thanks to Andrey and syzkaller team. Signed-off-by: Eric Dumazet Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Reported-by: Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/ip6_output.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index b1e8ee569f65..1ac3cea49171 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1447,6 +1447,11 @@ alloc_new_skb: */ alloclen += sizeof(struct frag_hdr); + copy = datalen - transhdrlen - fraggap; + if (copy < 0) { + err = -EINVAL; + goto error; + } if (transhdrlen) { skb = sock_alloc_send_skb(sk, alloclen + hh_len, @@ -1496,13 +1501,9 @@ alloc_new_skb: data += fraggap; pskb_trim_unique(skb_prev, maxfraglen); } - copy = datalen - transhdrlen - fraggap; - - if (copy < 0) { - err = -EINVAL; - kfree_skb(skb); - goto error; - } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) { + if (copy > 0 && + getfrag(from, data + transhdrlen, offset, + copy, fraggap, skb) < 0) { err = -EFAULT; kfree_skb(skb); goto error; From 1730a2b9e5b5df516c6c6e90b03ee517d06524ad Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Fri, 19 May 2017 19:43:45 -0400 Subject: [PATCH 037/235] bonding: fix accounting of active ports in 3ad [ Upstream commit 751da2a69b7cc82d83dc310ed7606225f2d6e014 ] As of 7bb11dc9f59d and 0622cab0341c, bond slaves in a 3ad bond are not removed from the aggregator when they are down, and the active slave count is NOT equal to number of ports in the aggregator, but rather the number of ports in the aggregator that are still enabled. The sysfs spew for bonding_show_ad_num_ports() has a comment that says "Show number of active 802.3ad ports.", but it's currently showing total number of ports, both active and inactive. Remedy it by using the same logic introduced in 0622cab0341c in __bond_3ad_get_active_agg_info(), so sysfs, procfs and netlink all report the number of active ports. Note that this means that IFLA_BOND_AD_INFO_NUM_PORTS really means NUM_ACTIVE_PORTS instead of NUM_PORTS, and thus perhaps should be renamed for clarity. Lightly tested on a dual i40e lacp bond, simulating link downs with an ip link set dev down, was able to produce the state where I could see both in the same aggregator, but a number of ports count of 1. MII Status: up Active Aggregator Info: Aggregator ID: 1 Number of ports: 2 <--- Slave Interface: ens10 MII Status: up <--- Aggregator ID: 1 Slave Interface: ens11 MII Status: up Aggregator ID: 1 MII Status: up Active Aggregator Info: Aggregator ID: 1 Number of ports: 1 <--- Slave Interface: ens10 MII Status: down <--- Aggregator ID: 1 Slave Interface: ens11 MII Status: up Aggregator ID: 1 CC: Jay Vosburgh CC: Veaceslav Falico CC: Andy Gospodarek CC: netdev@vger.kernel.org Signed-off-by: Jarod Wilson Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/bonding/bond_3ad.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index edc70ffad660..6dcc42d79cab 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -2573,7 +2573,7 @@ int __bond_3ad_get_active_agg_info(struct bonding *bond, return -1; ad_info->aggregator_id = aggregator->aggregator_identifier; - ad_info->ports = aggregator->num_of_ports; + ad_info->ports = __agg_active_ports(aggregator); ad_info->actor_key = aggregator->actor_oper_aggregator_key; ad_info->partner_key = aggregator->partner_oper_aggregator_key; ether_addr_copy(ad_info->partner_system, From 4fb5fd27dec09083c5badb5031972a54bc464937 Mon Sep 17 00:00:00 2001 From: Mohamad Haj Yahia Date: Thu, 23 Feb 2017 11:19:36 +0200 Subject: [PATCH 038/235] net/mlx5: Avoid using pending command interface slots [ Upstream commit 73dd3a4839c1d27c36d4dcc92e1ff44225ecbeb7 ] Currently when firmware command gets stuck or it takes long time to complete, the driver command will get timeout and the command slot is freed and can be used for new commands, and if the firmware receive new command on the old busy slot its behavior is unexpected and this could be harmful. To fix this when the driver command gets timeout we return failure, but we don't free the command slot and we wait for the firmware to explicitly respond to that command. Once all the entries are busy we will stop processing new firmware commands. Fixes: 9cba4ebcf374 ('net/mlx5: Fix potential deadlock in command mode change') Signed-off-by: Mohamad Haj Yahia Cc: kernel-team@fb.com Signed-off-by: Saeed Mahameed Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 41 ++++++++++++++++--- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 2 +- .../net/ethernet/mellanox/mlx5/core/health.c | 2 +- include/linux/mlx5/driver.h | 7 +++- 4 files changed, 44 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 3f51a44bde6b..cb45390c7623 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -767,7 +767,7 @@ static void cb_timeout_handler(struct work_struct *work) mlx5_core_warn(dev, "%s(0x%x) timeout. Will cause a leak of a command resource\n", mlx5_command_str(msg_to_opcode(ent->in)), msg_to_opcode(ent->in)); - mlx5_cmd_comp_handler(dev, 1UL << ent->idx); + mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true); } static void cmd_work_handler(struct work_struct *work) @@ -797,6 +797,7 @@ static void cmd_work_handler(struct work_struct *work) } cmd->ent_arr[ent->idx] = ent; + set_bit(MLX5_CMD_ENT_STATE_PENDING_COMP, &ent->state); lay = get_inst(cmd, ent->idx); ent->lay = lay; memset(lay, 0, sizeof(*lay)); @@ -818,6 +819,20 @@ static void cmd_work_handler(struct work_struct *work) if (ent->callback) schedule_delayed_work(&ent->cb_timeout_work, cb_timeout); + /* Skip sending command to fw if internal error */ + if (pci_channel_offline(dev->pdev) || + dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + u8 status = 0; + u32 drv_synd; + + ent->ret = mlx5_internal_err_ret_value(dev, msg_to_opcode(ent->in), &drv_synd, &status); + MLX5_SET(mbox_out, ent->out, status, status); + MLX5_SET(mbox_out, ent->out, syndrome, drv_synd); + + mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true); + return; + } + /* ring doorbell after the descriptor is valid */ mlx5_core_dbg(dev, "writing 0x%x to command doorbell\n", 1 << ent->idx); wmb(); @@ -828,7 +843,7 @@ static void cmd_work_handler(struct work_struct *work) poll_timeout(ent); /* make sure we read the descriptor after ownership is SW */ rmb(); - mlx5_cmd_comp_handler(dev, 1UL << ent->idx); + mlx5_cmd_comp_handler(dev, 1UL << ent->idx, (ent->ret == -ETIMEDOUT)); } } @@ -872,7 +887,7 @@ static int wait_func(struct mlx5_core_dev *dev, struct mlx5_cmd_work_ent *ent) wait_for_completion(&ent->done); } else if (!wait_for_completion_timeout(&ent->done, timeout)) { ent->ret = -ETIMEDOUT; - mlx5_cmd_comp_handler(dev, 1UL << ent->idx); + mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true); } err = ent->ret; @@ -1369,7 +1384,7 @@ static void free_msg(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *msg) } } -void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec) +void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool forced) { struct mlx5_cmd *cmd = &dev->cmd; struct mlx5_cmd_work_ent *ent; @@ -1389,6 +1404,19 @@ void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec) struct semaphore *sem; ent = cmd->ent_arr[i]; + + /* if we already completed the command, ignore it */ + if (!test_and_clear_bit(MLX5_CMD_ENT_STATE_PENDING_COMP, + &ent->state)) { + /* only real completion can free the cmd slot */ + if (!forced) { + mlx5_core_err(dev, "Command completion arrived after timeout (entry idx = %d).\n", + ent->idx); + free_ent(cmd, ent->idx); + } + continue; + } + if (ent->callback) cancel_delayed_work(&ent->cb_timeout_work); if (ent->page_queue) @@ -1411,7 +1439,10 @@ void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec) mlx5_core_dbg(dev, "command completed. ret 0x%x, delivery status %s(0x%x)\n", ent->ret, deliv_status_to_str(ent->status), ent->status); } - free_ent(cmd, ent->idx); + + /* only real completion will free the entry slot */ + if (!forced) + free_ent(cmd, ent->idx); if (ent->callback) { ds = ent->ts2 - ent->ts1; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index aaca09002ca6..f86e9ff995be 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -234,7 +234,7 @@ static int mlx5_eq_int(struct mlx5_core_dev *dev, struct mlx5_eq *eq) break; case MLX5_EVENT_TYPE_CMD: - mlx5_cmd_comp_handler(dev, be32_to_cpu(eqe->data.cmd.vector)); + mlx5_cmd_comp_handler(dev, be32_to_cpu(eqe->data.cmd.vector), false); break; case MLX5_EVENT_TYPE_PORT_CHANGE: diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index 5bcf93422ee0..2115c8aacc5b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -90,7 +90,7 @@ static void trigger_cmd_completions(struct mlx5_core_dev *dev) spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags); mlx5_core_dbg(dev, "vector 0x%llx\n", vector); - mlx5_cmd_comp_handler(dev, vector); + mlx5_cmd_comp_handler(dev, vector, true); return; no_trig: diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index ecc451d89ccd..e1a903a5bb3e 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -640,7 +640,12 @@ enum { typedef void (*mlx5_cmd_cbk_t)(int status, void *context); +enum { + MLX5_CMD_ENT_STATE_PENDING_COMP, +}; + struct mlx5_cmd_work_ent { + unsigned long state; struct mlx5_cmd_msg *in; struct mlx5_cmd_msg *out; void *uout; @@ -838,7 +843,7 @@ void mlx5_eq_pagefault(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe); #endif void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type); struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn); -void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec); +void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool forced); void mlx5_cq_event(struct mlx5_core_dev *dev, u32 cqn, int event_type); int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx, int nent, u64 mask, const char *name, struct mlx5_uar *uar); From cc6773b51bf318fb775106fec51d66906e345fac Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Tue, 23 May 2017 17:49:13 +0200 Subject: [PATCH 039/235] net: phy: marvell: Limit errata to 88m1101 [ Upstream commit f2899788353c13891412b273fdff5f02d49aa40f ] The 88m1101 has an errata when configuring autoneg. However, it was being applied to many other Marvell PHYs as well. Limit its scope to just the 88m1101. Fixes: 76884679c644 ("phylib: Add support for Marvell 88e1111S and 88e1145") Reported-by: Daniel Walker Signed-off-by: Andrew Lunn Acked-by: Harini Katakam Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/phy/marvell.c | 66 ++++++++++++++++++++++----------------- 1 file changed, 37 insertions(+), 29 deletions(-) diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index c2dcf02df202..d6a541bde331 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -240,34 +240,6 @@ static int marvell_config_aneg(struct phy_device *phydev) { int err; - /* The Marvell PHY has an errata which requires - * that certain registers get written in order - * to restart autonegotiation */ - err = phy_write(phydev, MII_BMCR, BMCR_RESET); - - if (err < 0) - return err; - - err = phy_write(phydev, 0x1d, 0x1f); - if (err < 0) - return err; - - err = phy_write(phydev, 0x1e, 0x200c); - if (err < 0) - return err; - - err = phy_write(phydev, 0x1d, 0x5); - if (err < 0) - return err; - - err = phy_write(phydev, 0x1e, 0); - if (err < 0) - return err; - - err = phy_write(phydev, 0x1e, 0x100); - if (err < 0) - return err; - err = marvell_set_polarity(phydev, phydev->mdix); if (err < 0) return err; @@ -301,6 +273,42 @@ static int marvell_config_aneg(struct phy_device *phydev) return 0; } +static int m88e1101_config_aneg(struct phy_device *phydev) +{ + int err; + + /* This Marvell PHY has an errata which requires + * that certain registers get written in order + * to restart autonegotiation + */ + err = phy_write(phydev, MII_BMCR, BMCR_RESET); + + if (err < 0) + return err; + + err = phy_write(phydev, 0x1d, 0x1f); + if (err < 0) + return err; + + err = phy_write(phydev, 0x1e, 0x200c); + if (err < 0) + return err; + + err = phy_write(phydev, 0x1d, 0x5); + if (err < 0) + return err; + + err = phy_write(phydev, 0x1e, 0); + if (err < 0) + return err; + + err = phy_write(phydev, 0x1e, 0x100); + if (err < 0) + return err; + + return marvell_config_aneg(phydev); +} + static int m88e1111_config_aneg(struct phy_device *phydev) { int err; @@ -1491,7 +1499,7 @@ static struct phy_driver marvell_drivers[] = { .probe = marvell_probe, .flags = PHY_HAS_INTERRUPT, .config_init = &marvell_config_init, - .config_aneg = &marvell_config_aneg, + .config_aneg = &m88e1101_config_aneg, .read_status = &genphy_read_status, .ack_interrupt = &marvell_ack_interrupt, .config_intr = &marvell_config_intr, From 5f595d5297961c0c641e76f78bb39d1618639d37 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Tue, 23 May 2017 13:38:41 -0400 Subject: [PATCH 040/235] vlan: Fix tcp checksum offloads in Q-in-Q vlans [ Upstream commit 35d2f80b07bbe03fb358afb0bdeff7437a7d67ff ] It appears that TCP checksum offloading has been broken for Q-in-Q vlans. The behavior was execerbated by the series commit afb0bc972b52 ("Merge branch 'stacked_vlan_tso'") that that enabled accleleration features on stacked vlans. However, event without that series, it is possible to trigger this issue. It just requires a lot more specialized configuration. The root cause is the interaction between how netdev_intersect_features() works, the features actually set on the vlan devices and HW having the ability to run checksum with longer headers. The issue starts when netdev_interesect_features() replaces NETIF_F_HW_CSUM with a combination of NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM, if the HW advertises IP|IPV6 specific checksums. This happens for tagged and multi-tagged packets. However, HW that enables IP|IPV6 checksum offloading doesn't gurantee that packets with arbitrarily long headers can be checksummed. This patch disables IP|IPV6 checksums on the packet for multi-tagged packets. CC: Toshiaki Makita CC: Michal Kubecek Signed-off-by: Vladislav Yasevich Acked-by: Toshiaki Makita Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/linux/if_vlan.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 3319d97d789d..8feecd5345e7 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -630,14 +630,16 @@ static inline bool skb_vlan_tagged_multi(const struct sk_buff *skb) static inline netdev_features_t vlan_features_check(const struct sk_buff *skb, netdev_features_t features) { - if (skb_vlan_tagged_multi(skb)) - features = netdev_intersect_features(features, - NETIF_F_SG | - NETIF_F_HIGHDMA | - NETIF_F_FRAGLIST | - NETIF_F_HW_CSUM | - NETIF_F_HW_VLAN_CTAG_TX | - NETIF_F_HW_VLAN_STAG_TX); + if (skb_vlan_tagged_multi(skb)) { + /* In the case of multi-tagged packets, use a direct mask + * instead of using netdev_interesect_features(), to make + * sure that only devices supporting NETIF_F_HW_CSUM will + * have checksum offloading support. + */ + features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | + NETIF_F_FRAGLIST | NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX; + } return features; } From 9c6cfd5811bd53c4ef1800b768397d38c4790fbf Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Tue, 23 May 2017 13:38:42 -0400 Subject: [PATCH 041/235] be2net: Fix offload features for Q-in-Q packets [ Upstream commit cc6e9de62a7f84c9293a2ea41bc412b55bb46e85 ] At least some of the be2net cards do not seem to be capabled of performing checksum offload computions on Q-in-Q packets. In these case, the recevied checksum on the remote is invalid and TCP syn packets are dropped. This patch adds a call to check disbled acceleration features on Q-in-Q tagged traffic. CC: Sathya Perla CC: Ajit Khaparde CC: Sriharsha Basavapatna CC: Somnath Kotur Signed-off-by: Vladislav Yasevich Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/emulex/benet/be_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 93aa2939142a..9711ca4510fa 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -5144,9 +5144,11 @@ static netdev_features_t be_features_check(struct sk_buff *skb, struct be_adapter *adapter = netdev_priv(dev); u8 l4_hdr = 0; - /* The code below restricts offload features for some tunneled packets. + /* The code below restricts offload features for some tunneled and + * Q-in-Q packets. * Offload features for normal (non tunnel) packets are unchanged. */ + features = vlan_features_check(skb, features); if (!skb->encapsulation || !(adapter->flags & BE_FLAGS_VXLAN_OFFLOADS)) return features; From 9e056584770b021eb82fe378c65c12e5f71f8e68 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Tue, 23 May 2017 13:38:43 -0400 Subject: [PATCH 042/235] virtio-net: enable TSO/checksum offloads for Q-in-Q vlans [ Upstream commit 2836b4f224d4fd7d1a2b23c3eecaf0f0ae199a74 ] Since virtio does not provide it's own ndo_features_check handler, TSO, and now checksum offload, are disabled for stacked vlans. Re-enable the support and let the host take care of it. This restores/improves Guest-to-Guest performance over Q-in-Q vlans. Acked-by: Jason Wang Acked-by: Michael S. Tsirkin Signed-off-by: Vladislav Yasevich Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/virtio_net.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 51fc0c33a62f..7ca99899972e 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1456,6 +1456,7 @@ static const struct net_device_ops virtnet_netdev = { #ifdef CONFIG_NET_RX_BUSY_POLL .ndo_busy_poll = virtnet_busy_poll, #endif + .ndo_features_check = passthru_features_check, }; static void virtnet_config_changed_work(struct work_struct *work) From 4b81271ed1c3f60b37196b381fd4bca47a20cf0e Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Wed, 24 May 2017 09:59:31 -0700 Subject: [PATCH 043/235] tcp: avoid fastopen API to be used on AF_UNSPEC [ Upstream commit ba615f675281d76fd19aa03558777f81fb6b6084 ] Fastopen API should be used to perform fastopen operations on the TCP socket. It does not make sense to use fastopen API to perform disconnect by calling it with AF_UNSPEC. The fastopen data path is also prone to race conditions and bugs when using with AF_UNSPEC. One issue reported and analyzed by Vegard Nossum is as follows: +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Thread A: Thread B: ------------------------------------------------------------------------ sendto() - tcp_sendmsg() - sk_stream_memory_free() = 0 - goto wait_for_sndbuf - sk_stream_wait_memory() - sk_wait_event() // sleep | sendto(flags=MSG_FASTOPEN, dest_addr=AF_UNSPEC) | - tcp_sendmsg() | - tcp_sendmsg_fastopen() | - __inet_stream_connect() | - tcp_disconnect() //because of AF_UNSPEC | - tcp_transmit_skb()// send RST | - return 0; // no reconnect! | - sk_stream_wait_connect() | - sock_error() | - xchg(&sk->sk_err, 0) | - return -ECONNRESET - ... // wake up, see sk->sk_err == 0 - skb_entail() on TCP_CLOSE socket If the connection is reopened then we will send a brand new SYN packet after thread A has already queued a buffer. At this point I think the socket internal state (sequence numbers etc.) becomes messed up. When the new connection is closed, the FIN-ACK is rejected because the sequence number is outside the window. The other side tries to retransmit, but __tcp_retransmit_skb() calls tcp_trim_head() on an empty skb which corrupts the skb data length and hits a BUG() in copy_and_csum_bits(). +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Hence, this patch adds a check for AF_UNSPEC in the fastopen data path and return EOPNOTSUPP to user if such case happens. Fixes: cf60af03ca4e7 ("tcp: Fast Open client - sendmsg(MSG_FASTOPEN)") Reported-by: Vegard Nossum Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index eb142ca71fc5..86fbf0f3235e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1078,9 +1078,12 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, size_t size) { struct tcp_sock *tp = tcp_sk(sk); + struct sockaddr *uaddr = msg->msg_name; int err, flags; - if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE)) + if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) || + (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) && + uaddr->sa_family == AF_UNSPEC)) return -EOPNOTSUPP; if (tp->fastopen_req) return -EALREADY; /* Another Fast Open is in progress */ @@ -1093,7 +1096,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, tp->fastopen_req->size = size; flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0; - err = __inet_stream_connect(sk->sk_socket, msg->msg_name, + err = __inet_stream_connect(sk->sk_socket, uaddr, msg->msg_namelen, flags); *copied = tp->fastopen_req->copied; tcp_free_fastopen_req(tp); From 1de51502a02598a8deaced6ab7575089108ec823 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Thu, 25 May 2017 19:14:56 +0200 Subject: [PATCH 044/235] sctp: fix ICMP processing if skb is non-linear [ Upstream commit 804ec7ebe8ea003999ca8d1bfc499edc6a9e07df ] sometimes ICMP replies to INIT chunks are ignored by the client, even if the encapsulated SCTP headers match an open socket. This happens when the ICMP packet is carried by a paged skb: use skb_header_pointer() to read packet contents beyond the SCTP header, so that chunk header and initiate tag are validated correctly. v2: - don't use skb_header_pointer() to read the transport header, since icmp_socket_deliver() already puts these 8 bytes in the linear area. - change commit message to make specific reference to INIT chunks. Signed-off-by: Davide Caratti Acked-by: Marcelo Ricardo Leitner Acked-by: Vlad Yasevich Reviewed-by: Xin Long Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sctp/input.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/net/sctp/input.c b/net/sctp/input.c index a01a56ec8b8c..6c79915c7dbc 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -473,15 +473,14 @@ struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *skb, struct sctp_association **app, struct sctp_transport **tpp) { + struct sctp_init_chunk *chunkhdr, _chunkhdr; union sctp_addr saddr; union sctp_addr daddr; struct sctp_af *af; struct sock *sk = NULL; struct sctp_association *asoc; struct sctp_transport *transport = NULL; - struct sctp_init_chunk *chunkhdr; __u32 vtag = ntohl(sctphdr->vtag); - int len = skb->len - ((void *)sctphdr - (void *)skb->data); *app = NULL; *tpp = NULL; @@ -516,13 +515,16 @@ struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *skb, * discard the packet. */ if (vtag == 0) { - chunkhdr = (void *)sctphdr + sizeof(struct sctphdr); - if (len < sizeof(struct sctphdr) + sizeof(sctp_chunkhdr_t) - + sizeof(__be32) || + /* chunk header + first 4 octects of init header */ + chunkhdr = skb_header_pointer(skb, skb_transport_offset(skb) + + sizeof(struct sctphdr), + sizeof(struct sctp_chunkhdr) + + sizeof(__be32), &_chunkhdr); + if (!chunkhdr || chunkhdr->chunk_hdr.type != SCTP_CID_INIT || - ntohl(chunkhdr->init_hdr.init_tag) != asoc->c.my_vtag) { + ntohl(chunkhdr->init_hdr.init_tag) != asoc->c.my_vtag) goto out; - } + } else if (vtag != asoc->c.peer_vtag) { goto out; } From 988b9792b8569871ccaabc6ec0bc4d70f982be96 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 25 May 2017 14:27:35 -0700 Subject: [PATCH 045/235] ipv4: add reference counting to metrics [ Upstream commit 3fb07daff8e99243366a081e5129560734de4ada ] Andrey Konovalov reported crashes in ipv4_mtu() I could reproduce the issue with KASAN kernels, between 10.246.7.151 and 10.246.7.152 : 1) 20 concurrent netperf -t TCP_RR -H 10.246.7.152 -l 1000 & 2) At the same time run following loop : while : do ip ro add 10.246.7.152 dev eth0 src 10.246.7.151 mtu 1500 ip ro del 10.246.7.152 dev eth0 src 10.246.7.151 mtu 1500 done Cong Wang attempted to add back rt->fi in commit 82486aa6f1b9 ("ipv4: restore rt->fi for reference counting") but this proved to add some issues that were complex to solve. Instead, I suggested to add a refcount to the metrics themselves, being a standalone object (in particular, no reference to other objects) I tried to make this patch as small as possible to ease its backport, instead of being super clean. Note that we believe that only ipv4 dst need to take care of the metric refcount. But if this is wrong, this patch adds the basic infrastructure to extend this to other families. Many thanks to Julian Anastasov for reviewing this patch, and Cong Wang for his efforts on this problem. Fixes: 2860583fe840 ("ipv4: Kill rt->fi") Signed-off-by: Eric Dumazet Reported-by: Andrey Konovalov Reviewed-by: Julian Anastasov Acked-by: Cong Wang Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/dst.h | 8 +++++++- include/net/ip_fib.h | 10 +++++----- net/core/dst.c | 23 ++++++++++++++--------- net/ipv4/fib_semantics.c | 17 ++++++++++------- net/ipv4/route.c | 10 +++++++++- 5 files changed, 45 insertions(+), 23 deletions(-) diff --git a/include/net/dst.h b/include/net/dst.h index 6835d224d47b..ddcff17615da 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -107,10 +107,16 @@ struct dst_entry { }; }; +struct dst_metrics { + u32 metrics[RTAX_MAX]; + atomic_t refcnt; +}; +extern const struct dst_metrics dst_default_metrics; + u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old); -extern const u32 dst_default_metrics[]; #define DST_METRICS_READ_ONLY 0x1UL +#define DST_METRICS_REFCOUNTED 0x2UL #define DST_METRICS_FLAGS 0x3UL #define __DST_METRICS_PTR(Y) \ ((u32 *)((Y) & ~DST_METRICS_FLAGS)) diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index f390c3bb05c5..aa758280d8a8 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -114,11 +114,11 @@ struct fib_info { __be32 fib_prefsrc; u32 fib_tb_id; u32 fib_priority; - u32 *fib_metrics; -#define fib_mtu fib_metrics[RTAX_MTU-1] -#define fib_window fib_metrics[RTAX_WINDOW-1] -#define fib_rtt fib_metrics[RTAX_RTT-1] -#define fib_advmss fib_metrics[RTAX_ADVMSS-1] + struct dst_metrics *fib_metrics; +#define fib_mtu fib_metrics->metrics[RTAX_MTU-1] +#define fib_window fib_metrics->metrics[RTAX_WINDOW-1] +#define fib_rtt fib_metrics->metrics[RTAX_RTT-1] +#define fib_advmss fib_metrics->metrics[RTAX_ADVMSS-1] int fib_nhs; #ifdef CONFIG_IP_ROUTE_MULTIPATH int fib_weight; diff --git a/net/core/dst.c b/net/core/dst.c index b5cbbe07f786..656b70d39690 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -151,13 +151,13 @@ int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(dst_discard_out); -const u32 dst_default_metrics[RTAX_MAX + 1] = { +const struct dst_metrics dst_default_metrics = { /* This initializer is needed to force linker to place this variable * into const section. Otherwise it might end into bss section. * We really want to avoid false sharing on this variable, and catch * any writes on it. */ - [RTAX_MAX] = 0xdeadbeef, + .refcnt = ATOMIC_INIT(1), }; void dst_init(struct dst_entry *dst, struct dst_ops *ops, @@ -169,7 +169,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, if (dev) dev_hold(dev); dst->ops = ops; - dst_init_metrics(dst, dst_default_metrics, true); + dst_init_metrics(dst, dst_default_metrics.metrics, true); dst->expires = 0UL; dst->path = dst; dst->from = NULL; @@ -315,25 +315,30 @@ EXPORT_SYMBOL(dst_release); u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old) { - u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC); + struct dst_metrics *p = kmalloc(sizeof(*p), GFP_ATOMIC); if (p) { - u32 *old_p = __DST_METRICS_PTR(old); + struct dst_metrics *old_p = (struct dst_metrics *)__DST_METRICS_PTR(old); unsigned long prev, new; - memcpy(p, old_p, sizeof(u32) * RTAX_MAX); + atomic_set(&p->refcnt, 1); + memcpy(p->metrics, old_p->metrics, sizeof(p->metrics)); new = (unsigned long) p; prev = cmpxchg(&dst->_metrics, old, new); if (prev != old) { kfree(p); - p = __DST_METRICS_PTR(prev); + p = (struct dst_metrics *)__DST_METRICS_PTR(prev); if (prev & DST_METRICS_READ_ONLY) p = NULL; + } else if (prev & DST_METRICS_REFCOUNTED) { + if (atomic_dec_and_test(&old_p->refcnt)) + kfree(old_p); } } - return p; + BUILD_BUG_ON(offsetof(struct dst_metrics, metrics) != 0); + return (u32 *)p; } EXPORT_SYMBOL(dst_cow_metrics_generic); @@ -342,7 +347,7 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old) { unsigned long prev, new; - new = ((unsigned long) dst_default_metrics) | DST_METRICS_READ_ONLY; + new = ((unsigned long) &dst_default_metrics) | DST_METRICS_READ_ONLY; prev = cmpxchg(&dst->_metrics, old, new); if (prev == old) kfree(__DST_METRICS_PTR(old)); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 6a4068031aaa..7563831fa432 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -204,6 +204,7 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp) static void free_fib_info_rcu(struct rcu_head *head) { struct fib_info *fi = container_of(head, struct fib_info, rcu); + struct dst_metrics *m; change_nexthops(fi) { if (nexthop_nh->nh_dev) @@ -214,8 +215,9 @@ static void free_fib_info_rcu(struct rcu_head *head) rt_fibinfo_free(&nexthop_nh->nh_rth_input); } endfor_nexthops(fi); - if (fi->fib_metrics != (u32 *) dst_default_metrics) - kfree(fi->fib_metrics); + m = fi->fib_metrics; + if (m != &dst_default_metrics && atomic_dec_and_test(&m->refcnt)) + kfree(m); kfree(fi); } @@ -982,11 +984,11 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) val = 255; if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) return -EINVAL; - fi->fib_metrics[type - 1] = val; + fi->fib_metrics->metrics[type - 1] = val; } if (ecn_ca) - fi->fib_metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; + fi->fib_metrics->metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; return 0; } @@ -1044,11 +1046,12 @@ struct fib_info *fib_create_info(struct fib_config *cfg) goto failure; fib_info_cnt++; if (cfg->fc_mx) { - fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); + fi->fib_metrics = kzalloc(sizeof(*fi->fib_metrics), GFP_KERNEL); if (!fi->fib_metrics) goto failure; + atomic_set(&fi->fib_metrics->refcnt, 1); } else - fi->fib_metrics = (u32 *) dst_default_metrics; + fi->fib_metrics = (struct dst_metrics *)&dst_default_metrics; fi->fib_net = net; fi->fib_protocol = cfg->fc_protocol; @@ -1252,7 +1255,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, if (fi->fib_priority && nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority)) goto nla_put_failure; - if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0) + if (rtnetlink_put_metrics(skb, fi->fib_metrics->metrics) < 0) goto nla_put_failure; if (fi->fib_prefsrc && diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 6dbcb37753d7..6cd49fd17ac0 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1364,8 +1364,12 @@ static void rt_add_uncached_list(struct rtable *rt) static void ipv4_dst_destroy(struct dst_entry *dst) { + struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst); struct rtable *rt = (struct rtable *) dst; + if (p != &dst_default_metrics && atomic_dec_and_test(&p->refcnt)) + kfree(p); + if (!list_empty(&rt->rt_uncached)) { struct uncached_list *ul = rt->rt_uncached_list; @@ -1417,7 +1421,11 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, rt->rt_gateway = nh->nh_gw; rt->rt_uses_gateway = 1; } - dst_init_metrics(&rt->dst, fi->fib_metrics, true); + dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true); + if (fi->fib_metrics != &dst_default_metrics) { + rt->dst._metrics |= DST_METRICS_REFCOUNTED; + atomic_inc(&fi->fib_metrics->refcnt); + } #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = nh->nh_tclassid; #endif From c1133c671a0467a8cc2fc79bdc34c764fb623043 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 25 May 2017 01:05:07 +0200 Subject: [PATCH 046/235] bpf: add bpf_clone_redirect to bpf_helper_changes_pkt_data [ Upstream commit 41703a731066fde79c3e5ccf3391cf77a98aeda5 ] The bpf_clone_redirect() still needs to be listed in bpf_helper_changes_pkt_data() since we call into bpf_try_make_head_writable() from there, thus we need to invalidate prior pkt regs as well. Fixes: 36bbef52c7eb ("bpf: direct packet write and access for helpers for clsact progs") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/filter.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/filter.c b/net/core/filter.c index b391209838ef..4eb4ce0aeef4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2198,6 +2198,7 @@ bool bpf_helper_changes_skb_data(void *func) func == bpf_skb_change_proto || func == bpf_skb_change_tail || func == bpf_skb_pull_data || + func == bpf_clone_redirect || func == bpf_l3_csum_replace || func == bpf_l4_csum_replace) return true; From 45ceb845ef34984357e6c2e0170570cc6f7179a3 Mon Sep 17 00:00:00 2001 From: Orlando Arias Date: Tue, 16 May 2017 15:34:00 -0400 Subject: [PATCH 047/235] sparc: Fix -Wstringop-overflow warning [ Upstream commit deba804c90642c8ed0f15ac1083663976d578f54 ] Greetings, GCC 7 introduced the -Wstringop-overflow flag to detect buffer overflows in calls to string handling functions [1][2]. Due to the way ``empty_zero_page'' is declared in arch/sparc/include/setup.h, this causes a warning to trigger at compile time in the function mem_init(), which is subsequently converted to an error. The ensuing patch fixes this issue and aligns the declaration of empty_zero_page to that of other architectures. Thank you. Cheers, Orlando. [1] https://gcc.gnu.org/ml/gcc-patches/2016-10/msg02308.html [2] https://gcc.gnu.org/gcc-7/changes.html Signed-off-by: Orlando Arias -------------------------------------------------------------------------------- Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- arch/sparc/include/asm/pgtable_32.h | 4 ++-- arch/sparc/include/asm/setup.h | 2 +- arch/sparc/mm/init_32.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index ce6f56980aef..cf190728360b 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -91,9 +91,9 @@ extern unsigned long pfn_base; * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. */ -extern unsigned long empty_zero_page; +extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; -#define ZERO_PAGE(vaddr) (virt_to_page(&empty_zero_page)) +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) /* * In general all page table modifications should use the V8 atomic diff --git a/arch/sparc/include/asm/setup.h b/arch/sparc/include/asm/setup.h index 29d64b1758ed..be0cc1beed41 100644 --- a/arch/sparc/include/asm/setup.h +++ b/arch/sparc/include/asm/setup.h @@ -16,7 +16,7 @@ extern char reboot_command[]; */ extern unsigned char boot_cpu_id; -extern unsigned long empty_zero_page; +extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; extern int serial_console; static inline int con_is_present(void) diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index eb8287155279..3b7092d9ea8f 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -301,7 +301,7 @@ void __init mem_init(void) /* Saves us work later. */ - memset((void *)&empty_zero_page, 0, PAGE_SIZE); + memset((void *)empty_zero_page, 0, PAGE_SIZE); i = last_valid_pfn >> ((20 - PAGE_SHIFT) + 5); i += 1; From 1a658771d5e129c64b34ce22d3018665d631c173 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 17 May 2017 11:47:00 -0400 Subject: [PATCH 048/235] sparc/ftrace: Fix ftrace graph time measurement [ Upstream commit 48078d2dac0a26f84f5f3ec704f24f7c832cce14 ] The ftrace function_graph time measurements of a given function is not accurate according to those recorded by ftrace using the function filters. This change pulls the x86_64 fix from 'commit 722b3c746953 ("ftrace/graph: Trace function entry before updating index")' into the sparc specific prepare_ftrace_return which stops ftrace from counting interrupted tasks in the time measurement. Example measurements for select_task_rq_fair running "hackbench 100 process 1000": | tracing/trace_stat/function0 | function_graph Before patch | 2.802 us | 4.255 us After patch | 2.749 us | 3.094 us Signed-off-by: Liam R. Howlett Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- arch/sparc/kernel/ftrace.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/sparc/kernel/ftrace.c b/arch/sparc/kernel/ftrace.c index 6bcff698069b..cec54dc4ab81 100644 --- a/arch/sparc/kernel/ftrace.c +++ b/arch/sparc/kernel/ftrace.c @@ -130,18 +130,17 @@ unsigned long prepare_ftrace_return(unsigned long parent, if (unlikely(atomic_read(¤t->tracing_graph_pause))) return parent + 8UL; + trace.func = self_addr; + trace.depth = current->curr_ret_stack + 1; + + /* Only trace if the calling function expects to */ + if (!ftrace_graph_entry(&trace)) + return parent + 8UL; + if (ftrace_push_return_trace(parent, self_addr, &trace.depth, frame_pointer, NULL) == -EBUSY) return parent + 8UL; - trace.func = self_addr; - - /* Only trace if the calling function expects to */ - if (!ftrace_graph_entry(&trace)) { - current->curr_ret_stack--; - return parent + 8UL; - } - return return_hooker; } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ From 68a056175512222019e9959c2504f7c9ad53bbc0 Mon Sep 17 00:00:00 2001 From: Richard Narron Date: Sun, 4 Jun 2017 16:23:18 -0700 Subject: [PATCH 049/235] fs/ufs: Set UFS default maximum bytes per file commit 239e250e4acbc0104d514307029c0839e834a51a upstream. This fixes a problem with reading files larger than 2GB from a UFS-2 file system: https://bugzilla.kernel.org/show_bug.cgi?id=195721 The incorrect UFS s_maxsize limit became a problem as of commit c2a9737f45e2 ("vfs,mm: fix a dead loop in truncate_inode_pages_range()") which started using s_maxbytes to avoid a page index overflow in do_generic_file_read(). That caused files to be truncated on UFS-2 file systems because the default maximum file size is 2GB (MAX_NON_LFS) and UFS didn't update it. Here I simply increase the default to a common value used by other file systems. Signed-off-by: Richard Narron Cc: Al Viro Cc: Will B Cc: Theodore Ts'o Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/ufs/super.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/ufs/super.c b/fs/ufs/super.c index f04ab232d08d..f3469ad0fef2 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -812,9 +812,8 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) uspi->s_dirblksize = UFS_SECTOR_SIZE; super_block_offset=UFS_SBLOCK; - /* Keep 2Gig file limit. Some UFS variants need to override - this but as I don't know which I'll let those in the know loosen - the rules */ + sb->s_maxbytes = MAX_LFS_FILESIZE; + switch (sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) { case UFS_MOUNT_UFSTYPE_44BSD: UFSD("ufstype=44bsd\n"); From 1d74fc36f3eccb0a9c552d0dfcee12f87a456c98 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Wed, 24 May 2017 16:49:59 +1000 Subject: [PATCH 050/235] powerpc/spufs: Fix hash faults for kernel regions commit d75e4919cc0b6fbcbc8d6654ef66d87a9dbf1526 upstream. Commit ac29c64089b7 ("powerpc/mm: Replace _PAGE_USER with _PAGE_PRIVILEGED") swapped _PAGE_USER for _PAGE_PRIVILEGED, and introduced check_pte_access() which denied kernel access to non-_PAGE_PRIVILEGED pages. However, it didn't add _PAGE_PRIVILEGED to the hash fault handler for spufs' kernel accesses, so the DMAs required to establish SPE memory no longer work. This change adds _PAGE_PRIVILEGED to the hash fault handler for kernel accesses. Fixes: ac29c64089b7 ("powerpc/mm: Replace _PAGE_USER with _PAGE_PRIVILEGED") Signed-off-by: Jeremy Kerr Reported-by: Sombat Tragolgosol Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/platforms/cell/spu_base.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c index e84d8fbc2e21..378c37aa6914 100644 --- a/arch/powerpc/platforms/cell/spu_base.c +++ b/arch/powerpc/platforms/cell/spu_base.c @@ -197,7 +197,9 @@ static int __spu_trap_data_map(struct spu *spu, unsigned long ea, u64 dsisr) (REGION_ID(ea) != USER_REGION_ID)) { spin_unlock(&spu->register_lock); - ret = hash_page(ea, _PAGE_PRESENT | _PAGE_READ, 0x300, dsisr); + ret = hash_page(ea, + _PAGE_PRESENT | _PAGE_READ | _PAGE_PRIVILEGED, + 0x300, dsisr); spin_lock(&spu->register_lock); if (!ret) { From d3b2d9ca90c2a1e6119c3626f00e62fe23c50f2e Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 18 May 2017 12:29:55 +0100 Subject: [PATCH 051/235] drivers/tty: 8250: only call fintek_8250_probe when doing port I/O commit 4c4fc90964b1cf205a67df566cc82ea1731bcb00 upstream. Commit fa01e2ca9f53 ("serial: 8250: Integrate Fintek into 8250_base") modified the probing logic for PNP0501 devices, to remove a collision between the generic 16550A driver and the Fintek driver, which reused the same ACPI _HID. The Fintek device probe is now incorporated into the common 8250 probe path, and gets called for all discovered 16550A compatible devices, including ones that are MMIO mapped rather than IO mapped. However, the Fintek driver assumes the port base is a I/O address, and proceeds to probe some arbitrary offsets above it. This is generally a wrong thing to do, but on ARM systems (having no native port I/O), this may result in faulting accesses of completely unrelated MMIO regions in the PCI I/O space. Given that this is at serial probe time, this results in hard to diagnose crashes at boot. So let's restrict the Fintek probe to devices that we know are using port I/O in the first place. Fixes: fa01e2ca9f53 ("serial: 8250: Integrate Fintek into 8250_base") Suggested-by: Arnd Bergmann Reviewed-by: Ricardo Ribalda Signed-off-by: Ard Biesheuvel Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_port.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index 080d5a59d0a7..f24d3030b98c 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -1320,7 +1320,7 @@ out_lock: /* * Check if the device is a Fintek F81216A */ - if (port->type == PORT_16550A) + if (port->type == PORT_16550A && port->iotype == UPIO_PORT) fintek_8250_probe(up); if (up->capabilities != old_capabilities) { From 63399974effb65d755a94edd10f33cd74e5a1b77 Mon Sep 17 00:00:00 2001 From: Sebastian Reichel Date: Fri, 5 May 2017 11:06:50 +0200 Subject: [PATCH 052/235] i2c: i2c-tiny-usb: fix buffer not being DMA capable commit 5165da5923d6c7df6f2927b0113b2e4d9288661e upstream. Since v4.9 i2c-tiny-usb generates the below call trace and longer works, since it can't communicate with the USB device. The reason is, that since v4.9 the USB stack checks, that the buffer it should transfer is DMA capable. This was a requirement since v2.2 days, but it usually worked nevertheless. [ 17.504959] ------------[ cut here ]------------ [ 17.505488] WARNING: CPU: 0 PID: 93 at drivers/usb/core/hcd.c:1587 usb_hcd_map_urb_for_dma+0x37c/0x570 [ 17.506545] transfer buffer not dma capable [ 17.507022] Modules linked in: [ 17.507370] CPU: 0 PID: 93 Comm: i2cdetect Not tainted 4.11.0-rc8+ #10 [ 17.508103] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 [ 17.509039] Call Trace: [ 17.509320] ? dump_stack+0x5c/0x78 [ 17.509714] ? __warn+0xbe/0xe0 [ 17.510073] ? warn_slowpath_fmt+0x5a/0x80 [ 17.510532] ? nommu_map_sg+0xb0/0xb0 [ 17.510949] ? usb_hcd_map_urb_for_dma+0x37c/0x570 [ 17.511482] ? usb_hcd_submit_urb+0x336/0xab0 [ 17.511976] ? wait_for_completion_timeout+0x12f/0x1a0 [ 17.512549] ? wait_for_completion_timeout+0x65/0x1a0 [ 17.513125] ? usb_start_wait_urb+0x65/0x160 [ 17.513604] ? usb_control_msg+0xdc/0x130 [ 17.514061] ? usb_xfer+0xa4/0x2a0 [ 17.514445] ? __i2c_transfer+0x108/0x3c0 [ 17.514899] ? i2c_transfer+0x57/0xb0 [ 17.515310] ? i2c_smbus_xfer_emulated+0x12f/0x590 [ 17.515851] ? _raw_spin_unlock_irqrestore+0x11/0x20 [ 17.516408] ? i2c_smbus_xfer+0x125/0x330 [ 17.516876] ? i2c_smbus_xfer+0x125/0x330 [ 17.517329] ? i2cdev_ioctl_smbus+0x1c1/0x2b0 [ 17.517824] ? i2cdev_ioctl+0x75/0x1c0 [ 17.518248] ? do_vfs_ioctl+0x9f/0x600 [ 17.518671] ? vfs_write+0x144/0x190 [ 17.519078] ? SyS_ioctl+0x74/0x80 [ 17.519463] ? entry_SYSCALL_64_fastpath+0x1e/0xad [ 17.519959] ---[ end trace d047c04982f5ac50 ]--- Signed-off-by: Sebastian Reichel Reviewed-by: Greg Kroah-Hartman Acked-by: Till Harbaum Signed-off-by: Wolfram Sang Signed-off-by: Greg Kroah-Hartman --- drivers/i2c/busses/i2c-tiny-usb.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/drivers/i2c/busses/i2c-tiny-usb.c b/drivers/i2c/busses/i2c-tiny-usb.c index 0ed77eeff31e..a2e3dd715380 100644 --- a/drivers/i2c/busses/i2c-tiny-usb.c +++ b/drivers/i2c/busses/i2c-tiny-usb.c @@ -178,22 +178,39 @@ static int usb_read(struct i2c_adapter *adapter, int cmd, int value, int index, void *data, int len) { struct i2c_tiny_usb *dev = (struct i2c_tiny_usb *)adapter->algo_data; + void *dmadata = kmalloc(len, GFP_KERNEL); + int ret; + + if (!dmadata) + return -ENOMEM; /* do control transfer */ - return usb_control_msg(dev->usb_dev, usb_rcvctrlpipe(dev->usb_dev, 0), + ret = usb_control_msg(dev->usb_dev, usb_rcvctrlpipe(dev->usb_dev, 0), cmd, USB_TYPE_VENDOR | USB_RECIP_INTERFACE | - USB_DIR_IN, value, index, data, len, 2000); + USB_DIR_IN, value, index, dmadata, len, 2000); + + memcpy(data, dmadata, len); + kfree(dmadata); + return ret; } static int usb_write(struct i2c_adapter *adapter, int cmd, int value, int index, void *data, int len) { struct i2c_tiny_usb *dev = (struct i2c_tiny_usb *)adapter->algo_data; + void *dmadata = kmemdup(data, len, GFP_KERNEL); + int ret; + + if (!dmadata) + return -ENOMEM; /* do control transfer */ - return usb_control_msg(dev->usb_dev, usb_sndctrlpipe(dev->usb_dev, 0), + ret = usb_control_msg(dev->usb_dev, usb_sndctrlpipe(dev->usb_dev, 0), cmd, USB_TYPE_VENDOR | USB_RECIP_INTERFACE, - value, index, data, len, 2000); + value, index, dmadata, len, 2000); + + kfree(dmadata); + return ret; } static void i2c_tiny_usb_free(struct i2c_tiny_usb *dev) From 4472887cbd1373d7781bea9d8935f2d4968dd580 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 10 May 2017 03:48:23 +0800 Subject: [PATCH 053/235] crypto: skcipher - Add missing API setkey checks commit 9933e113c2e87a9f46a40fde8dafbf801dca1ab9 upstream. The API setkey checks for key sizes and alignment went AWOL during the skcipher conversion. This patch restores them. Fixes: 4e6c3df4d729 ("crypto: skcipher - Add low-level skcipher...") Reported-by: Baozeng Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- crypto/skcipher.c | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/crypto/skcipher.c b/crypto/skcipher.c index f7d0018dcaee..93110d70c1d3 100644 --- a/crypto/skcipher.c +++ b/crypto/skcipher.c @@ -221,6 +221,44 @@ static int crypto_init_skcipher_ops_ablkcipher(struct crypto_tfm *tfm) return 0; } +static int skcipher_setkey_unaligned(struct crypto_skcipher *tfm, + const u8 *key, unsigned int keylen) +{ + unsigned long alignmask = crypto_skcipher_alignmask(tfm); + struct skcipher_alg *cipher = crypto_skcipher_alg(tfm); + u8 *buffer, *alignbuffer; + unsigned long absize; + int ret; + + absize = keylen + alignmask; + buffer = kmalloc(absize, GFP_ATOMIC); + if (!buffer) + return -ENOMEM; + + alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1); + memcpy(alignbuffer, key, keylen); + ret = cipher->setkey(tfm, alignbuffer, keylen); + kzfree(buffer); + return ret; +} + +static int skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int keylen) +{ + struct skcipher_alg *cipher = crypto_skcipher_alg(tfm); + unsigned long alignmask = crypto_skcipher_alignmask(tfm); + + if (keylen < cipher->min_keysize || keylen > cipher->max_keysize) { + crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + + if ((unsigned long)key & alignmask) + return skcipher_setkey_unaligned(tfm, key, keylen); + + return cipher->setkey(tfm, key, keylen); +} + static void crypto_skcipher_exit_tfm(struct crypto_tfm *tfm) { struct crypto_skcipher *skcipher = __crypto_skcipher_cast(tfm); @@ -241,7 +279,7 @@ static int crypto_skcipher_init_tfm(struct crypto_tfm *tfm) tfm->__crt_alg->cra_type == &crypto_givcipher_type) return crypto_init_skcipher_ops_ablkcipher(tfm); - skcipher->setkey = alg->setkey; + skcipher->setkey = skcipher_setkey; skcipher->encrypt = alg->encrypt; skcipher->decrypt = alg->decrypt; skcipher->ivsize = alg->ivsize; From 68c83a379106115c4c2f33c6cba393d5f0a87a52 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 19 May 2017 11:39:09 +0200 Subject: [PATCH 054/235] x86/MCE: Export memory_error() commit 2d1f406139ec20320bf38bcd2461aa8e358084b5 upstream. Export the function which checks whether an MCE is a memory error to other users so that we can reuse the logic. Drop the boot_cpu_data use, while at it, as mce.cpuvendor already has the CPU vendor in there. Integrate a piece from a patch from Vishal Verma to export it for modules (nfit). The main reason we're exporting it is that the nfit handler nfit_handle_mce() needs to detect a memory error properly before doing its recovery actions. Signed-off-by: Borislav Petkov Cc: Tony Luck Cc: Vishal Verma Link: http://lkml.kernel.org/r/20170519093915.15413-2-bp@alien8.de Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/mce.h | 1 + arch/x86/kernel/cpu/mcheck/mce.c | 11 +++++------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 9bd7ff5ffbcc..70c9cc3f098c 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -257,6 +257,7 @@ static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } #endif int mce_available(struct cpuinfo_x86 *c); +bool mce_is_memory_error(struct mce *m); DECLARE_PER_CPU(unsigned, mce_exception_count); DECLARE_PER_CPU(unsigned, mce_poll_count); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 22cda29d654e..8ca5f8ad008e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -598,16 +598,14 @@ static void mce_read_aux(struct mce *m, int i) } } -static bool memory_error(struct mce *m) +bool mce_is_memory_error(struct mce *m) { - struct cpuinfo_x86 *c = &boot_cpu_data; - - if (c->x86_vendor == X86_VENDOR_AMD) { + if (m->cpuvendor == X86_VENDOR_AMD) { /* ErrCodeExt[20:16] */ u8 xec = (m->status >> 16) & 0x1f; return (xec == 0x0 || xec == 0x8); - } else if (c->x86_vendor == X86_VENDOR_INTEL) { + } else if (m->cpuvendor == X86_VENDOR_INTEL) { /* * Intel SDM Volume 3B - 15.9.2 Compound Error Codes * @@ -628,6 +626,7 @@ static bool memory_error(struct mce *m) return false; } +EXPORT_SYMBOL_GPL(mce_is_memory_error); DEFINE_PER_CPU(unsigned, mce_poll_count); @@ -691,7 +690,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) severity = mce_severity(&m, mca_cfg.tolerant, NULL, false); - if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) + if (severity == MCE_DEFERRED_SEVERITY && mce_is_memory_error(&m)) if (m.status & MCI_STATUS_ADDRV) m.severity = severity; From 32d8077f1e9bd1c23c478648163d03bf475c55f4 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Fri, 19 May 2017 11:39:10 +0200 Subject: [PATCH 055/235] acpi, nfit: Fix the memory error check in nfit_handle_mce() commit fc08a4703a418a398bbb575ac311d36d110ac786 upstream. The check for an MCE being a memory error in the NFIT mce handler was bogus. Use the new mce_is_memory_error() helper to detect the error properly. Reported-by: Tony Luck Signed-off-by: Vishal Verma Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/20170519093915.15413-3-bp@alien8.de Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/nfit/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/nfit/mce.c b/drivers/acpi/nfit/mce.c index e5ce81c38eed..e25787afb212 100644 --- a/drivers/acpi/nfit/mce.c +++ b/drivers/acpi/nfit/mce.c @@ -26,7 +26,7 @@ static int nfit_handle_mce(struct notifier_block *nb, unsigned long val, struct nfit_spa *nfit_spa; /* We only care about memory errors */ - if (!(mce->status & MCACOD)) + if (!mce_is_memory_error(mce)) return NOTIFY_DONE; /* From 8735cf2291cdfaddf517113bf92edb6eaf20a371 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Wed, 10 May 2017 18:12:40 +0200 Subject: [PATCH 056/235] Revert "ACPI / button: Change default behavior to lid_init_state=open" commit 878d8db039daac0938238e9a40a5bd6e50ee3c9b upstream. Revert commit 77e9a4aa9de1 (ACPI / button: Change default behavior to lid_init_state=open) which changed the kernel's behavior on laptops that boot with closed lids and expect the lid switch state to be reported accurately by the kernel. If you boot or resume your laptop with the lid closed on a docking station while using an external monitor connected to it, both internal and external displays will light on, while only the external should. There is a design choice in gdm to only provide the greeter on the internal display when lit on, so users only see a gray area on the external monitor. Also, the cursor will not show up as it's by default on the internal display too. To "fix" that, users have to open the laptop once and close it once again to sync the state of the switch with the hardware state. Even if the "method" operation mode implementation can be buggy on some platforms, the "open" choice is worse. It breaks docking stations basically and there is no way to have a user-space hwdb to fix that. On the contrary, it's rather easy in user-space to have a hwdb with the problematic platforms. Then, libinput (1.7.0+) can fix the state of the lid switch for us: you need to set the udev property LIBINPUT_ATTR_LID_SWITCH_RELIABILITY to 'write_open'. When libinput detects internal keyboard events, it will overwrite the state of the switch to open, making it reliable again. Given that logind only checks the lid switch value after a timeout, we can assume the user will use the internal keyboard before this timeout expires. For example, such a hwdb entry is: libinput:name:*Lid Switch*:dmi:*svnMicrosoftCorporation:pnSurface3:* LIBINPUT_ATTR_LID_SWITCH_RELIABILITY=write_open Link: https://bugzilla.gnome.org/show_bug.cgi?id=782380 Signed-off-by: Benjamin Tissoires Signed-off-by: Rafael J. Wysocki Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/button.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/button.c b/drivers/acpi/button.c index 6d5a8c1d3132..e19f530f1083 100644 --- a/drivers/acpi/button.c +++ b/drivers/acpi/button.c @@ -113,7 +113,7 @@ struct acpi_button { static BLOCKING_NOTIFIER_HEAD(acpi_lid_notifier); static struct acpi_device *lid_device; -static u8 lid_init_state = ACPI_BUTTON_LID_INIT_OPEN; +static u8 lid_init_state = ACPI_BUTTON_LID_INIT_METHOD; static unsigned long lid_report_interval __read_mostly = 500; module_param(lid_report_interval, ulong, 0644); From ecbf0f48d65ecd1eecfe6c81af8b9c2589ab5641 Mon Sep 17 00:00:00 2001 From: Srinath Mannam Date: Thu, 18 May 2017 22:27:40 +0530 Subject: [PATCH 057/235] mmc: sdhci-iproc: suppress spurious interrupt with Multiblock read commit f5f968f2371ccdebb8a365487649673c9af68d09 upstream. The stingray SDHCI hardware supports ACMD12 and automatically issues after multi block transfer completed. If ACMD12 in SDHCI is disabled, spurious tx done interrupts are seen on multi block read command with below error message: Got data interrupt 0x00000002 even though no data operation was in progress. This patch uses SDHCI_QUIRK_MULTIBLOCK_READ_ACMD12 to enable ACM12 support in SDHCI hardware and suppress spurious interrupt. Signed-off-by: Srinath Mannam Reviewed-by: Ray Jui Reviewed-by: Scott Branden Acked-by: Adrian Hunter Fixes: b580c52d58d9 ("mmc: sdhci-iproc: add IPROC SDHCI driver") Signed-off-by: Ulf Hansson Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/host/sdhci-iproc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/sdhci-iproc.c b/drivers/mmc/host/sdhci-iproc.c index 726246665850..50dd6bd02951 100644 --- a/drivers/mmc/host/sdhci-iproc.c +++ b/drivers/mmc/host/sdhci-iproc.c @@ -157,7 +157,8 @@ static const struct sdhci_ops sdhci_iproc_ops = { }; static const struct sdhci_pltfm_data sdhci_iproc_pltfm_data = { - .quirks = SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK, + .quirks = SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK | + SDHCI_QUIRK_MULTIBLOCK_READ_ACMD12, .quirks2 = SDHCI_QUIRK2_ACMD23_BROKEN, .ops = &sdhci_iproc_ops, }; From 49d33fd10070902fa1cd4a0c5bfb4b41c072af5d Mon Sep 17 00:00:00 2001 From: Jiang Yi Date: Tue, 16 May 2017 17:57:55 +0800 Subject: [PATCH 058/235] iscsi-target: Always wait for kthread_should_stop() before kthread exit commit 5e0cf5e6c43b9e19fc0284f69e5cd2b4a47523b0 upstream. There are three timing problems in the kthread usages of iscsi_target_mod: - np_thread of struct iscsi_np - rx_thread and tx_thread of struct iscsi_conn In iscsit_close_connection(), it calls send_sig(SIGINT, conn->tx_thread, 1); kthread_stop(conn->tx_thread); In conn->tx_thread, which is iscsi_target_tx_thread(), when it receive SIGINT the kthread will exit without checking the return value of kthread_should_stop(). So if iscsi_target_tx_thread() exit right between send_sig(SIGINT...) and kthread_stop(...), the kthread_stop() will try to stop an already stopped kthread. This is invalid according to the documentation of kthread_stop(). (Fix -ECONNRESET logout handling in iscsi_target_tx_thread and early iscsi_target_rx_thread failure case - nab) Signed-off-by: Jiang Yi Signed-off-by: Nicholas Bellinger Signed-off-by: Greg Kroah-Hartman --- drivers/target/iscsi/iscsi_target.c | 30 ++++++++++++++++++----- drivers/target/iscsi/iscsi_target_erl0.c | 6 ++++- drivers/target/iscsi/iscsi_target_erl0.h | 2 +- drivers/target/iscsi/iscsi_target_login.c | 4 +++ 4 files changed, 34 insertions(+), 8 deletions(-) diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c index 40e50f2d209d..01ea228358ea 100644 --- a/drivers/target/iscsi/iscsi_target.c +++ b/drivers/target/iscsi/iscsi_target.c @@ -3798,6 +3798,8 @@ int iscsi_target_tx_thread(void *arg) { int ret = 0; struct iscsi_conn *conn = arg; + bool conn_freed = false; + /* * Allow ourselves to be interrupted by SIGINT so that a * connection recovery / failure event can be triggered externally. @@ -3823,12 +3825,14 @@ get_immediate: goto transport_err; ret = iscsit_handle_response_queue(conn); - if (ret == 1) + if (ret == 1) { goto get_immediate; - else if (ret == -ECONNRESET) + } else if (ret == -ECONNRESET) { + conn_freed = true; goto out; - else if (ret < 0) + } else if (ret < 0) { goto transport_err; + } } transport_err: @@ -3838,8 +3842,13 @@ transport_err: * responsible for cleaning up the early connection failure. */ if (conn->conn_state != TARG_CONN_STATE_IN_LOGIN) - iscsit_take_action_for_connection_exit(conn); + iscsit_take_action_for_connection_exit(conn, &conn_freed); out: + if (!conn_freed) { + while (!kthread_should_stop()) { + msleep(100); + } + } return 0; } @@ -4012,6 +4021,7 @@ int iscsi_target_rx_thread(void *arg) { int rc; struct iscsi_conn *conn = arg; + bool conn_freed = false; /* * Allow ourselves to be interrupted by SIGINT so that a @@ -4024,7 +4034,7 @@ int iscsi_target_rx_thread(void *arg) */ rc = wait_for_completion_interruptible(&conn->rx_login_comp); if (rc < 0 || iscsi_target_check_conn_state(conn)) - return 0; + goto out; if (!conn->conn_transport->iscsit_get_rx_pdu) return 0; @@ -4033,7 +4043,15 @@ int iscsi_target_rx_thread(void *arg) if (!signal_pending(current)) atomic_set(&conn->transport_failed, 1); - iscsit_take_action_for_connection_exit(conn); + iscsit_take_action_for_connection_exit(conn, &conn_freed); + +out: + if (!conn_freed) { + while (!kthread_should_stop()) { + msleep(100); + } + } + return 0; } diff --git a/drivers/target/iscsi/iscsi_target_erl0.c b/drivers/target/iscsi/iscsi_target_erl0.c index b54e72c7ab0f..efc453ef6831 100644 --- a/drivers/target/iscsi/iscsi_target_erl0.c +++ b/drivers/target/iscsi/iscsi_target_erl0.c @@ -930,8 +930,10 @@ static void iscsit_handle_connection_cleanup(struct iscsi_conn *conn) } } -void iscsit_take_action_for_connection_exit(struct iscsi_conn *conn) +void iscsit_take_action_for_connection_exit(struct iscsi_conn *conn, bool *conn_freed) { + *conn_freed = false; + spin_lock_bh(&conn->state_lock); if (atomic_read(&conn->connection_exit)) { spin_unlock_bh(&conn->state_lock); @@ -942,6 +944,7 @@ void iscsit_take_action_for_connection_exit(struct iscsi_conn *conn) if (conn->conn_state == TARG_CONN_STATE_IN_LOGOUT) { spin_unlock_bh(&conn->state_lock); iscsit_close_connection(conn); + *conn_freed = true; return; } @@ -955,4 +958,5 @@ void iscsit_take_action_for_connection_exit(struct iscsi_conn *conn) spin_unlock_bh(&conn->state_lock); iscsit_handle_connection_cleanup(conn); + *conn_freed = true; } diff --git a/drivers/target/iscsi/iscsi_target_erl0.h b/drivers/target/iscsi/iscsi_target_erl0.h index a9e2f9497fb2..fbc1d84a63c3 100644 --- a/drivers/target/iscsi/iscsi_target_erl0.h +++ b/drivers/target/iscsi/iscsi_target_erl0.h @@ -9,6 +9,6 @@ extern int iscsit_stop_time2retain_timer(struct iscsi_session *); extern void iscsit_connection_reinstatement_rcfr(struct iscsi_conn *); extern void iscsit_cause_connection_reinstatement(struct iscsi_conn *, int); extern void iscsit_fall_back_to_erl0(struct iscsi_session *); -extern void iscsit_take_action_for_connection_exit(struct iscsi_conn *); +extern void iscsit_take_action_for_connection_exit(struct iscsi_conn *, bool *); #endif /*** ISCSI_TARGET_ERL0_H ***/ diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c index 96c55bc10ac9..6128e8e80170 100644 --- a/drivers/target/iscsi/iscsi_target_login.c +++ b/drivers/target/iscsi/iscsi_target_login.c @@ -1460,5 +1460,9 @@ int iscsi_target_login_thread(void *arg) break; } + while (!kthread_should_stop()) { + msleep(100); + } + return 0; } From 80569d0e09add209e80c1354af8433daac506f35 Mon Sep 17 00:00:00 2001 From: "Bryant G. Ly" Date: Tue, 9 May 2017 11:50:26 -0500 Subject: [PATCH 059/235] ibmvscsis: Clear left-over abort_cmd pointers commit 98883f1b5415ea9dce60d5178877d15f4faa10b8 upstream. With the addition of ibmvscsis->abort_cmd pointer within commit 25e78531268e ("ibmvscsis: Do not send aborted task response"), make sure to explicitly NULL these pointers when clearing DELAY_SEND flag. Do this for two cases, when getting the new new ibmvscsis descriptor in ibmvscsis_get_free_cmd() and before posting the response completion in ibmvscsis_send_messages(). Signed-off-by: Bryant G. Ly Reviewed-by: Michael Cyr Signed-off-by: Nicholas Bellinger Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c b/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c index 904422f5b62f..ad36e51e239c 100644 --- a/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c +++ b/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c @@ -1169,6 +1169,8 @@ static struct ibmvscsis_cmd *ibmvscsis_get_free_cmd(struct scsi_info *vscsi) cmd = list_first_entry_or_null(&vscsi->free_cmd, struct ibmvscsis_cmd, list); if (cmd) { + if (cmd->abort_cmd) + cmd->abort_cmd = NULL; cmd->flags &= ~(DELAY_SEND); list_del(&cmd->list); cmd->iue = iue; @@ -1773,6 +1775,7 @@ static void ibmvscsis_send_messages(struct scsi_info *vscsi) if (cmd->abort_cmd) { retry = true; cmd->abort_cmd->flags &= ~(DELAY_SEND); + cmd->abort_cmd = NULL; } /* From 69b1d90e6a0f848e84d49b5771acbfe97ec30150 Mon Sep 17 00:00:00 2001 From: "Bryant G. Ly" Date: Wed, 10 May 2017 14:35:47 -0500 Subject: [PATCH 060/235] ibmvscsis: Fix the incorrect req_lim_delta commit 75dbf2d36f6b122ad3c1070fe4bf95f71bbff321 upstream. The current code is not correctly calculating the req_lim_delta. We want to make sure vscsi->credit is always incremented when we do not send a response for the scsi op. Thus for the case where there is a successfully aborted task we need to make sure the vscsi->credit is incremented. v2 - Moves the original location of the vscsi->credit increment to a better spot. Since if we increment credit, the next command we send back will have increased req_lim_delta. But we probably shouldn't be doing that until the aborted cmd is actually released. Otherwise the client will think that it can send a new command, and we could find ourselves short of command elements. Not likely, but could happen. This patch depends on both: commit 25e78531268e ("ibmvscsis: Do not send aborted task response") commit 98883f1b5415 ("ibmvscsis: Clear left-over abort_cmd pointers") Signed-off-by: Bryant G. Ly Reviewed-by: Michael Cyr Signed-off-by: Nicholas Bellinger Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c b/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c index ad36e51e239c..04148438d7ec 100644 --- a/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c +++ b/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c @@ -1790,6 +1790,25 @@ static void ibmvscsis_send_messages(struct scsi_info *vscsi) list_del(&cmd->list); ibmvscsis_free_cmd_resources(vscsi, cmd); + /* + * With a successfully aborted op + * through LIO we want to increment the + * the vscsi credit so that when we dont + * send a rsp to the original scsi abort + * op (h_send_crq), but the tm rsp to + * the abort is sent, the credit is + * correctly sent with the abort tm rsp. + * We would need 1 for the abort tm rsp + * and 1 credit for the aborted scsi op. + * Thus we need to increment here. + * Also we want to increment the credit + * here because we want to make sure + * cmd is actually released first + * otherwise the client will think it + * it can send a new cmd, and we could + * find ourselves short of cmd elements. + */ + vscsi->credit += 1; } else { iue = cmd->iue; @@ -2964,10 +2983,7 @@ static long srp_build_response(struct scsi_info *vscsi, rsp->opcode = SRP_RSP; - if (vscsi->credit > 0 && vscsi->state == SRP_PROCESSING) - rsp->req_lim_delta = cpu_to_be32(vscsi->credit); - else - rsp->req_lim_delta = cpu_to_be32(1 + vscsi->credit); + rsp->req_lim_delta = cpu_to_be32(1 + vscsi->credit); rsp->tag = cmd->rsp.tag; rsp->flags = 0; From 34808d76dd779ba78aaed8a87c1363106d378aeb Mon Sep 17 00:00:00 2001 From: Jason Gerecke Date: Tue, 25 Apr 2017 11:29:56 -0700 Subject: [PATCH 061/235] HID: wacom: Have wacom_tpc_irq guard against possible NULL dereference commit 2ac97f0f6654da14312d125005c77a6010e0ea38 upstream. The following Smatch complaint was generated in response to commit 2a6cdbd ("HID: wacom: Introduce new 'touch_input' device"): drivers/hid/wacom_wac.c:1586 wacom_tpc_irq() error: we previously assumed 'wacom->touch_input' could be null (see line 1577) The 'touch_input' and 'pen_input' variables point to the 'struct input_dev' used for relaying touch and pen events to userspace, respectively. If a device does not have a touch interface or pen interface, the associated input variable is NULL. The 'wacom_tpc_irq()' function is responsible for forwarding input reports to a more-specific IRQ handler function. An unknown report could theoretically be mistaken as e.g. a touch report on a device which does not have a touch interface. This can be prevented by only calling the pen/touch functions are called when the pen/touch pointers are valid. Fixes: 2a6cdbd ("HID: wacom: Introduce new 'touch_input' device") Signed-off-by: Jason Gerecke Reviewed-by: Ping Cheng Signed-off-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- drivers/hid/wacom_wac.c | 47 +++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c index 0e07a769df7c..c6a922ee5d3b 100644 --- a/drivers/hid/wacom_wac.c +++ b/drivers/hid/wacom_wac.c @@ -1400,37 +1400,38 @@ static int wacom_tpc_irq(struct wacom_wac *wacom, size_t len) { unsigned char *data = wacom->data; - if (wacom->pen_input) + if (wacom->pen_input) { dev_dbg(wacom->pen_input->dev.parent, "%s: received report #%d\n", __func__, data[0]); - else if (wacom->touch_input) + + if (len == WACOM_PKGLEN_PENABLED || + data[0] == WACOM_REPORT_PENABLED) + return wacom_tpc_pen(wacom); + } + else if (wacom->touch_input) { dev_dbg(wacom->touch_input->dev.parent, "%s: received report #%d\n", __func__, data[0]); - switch (len) { - case WACOM_PKGLEN_TPC1FG: - return wacom_tpc_single_touch(wacom, len); - - case WACOM_PKGLEN_TPC2FG: - return wacom_tpc_mt_touch(wacom); - - case WACOM_PKGLEN_PENABLED: - return wacom_tpc_pen(wacom); - - default: - switch (data[0]) { - case WACOM_REPORT_TPC1FG: - case WACOM_REPORT_TPCHID: - case WACOM_REPORT_TPCST: - case WACOM_REPORT_TPC1FGE: + switch (len) { + case WACOM_PKGLEN_TPC1FG: return wacom_tpc_single_touch(wacom, len); - case WACOM_REPORT_TPCMT: - case WACOM_REPORT_TPCMT2: - return wacom_mt_touch(wacom); + case WACOM_PKGLEN_TPC2FG: + return wacom_tpc_mt_touch(wacom); - case WACOM_REPORT_PENABLED: - return wacom_tpc_pen(wacom); + default: + switch (data[0]) { + case WACOM_REPORT_TPC1FG: + case WACOM_REPORT_TPCHID: + case WACOM_REPORT_TPCST: + case WACOM_REPORT_TPC1FGE: + return wacom_tpc_single_touch(wacom, len); + + case WACOM_REPORT_TPCMT: + case WACOM_REPORT_TPCMT2: + return wacom_mt_touch(wacom); + + } } } From ae057808924227e244492444b8f6e6ea6fc9544d Mon Sep 17 00:00:00 2001 From: Marta Rybczynska Date: Mon, 10 Apr 2017 17:12:34 +0200 Subject: [PATCH 062/235] nvme-rdma: support devices with queue size < 32 commit 0544f5494a03b8846db74e02be5685d1f32b06c9 upstream. In the case of small NVMe-oF queue size (<32) we may enter a deadlock caused by the fact that the IB completions aren't sent waiting for 32 and the send queue will fill up. The error is seen as (using mlx5): [ 2048.693355] mlx5_0:mlx5_ib_post_send:3765:(pid 7273): [ 2048.693360] nvme nvme1: nvme_rdma_post_send failed with error code -12 This patch changes the way the signaling is done so that it depends on the queue depth now. The magic define has been removed completely. Signed-off-by: Marta Rybczynska Signed-off-by: Samuel Jones Acked-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Greg Kroah-Hartman --- drivers/nvme/host/rdma.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 3d25add36d91..3222f3e987eb 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1011,6 +1011,19 @@ static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc) nvme_rdma_wr_error(cq, wc, "SEND"); } +static inline int nvme_rdma_queue_sig_limit(struct nvme_rdma_queue *queue) +{ + int sig_limit; + + /* + * We signal completion every queue depth/2 and also handle the + * degenerated case of a device with queue_depth=1, where we + * would need to signal every message. + */ + sig_limit = max(queue->queue_size / 2, 1); + return (++queue->sig_count % sig_limit) == 0; +} + static int nvme_rdma_post_send(struct nvme_rdma_queue *queue, struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge, struct ib_send_wr *first, bool flush) @@ -1038,9 +1051,6 @@ static int nvme_rdma_post_send(struct nvme_rdma_queue *queue, * Would have been way to obvious to handle this in hardware or * at least the RDMA stack.. * - * This messy and racy code sniplet is copy and pasted from the iSER - * initiator, and the magic '32' comes from there as well. - * * Always signal the flushes. The magic request used for the flush * sequencer is not allocated in our driver's tagset and it's * triggered to be freed by blk_cleanup_queue(). So we need to @@ -1048,7 +1058,7 @@ static int nvme_rdma_post_send(struct nvme_rdma_queue *queue, * embeded in request's payload, is not freed when __ib_process_cq() * calls wr_cqe->done(). */ - if ((++queue->sig_count % 32) == 0 || flush) + if (nvme_rdma_queue_sig_limit(queue) || flush) wr.send_flags |= IB_SEND_SIGNALED; if (first) From 510b0ec7f60fd762971286b3246cdd9c37aa41f8 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 22 May 2017 23:05:03 +0800 Subject: [PATCH 063/235] nvme: use blk_mq_start_hw_queues() in nvme_kill_queues() commit 806f026f9b901eaf1a6baeb48b5da18d6a4f818e upstream. Inside nvme_kill_queues(), we have to start hw queues for draining requests in sw queues, .dispatch list and requeue list, so use blk_mq_start_hw_queues() instead of blk_mq_start_stopped_hw_queues() which only run queues if queues are stopped, but the queues may have been started already, for example nvme_start_queues() is called in reset work function. blk_mq_start_hw_queues() run hw queues in current context, instead of running asynchronously like before. Given nvme_kill_queues() is run from either remove context or reset worker context, both are fine to run hw queue directly. And the mutex of namespaces_mutex isn't a problem too becasue nvme_start_freeze() runs hw queue in this way already. Reported-by: Zhang Yi Reviewed-by: Keith Busch Reviewed-by: Johannes Thumshirn Signed-off-by: Ming Lei Signed-off-by: Christoph Hellwig Signed-off-by: Greg Kroah-Hartman --- drivers/nvme/host/core.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 5f2feeef8905..593a8fd72b34 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2049,7 +2049,13 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl) revalidate_disk(ns->disk); blk_set_queue_dying(ns->queue); blk_mq_abort_requeue_list(ns->queue); - blk_mq_start_stopped_hw_queues(ns->queue, true); + + /* + * Forcibly start all queues to avoid having stuck requests. + * Note that we must ensure the queues are not stopped + * when the final removal happens. + */ + blk_mq_start_hw_queues(ns->queue); } mutex_unlock(&ctrl->namespaces_mutex); } From 21f33b157721172595eb06c711cbf6a9f1a155fd Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 22 May 2017 23:05:04 +0800 Subject: [PATCH 064/235] nvme: avoid to use blk_mq_abort_requeue_list() commit 986f75c876dbafed98eba7cb516c5118f155db23 upstream. NVMe may add request into requeue list simply and not kick off the requeue if hw queues are stopped. Then blk_mq_abort_requeue_list() is called in both nvme_kill_queues() and nvme_ns_remove() for dealing with this issue. Unfortunately blk_mq_abort_requeue_list() is absolutely a race maker, for example, one request may be requeued during the aborting. So this patch just calls blk_mq_kick_requeue_list() in nvme_kill_queues() to handle this issue like what nvme_start_queues() does. Now all requests in requeue list when queues are stopped will be handled by blk_mq_kick_requeue_list() when queues are restarted, either in nvme_start_queues() or in nvme_kill_queues(). Reported-by: Zhang Yi Reviewed-by: Keith Busch Reviewed-by: Johannes Thumshirn Signed-off-by: Ming Lei Signed-off-by: Christoph Hellwig Signed-off-by: Greg Kroah-Hartman --- drivers/nvme/host/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 593a8fd72b34..fbeca065f18c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1725,7 +1725,6 @@ static void nvme_ns_remove(struct nvme_ns *ns) sysfs_remove_group(&disk_to_dev(ns->disk)->kobj, &nvme_ns_attr_group); del_gendisk(ns->disk); - blk_mq_abort_requeue_list(ns->queue); blk_cleanup_queue(ns->queue); } @@ -2048,7 +2047,6 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl) continue; revalidate_disk(ns->disk); blk_set_queue_dying(ns->queue); - blk_mq_abort_requeue_list(ns->queue); /* * Forcibly start all queues to avoid having stuck requests. @@ -2056,6 +2054,9 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl) * when the final removal happens. */ blk_mq_start_hw_queues(ns->queue); + + /* draining requests in requeue list */ + blk_mq_kick_requeue_list(ns->queue); } mutex_unlock(&ctrl->namespaces_mutex); } From 9869fb485cc64a2be3dde7952aef10809a631497 Mon Sep 17 00:00:00 2001 From: Ram Pai Date: Thu, 26 Jan 2017 16:37:01 -0200 Subject: [PATCH 065/235] scsi: mpt3sas: Force request partial completion alignment commit f2e767bb5d6ee0d988cb7d4e54b0b21175802b6b upstream. The firmware or device, possibly under a heavy I/O load, can return on a partial unaligned boundary. Scsi-ml expects these requests to be completed on an alignment boundary. Scsi-ml blindly requeues the I/O without checking the alignment boundary of the I/O request for the remaining bytes. This leads to errors, since devices cannot perform non-aligned read/write operations. This patch fixes the issue in the driver. It aligns unaligned completions of FS requests, by truncating them to the nearest alignment boundary. [mkp: simplified if statement] Reported-by: Mauricio Faria De Oliveira Signed-off-by: Guilherme G. Piccoli Signed-off-by: Ram Pai Acked-by: Sreekanth Reddy Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/mpt3sas/mpt3sas_scsih.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c index 8a7941b8189f..289374cbcb47 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c +++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c @@ -4634,6 +4634,7 @@ _scsih_io_done(struct MPT3SAS_ADAPTER *ioc, u16 smid, u8 msix_index, u32 reply) struct MPT3SAS_DEVICE *sas_device_priv_data; u32 response_code = 0; unsigned long flags; + unsigned int sector_sz; mpi_reply = mpt3sas_base_get_reply_virt_addr(ioc, reply); scmd = _scsih_scsi_lookup_get_clear(ioc, smid); @@ -4692,6 +4693,20 @@ _scsih_io_done(struct MPT3SAS_ADAPTER *ioc, u16 smid, u8 msix_index, u32 reply) } xfer_cnt = le32_to_cpu(mpi_reply->TransferCount); + + /* In case of bogus fw or device, we could end up having + * unaligned partial completion. We can force alignment here, + * then scsi-ml does not need to handle this misbehavior. + */ + sector_sz = scmd->device->sector_size; + if (unlikely(scmd->request->cmd_type == REQ_TYPE_FS && sector_sz && + xfer_cnt % sector_sz)) { + sdev_printk(KERN_INFO, scmd->device, + "unaligned partial completion avoided (xfer_cnt=%u, sector_sz=%u)\n", + xfer_cnt, sector_sz); + xfer_cnt = round_down(xfer_cnt, sector_sz); + } + scsi_set_resid(scmd, scsi_bufflen(scmd) - xfer_cnt); if (ioc_status & MPI2_IOCSTATUS_FLAG_LOG_INFO_AVAILABLE) log_info = le32_to_cpu(mpi_reply->IOCLogInfo); From c8d25fcb5980fa7305215c870045dd717b54bab4 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 11 May 2017 13:14:14 -0400 Subject: [PATCH 066/235] drm/radeon/ci: disable mclk switching for high refresh rates (v2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 58d7e3e427db1bd68f33025519a9468140280a75 upstream. Even if the vblank period would allow it, it still seems to be problematic on some cards. v2: fix logic inversion (Nils) bug: https://bugs.freedesktop.org/show_bug.cgi?id=96868 Acked-by: Christian König Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/radeon/ci_dpm.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/radeon/ci_dpm.c b/drivers/gpu/drm/radeon/ci_dpm.c index 7ba450832e6b..ea36dc4dd5d2 100644 --- a/drivers/gpu/drm/radeon/ci_dpm.c +++ b/drivers/gpu/drm/radeon/ci_dpm.c @@ -776,6 +776,12 @@ bool ci_dpm_vblank_too_short(struct radeon_device *rdev) u32 vblank_time = r600_dpm_get_vblank_time(rdev); u32 switch_limit = pi->mem_gddr5 ? 450 : 300; + /* disable mclk switching if the refresh is >120Hz, even if the + * blanking period would allow it + */ + if (r600_dpm_get_vrefresh(rdev) > 120) + return true; + if (vblank_time < switch_limit) return true; else From acc771fdaec7008a3b68db2a07e6d5901ea021fe Mon Sep 17 00:00:00 2001 From: Lyude Date: Thu, 11 May 2017 19:31:12 -0400 Subject: [PATCH 067/235] drm/radeon: Unbreak HPD handling for r600+ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 3d18e33735a02b1a90aecf14410bf3edbfd4d3dc upstream. We end up reading the interrupt register for HPD5, and then writing it to HPD6 which on systems without anything using HPD5 results in permanently disabling hotplug on one of the display outputs after the first time we acknowledge a hotplug interrupt from the GPU. This code is really bad. But for now, let's just fix this. I will hopefully have a large patch series to refactor all of this soon. Reviewed-by: Christian König Signed-off-by: Lyude Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/radeon/cik.c | 4 ++-- drivers/gpu/drm/radeon/evergreen.c | 4 ++-- drivers/gpu/drm/radeon/r600.c | 2 +- drivers/gpu/drm/radeon/si.c | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c index f6ff41a0eed6..edee6a5f4da9 100644 --- a/drivers/gpu/drm/radeon/cik.c +++ b/drivers/gpu/drm/radeon/cik.c @@ -7416,7 +7416,7 @@ static inline void cik_irq_ack(struct radeon_device *rdev) WREG32(DC_HPD5_INT_CONTROL, tmp); } if (rdev->irq.stat_regs.cik.disp_int_cont5 & DC_HPD6_INTERRUPT) { - tmp = RREG32(DC_HPD5_INT_CONTROL); + tmp = RREG32(DC_HPD6_INT_CONTROL); tmp |= DC_HPDx_INT_ACK; WREG32(DC_HPD6_INT_CONTROL, tmp); } @@ -7446,7 +7446,7 @@ static inline void cik_irq_ack(struct radeon_device *rdev) WREG32(DC_HPD5_INT_CONTROL, tmp); } if (rdev->irq.stat_regs.cik.disp_int_cont5 & DC_HPD6_RX_INTERRUPT) { - tmp = RREG32(DC_HPD5_INT_CONTROL); + tmp = RREG32(DC_HPD6_INT_CONTROL); tmp |= DC_HPDx_RX_INT_ACK; WREG32(DC_HPD6_INT_CONTROL, tmp); } diff --git a/drivers/gpu/drm/radeon/evergreen.c b/drivers/gpu/drm/radeon/evergreen.c index 0b6b5766216f..6068b8a01016 100644 --- a/drivers/gpu/drm/radeon/evergreen.c +++ b/drivers/gpu/drm/radeon/evergreen.c @@ -4933,7 +4933,7 @@ static void evergreen_irq_ack(struct radeon_device *rdev) WREG32(DC_HPD5_INT_CONTROL, tmp); } if (rdev->irq.stat_regs.evergreen.disp_int_cont5 & DC_HPD6_INTERRUPT) { - tmp = RREG32(DC_HPD5_INT_CONTROL); + tmp = RREG32(DC_HPD6_INT_CONTROL); tmp |= DC_HPDx_INT_ACK; WREG32(DC_HPD6_INT_CONTROL, tmp); } @@ -4964,7 +4964,7 @@ static void evergreen_irq_ack(struct radeon_device *rdev) WREG32(DC_HPD5_INT_CONTROL, tmp); } if (rdev->irq.stat_regs.evergreen.disp_int_cont5 & DC_HPD6_RX_INTERRUPT) { - tmp = RREG32(DC_HPD5_INT_CONTROL); + tmp = RREG32(DC_HPD6_INT_CONTROL); tmp |= DC_HPDx_RX_INT_ACK; WREG32(DC_HPD6_INT_CONTROL, tmp); } diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c index a951881c2a50..f2eac6b6c46a 100644 --- a/drivers/gpu/drm/radeon/r600.c +++ b/drivers/gpu/drm/radeon/r600.c @@ -3995,7 +3995,7 @@ static void r600_irq_ack(struct radeon_device *rdev) WREG32(DC_HPD5_INT_CONTROL, tmp); } if (rdev->irq.stat_regs.r600.disp_int_cont2 & DC_HPD6_INTERRUPT) { - tmp = RREG32(DC_HPD5_INT_CONTROL); + tmp = RREG32(DC_HPD6_INT_CONTROL); tmp |= DC_HPDx_INT_ACK; WREG32(DC_HPD6_INT_CONTROL, tmp); } diff --git a/drivers/gpu/drm/radeon/si.c b/drivers/gpu/drm/radeon/si.c index 877af4a5ef68..3333e8a45933 100644 --- a/drivers/gpu/drm/radeon/si.c +++ b/drivers/gpu/drm/radeon/si.c @@ -6330,7 +6330,7 @@ static inline void si_irq_ack(struct radeon_device *rdev) WREG32(DC_HPD5_INT_CONTROL, tmp); } if (rdev->irq.stat_regs.evergreen.disp_int_cont5 & DC_HPD6_INTERRUPT) { - tmp = RREG32(DC_HPD5_INT_CONTROL); + tmp = RREG32(DC_HPD6_INT_CONTROL); tmp |= DC_HPDx_INT_ACK; WREG32(DC_HPD6_INT_CONTROL, tmp); } @@ -6361,7 +6361,7 @@ static inline void si_irq_ack(struct radeon_device *rdev) WREG32(DC_HPD5_INT_CONTROL, tmp); } if (rdev->irq.stat_regs.evergreen.disp_int_cont5 & DC_HPD6_RX_INTERRUPT) { - tmp = RREG32(DC_HPD5_INT_CONTROL); + tmp = RREG32(DC_HPD6_INT_CONTROL); tmp |= DC_HPDx_RX_INT_ACK; WREG32(DC_HPD6_INT_CONTROL, tmp); } From ebd4c110fd0b4247277e8f351aa2d6694a0b5d94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michel=20D=C3=A4nzer?= Date: Mon, 30 Jan 2017 12:06:35 +0900 Subject: [PATCH 068/235] drm/radeon: Fix vram_size/visible values in DRM_RADEON_GEM_INFO ioctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 51964e9e12d0a054002a1a0d1dec4f661c7aaf28 upstream. vram_size is supposed to be the total amount of VRAM that can be used by userspace, which corresponds to the TTM VRAM manager size (which is normally the full amount of VRAM, but can be just the visible VRAM when DMA can't be used for BO migration for some reason). The above was incorrectly used for vram_visible before, resulting in generally too large values being reported. Reviewed-by: Christian König Reviewed-by: Nicolai Hähnle Reviewed-by: Alex Deucher Signed-off-by: Michel Dänzer Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/radeon/radeon_drv.c | 3 ++- drivers/gpu/drm/radeon/radeon_gem.c | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_drv.c b/drivers/gpu/drm/radeon/radeon_drv.c index e0c143b865f3..30bd4a6a9d46 100644 --- a/drivers/gpu/drm/radeon/radeon_drv.c +++ b/drivers/gpu/drm/radeon/radeon_drv.c @@ -97,9 +97,10 @@ * 2.46.0 - Add PFP_SYNC_ME support on evergreen * 2.47.0 - Add UVD_NO_OP register support * 2.48.0 - TA_CS_BC_BASE_ADDR allowed on SI + * 2.49.0 - DRM_RADEON_GEM_INFO ioctl returns correct vram_size/visible values */ #define KMS_DRIVER_MAJOR 2 -#define KMS_DRIVER_MINOR 48 +#define KMS_DRIVER_MINOR 49 #define KMS_DRIVER_PATCHLEVEL 0 int radeon_driver_load_kms(struct drm_device *dev, unsigned long flags); int radeon_driver_unload_kms(struct drm_device *dev); diff --git a/drivers/gpu/drm/radeon/radeon_gem.c b/drivers/gpu/drm/radeon/radeon_gem.c index deb9511725c9..316856715878 100644 --- a/drivers/gpu/drm/radeon/radeon_gem.c +++ b/drivers/gpu/drm/radeon/radeon_gem.c @@ -220,8 +220,8 @@ int radeon_gem_info_ioctl(struct drm_device *dev, void *data, man = &rdev->mman.bdev.man[TTM_PL_VRAM]; - args->vram_size = rdev->mc.real_vram_size; - args->vram_visible = (u64)man->size << PAGE_SHIFT; + args->vram_size = (u64)man->size << PAGE_SHIFT; + args->vram_visible = rdev->mc.visible_vram_size; args->vram_visible -= rdev->vram_pin_size; args->gart_size = rdev->mc.gtt_size; args->gart_size -= rdev->gart_pin_size; From da856d05645c8d4ca74a6465a98126ec69815266 Mon Sep 17 00:00:00 2001 From: Nicolas Iooss Date: Fri, 2 Jun 2017 14:46:28 -0700 Subject: [PATCH 069/235] pcmcia: remove left-over %Z format commit ff5a20169b98d84ad8d7f99f27c5ebbb008204d6 upstream. Commit 5b5e0928f742 ("lib/vsprintf.c: remove %Z support") removed some usages of format %Z but forgot "%.2Zx". This makes clang 4.0 reports a -Wformat-extra-args warning because it does not know about %Z. Replace %Z with %z. Link: http://lkml.kernel.org/r/20170520090946.22562-1-nicolas.iooss_linux@m4x.org Signed-off-by: Nicolas Iooss Cc: Harald Welte Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- drivers/char/pcmcia/cm4040_cs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/char/pcmcia/cm4040_cs.c b/drivers/char/pcmcia/cm4040_cs.c index fc061f7c2bd1..a7de8ae185a5 100644 --- a/drivers/char/pcmcia/cm4040_cs.c +++ b/drivers/char/pcmcia/cm4040_cs.c @@ -374,7 +374,7 @@ static ssize_t cm4040_write(struct file *filp, const char __user *buf, rc = write_sync_reg(SCR_HOST_TO_READER_START, dev); if (rc <= 0) { - DEBUGP(5, dev, "write_sync_reg c=%.2Zx\n", rc); + DEBUGP(5, dev, "write_sync_reg c=%.2zx\n", rc); DEBUGP(2, dev, "<- cm4040_write (failed)\n"); if (rc == -ERESTARTSYS) return rc; @@ -387,7 +387,7 @@ static ssize_t cm4040_write(struct file *filp, const char __user *buf, for (i = 0; i < bytes_to_write; i++) { rc = wait_for_bulk_out_ready(dev); if (rc <= 0) { - DEBUGP(5, dev, "wait_for_bulk_out_ready rc=%.2Zx\n", + DEBUGP(5, dev, "wait_for_bulk_out_ready rc=%.2zx\n", rc); DEBUGP(2, dev, "<- cm4040_write (failed)\n"); if (rc == -ERESTARTSYS) @@ -403,7 +403,7 @@ static ssize_t cm4040_write(struct file *filp, const char __user *buf, rc = write_sync_reg(SCR_HOST_TO_READER_DONE, dev); if (rc <= 0) { - DEBUGP(5, dev, "write_sync_reg c=%.2Zx\n", rc); + DEBUGP(5, dev, "write_sync_reg c=%.2zx\n", rc); DEBUGP(2, dev, "<- cm4040_write (failed)\n"); if (rc == -ERESTARTSYS) return rc; From 7d8ef0e0bc1e25cc23f6012aacafe3b28e8c01f7 Mon Sep 17 00:00:00 2001 From: Alexander Tsoy Date: Mon, 22 May 2017 20:58:11 +0300 Subject: [PATCH 070/235] ALSA: hda - apply STAC_9200_DELL_M22 quirk for Dell Latitude D430 commit 1fc2e41f7af4572b07190f9dec28396b418e9a36 upstream. This model is actually called 92XXM2-8 in Windows driver. But since pin configs for M22 and M28 are identical, just reuse M22 quirk. Fixes external microphone (tested) and probably docking station ports (not tested). Signed-off-by: Alexander Tsoy Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_sigmatel.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/pci/hda/patch_sigmatel.c b/sound/pci/hda/patch_sigmatel.c index 37b70f8e878f..0abab7926dca 100644 --- a/sound/pci/hda/patch_sigmatel.c +++ b/sound/pci/hda/patch_sigmatel.c @@ -1537,6 +1537,8 @@ static const struct snd_pci_quirk stac9200_fixup_tbl[] = { "Dell Inspiron 1501", STAC_9200_DELL_M26), SND_PCI_QUIRK(PCI_VENDOR_ID_DELL, 0x01f6, "unknown Dell", STAC_9200_DELL_M26), + SND_PCI_QUIRK(PCI_VENDOR_ID_DELL, 0x0201, + "Dell Latitude D430", STAC_9200_DELL_M22), /* Panasonic */ SND_PCI_QUIRK(0x10f7, 0x8338, "Panasonic CF-74", STAC_9200_PANASONIC), /* Gateway machines needs EAPD to be set on resume */ From d494cab70697413fe1a1b6ae57e4ba18a5e43e36 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Fri, 2 Jun 2017 14:46:40 -0700 Subject: [PATCH 071/235] mm/migrate: fix refcount handling when !hugepage_migration_supported() commit 30809f559a0d348c2dfd7ab05e9a451e2384962e upstream. On failing to migrate a page, soft_offline_huge_page() performs the necessary update to the hugepage ref-count. But when !hugepage_migration_supported() , unmap_and_move_hugepage() also decrements the page ref-count for the hugepage. The combined behaviour leaves the ref-count in an inconsistent state. This leads to soft lockups when running the overcommitted hugepage test from mce-tests suite. Soft offlining pfn 0x83ed600 at process virtual address 0x400000000000 soft offline: 0x83ed600: migration failed 1, type 1fffc00000008008 (uptodate|head) INFO: rcu_preempt detected stalls on CPUs/tasks: Tasks blocked on level-0 rcu_node (CPUs 0-7): P2715 (detected by 7, t=5254 jiffies, g=963, c=962, q=321) thugetlb_overco R running task 0 2715 2685 0x00000008 Call trace: dump_backtrace+0x0/0x268 show_stack+0x24/0x30 sched_show_task+0x134/0x180 rcu_print_detail_task_stall_rnp+0x54/0x7c rcu_check_callbacks+0xa74/0xb08 update_process_times+0x34/0x60 tick_sched_handle.isra.7+0x38/0x70 tick_sched_timer+0x4c/0x98 __hrtimer_run_queues+0xc0/0x300 hrtimer_interrupt+0xac/0x228 arch_timer_handler_phys+0x3c/0x50 handle_percpu_devid_irq+0x8c/0x290 generic_handle_irq+0x34/0x50 __handle_domain_irq+0x68/0xc0 gic_handle_irq+0x5c/0xb0 Address this by changing the putback_active_hugepage() in soft_offline_huge_page() to putback_movable_pages(). This only triggers on systems that enable memory failure handling (ARCH_SUPPORTS_MEMORY_FAILURE) but not hugepage migration (!ARCH_ENABLE_HUGEPAGE_MIGRATION). I imagine this wasn't triggered as there aren't many systems running this configuration. [akpm@linux-foundation.org: remove dead comment, per Naoya] Link: http://lkml.kernel.org/r/20170525135146.32011-1-punit.agrawal@arm.com Reported-by: Manoj Iyer Tested-by: Manoj Iyer Suggested-by: Naoya Horiguchi Signed-off-by: Punit Agrawal Cc: Joonsoo Kim Cc: Wanpeng Li Cc: Christoph Lameter Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/memory-failure.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 19e796d36a62..4bd44803e366 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1587,12 +1587,8 @@ static int soft_offline_huge_page(struct page *page, int flags) if (ret) { pr_info("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); - /* - * We know that soft_offline_huge_page() tries to migrate - * only one hugepage pointed to by hpage, so we need not - * run through the pagelist here. - */ - putback_active_hugepage(hpage); + if (!list_empty(&pagelist)) + putback_movable_pages(&pagelist); if (ret > 0) ret = -EIO; } else { From 1163e785b1506a4f46dbdee89bbab161dd742186 Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Fri, 2 Jun 2017 14:46:43 -0700 Subject: [PATCH 072/235] mlock: fix mlock count can not decrease in race condition commit 70feee0e1ef331b22cc51f383d532a0d043fbdcc upstream. Kefeng reported that when running the follow test, the mlock count in meminfo will increase permanently: [1] testcase linux:~ # cat test_mlockal grep Mlocked /proc/meminfo for j in `seq 0 10` do for i in `seq 4 15` do ./p_mlockall >> log & done sleep 0.2 done # wait some time to let mlock counter decrease and 5s may not enough sleep 5 grep Mlocked /proc/meminfo linux:~ # cat p_mlockall.c #include #include #include #define SPACE_LEN 4096 int main(int argc, char ** argv) { int ret; void *adr = malloc(SPACE_LEN); if (!adr) return -1; ret = mlockall(MCL_CURRENT | MCL_FUTURE); printf("mlcokall ret = %d\n", ret); ret = munlockall(); printf("munlcokall ret = %d\n", ret); free(adr); return 0; } In __munlock_pagevec() we should decrement NR_MLOCK for each page where we clear the PageMlocked flag. Commit 1ebb7cc6a583 ("mm: munlock: batch NR_MLOCK zone state updates") has introduced a bug where we don't decrement NR_MLOCK for pages where we clear the flag, but fail to isolate them from the lru list (e.g. when the pages are on some other cpu's percpu pagevec). Since PageMlocked stays cleared, the NR_MLOCK accounting gets permanently disrupted by this. Fix it by counting the number of page whose PageMlock flag is cleared. Fixes: 1ebb7cc6a583 (" mm: munlock: batch NR_MLOCK zone state updates") Link: http://lkml.kernel.org/r/1495678405-54569-1-git-send-email-xieyisheng1@huawei.com Signed-off-by: Yisheng Xie Reported-by: Kefeng Wang Tested-by: Kefeng Wang Cc: Vlastimil Babka Cc: Joern Engel Cc: Mel Gorman Cc: Michel Lespinasse Cc: Hugh Dickins Cc: Rik van Riel Cc: Johannes Weiner Cc: Michal Hocko Cc: Xishi Qiu Cc: zhongjiang Cc: Hanjun Guo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/mlock.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/mlock.c b/mm/mlock.c index 665ab75b5533..f0505692a5f4 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -285,7 +285,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) { int i; int nr = pagevec_count(pvec); - int delta_munlocked; + int delta_munlocked = -nr; struct pagevec pvec_putback; int pgrescued = 0; @@ -305,6 +305,8 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) continue; else __munlock_isolation_failed(page); + } else { + delta_munlocked++; } /* @@ -316,7 +318,6 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) pagevec_add(&pvec_putback, pvec->pages[i]); pvec->pages[i] = NULL; } - delta_munlocked = -nr + pagevec_count(&pvec_putback); __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); spin_unlock_irq(zone_lru_lock(zone)); From 292f70cd9649170243fe29331654f8c5f0c8d5d6 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 2 Jun 2017 14:46:49 -0700 Subject: [PATCH 073/235] mm: consider memblock reservations for deferred memory initialization sizing commit 864b9a393dcb5aed09b8fd31b9bbda0fdda99374 upstream. We have seen an early OOM killer invocation on ppc64 systems with crashkernel=4096M: kthreadd invoked oom-killer: gfp_mask=0x16040c0(GFP_KERNEL|__GFP_COMP|__GFP_NOTRACK), nodemask=7, order=0, oom_score_adj=0 kthreadd cpuset=/ mems_allowed=7 CPU: 0 PID: 2 Comm: kthreadd Not tainted 4.4.68-1.gd7fe927-default #1 Call Trace: dump_stack+0xb0/0xf0 (unreliable) dump_header+0xb0/0x258 out_of_memory+0x5f0/0x640 __alloc_pages_nodemask+0xa8c/0xc80 kmem_getpages+0x84/0x1a0 fallback_alloc+0x2a4/0x320 kmem_cache_alloc_node+0xc0/0x2e0 copy_process.isra.25+0x260/0x1b30 _do_fork+0x94/0x470 kernel_thread+0x48/0x60 kthreadd+0x264/0x330 ret_from_kernel_thread+0x5c/0xa4 Mem-Info: active_anon:0 inactive_anon:0 isolated_anon:0 active_file:0 inactive_file:0 isolated_file:0 unevictable:0 dirty:0 writeback:0 unstable:0 slab_reclaimable:5 slab_unreclaimable:73 mapped:0 shmem:0 pagetables:0 bounce:0 free:0 free_pcp:0 free_cma:0 Node 7 DMA free:0kB min:0kB low:0kB high:0kB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:52428800kB managed:110016kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:320kB slab_unreclaimable:4672kB kernel_stack:1152kB pagetables:0kB unstable:0kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? yes lowmem_reserve[]: 0 0 0 0 Node 7 DMA: 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB 0*8192kB 0*16384kB = 0kB 0 total pagecache pages 0 pages in swap cache Swap cache stats: add 0, delete 0, find 0/0 Free swap = 0kB Total swap = 0kB 819200 pages RAM 0 pages HighMem/MovableOnly 817481 pages reserved 0 pages cma reserved 0 pages hwpoisoned the reason is that the managed memory is too low (only 110MB) while the rest of the the 50GB is still waiting for the deferred intialization to be done. update_defer_init estimates the initial memoty to initialize to 2GB at least but it doesn't consider any memory allocated in that range. In this particular case we've had Reserving 4096MB of memory at 128MB for crashkernel (System RAM: 51200MB) so the low 2GB is mostly depleted. Fix this by considering memblock allocations in the initial static initialization estimation. Move the max_initialise to reset_deferred_meminit and implement a simple memblock_reserved_memory helper which iterates all reserved blocks and sums the size of all that start below the given address. The cumulative size is than added on top of the initial estimation. This is still not ideal because reset_deferred_meminit doesn't consider holes and so reservation might be above the initial estimation whihch we ignore but let's make the logic simpler until we really need to handle more complicated cases. Fixes: 3a80a7fa7989 ("mm: meminit: initialise a subset of struct pages if CONFIG_DEFERRED_STRUCT_PAGE_INIT is set") Link: http://lkml.kernel.org/r/20170531104010.GI27783@dhcp22.suse.cz Signed-off-by: Michal Hocko Acked-by: Mel Gorman Tested-by: Srikar Dronamraju Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/memblock.h | 8 ++++++++ include/linux/mmzone.h | 1 + mm/memblock.c | 23 +++++++++++++++++++++++ mm/page_alloc.c | 33 ++++++++++++++++++++++----------- 4 files changed, 54 insertions(+), 11 deletions(-) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 5b759c9acf97..e8fba68e5d03 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -421,12 +421,20 @@ static inline void early_memtest(phys_addr_t start, phys_addr_t end) } #endif +extern unsigned long memblock_reserved_memory_within(phys_addr_t start_addr, + phys_addr_t end_addr); #else static inline phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align) { return 0; } +static inline unsigned long memblock_reserved_memory_within(phys_addr_t start_addr, + phys_addr_t end_addr) +{ + return 0; +} + #endif /* CONFIG_HAVE_MEMBLOCK */ #endif /* __KERNEL__ */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7e273e243a13..6744eb40c4ea 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -672,6 +672,7 @@ typedef struct pglist_data { * is the first PFN that needs to be initialised. */ unsigned long first_deferred_pfn; + unsigned long static_init_size; #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/mm/memblock.c b/mm/memblock.c index 7608bc305936..68849d0ead09 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1696,6 +1696,29 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name } } +extern unsigned long __init_memblock +memblock_reserved_memory_within(phys_addr_t start_addr, phys_addr_t end_addr) +{ + struct memblock_region *rgn; + unsigned long size = 0; + int idx; + + for_each_memblock_type((&memblock.reserved), rgn) { + phys_addr_t start, end; + + if (rgn->base + rgn->size < start_addr) + continue; + if (rgn->base > end_addr) + continue; + + start = rgn->base; + end = start + rgn->size; + size += end - start; + } + + return size; +} + void __init_memblock __memblock_dump_all(void) { pr_info("MEMBLOCK configuration:\n"); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5b06fb385dd7..56df8c24689d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -286,6 +286,26 @@ int page_group_by_mobility_disabled __read_mostly; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT static inline void reset_deferred_meminit(pg_data_t *pgdat) { + unsigned long max_initialise; + unsigned long reserved_lowmem; + + /* + * Initialise at least 2G of a node but also take into account that + * two large system hashes that can take up 1GB for 0.25TB/node. + */ + max_initialise = max(2UL << (30 - PAGE_SHIFT), + (pgdat->node_spanned_pages >> 8)); + + /* + * Compensate the all the memblock reservations (e.g. crash kernel) + * from the initial estimation to make sure we will initialize enough + * memory to boot. + */ + reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn, + pgdat->node_start_pfn + max_initialise); + max_initialise += reserved_lowmem; + + pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages); pgdat->first_deferred_pfn = ULONG_MAX; } @@ -308,20 +328,11 @@ static inline bool update_defer_init(pg_data_t *pgdat, unsigned long pfn, unsigned long zone_end, unsigned long *nr_initialised) { - unsigned long max_initialise; - /* Always populate low zones for address-contrained allocations */ if (zone_end < pgdat_end_pfn(pgdat)) return true; - /* - * Initialise at least 2G of a node but also take into account that - * two large system hashes that can take up 1GB for 0.25TB/node. - */ - max_initialise = max(2UL << (30 - PAGE_SHIFT), - (pgdat->node_spanned_pages >> 8)); - (*nr_initialised)++; - if ((*nr_initialised > max_initialise) && + if ((*nr_initialised > pgdat->static_init_size) && (pfn & (PAGES_PER_SECTION - 1)) == 0) { pgdat->first_deferred_pfn = pfn; return false; @@ -5911,7 +5922,6 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, /* pg_data_t should be reset to zero when it's allocated */ WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx); - reset_deferred_meminit(pgdat); pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; pgdat->per_cpu_nodestats = NULL; @@ -5933,6 +5943,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, (unsigned long)pgdat->node_mem_map); #endif + reset_deferred_meminit(pgdat); free_area_init_core(pgdat); } From d1cff22220718d2fd2a694271a11727f30413658 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Fri, 12 May 2017 09:02:00 -0700 Subject: [PATCH 074/235] RDMA/qib,hfi1: Fix MR reference count leak on write with immediate commit 1feb40067cf04ae48d65f728d62ca255c9449178 upstream. The handling of IB_RDMA_WRITE_ONLY_WITH_IMMEDIATE will leak a memory reference when a buffer cannot be allocated for returning the immediate data. The issue is that the rkey validation has already occurred and the RNR nak fails to release the reference that was fruitlessly gotten. The the peer will send the identical single packet request when its RNR timer pops. The fix is to release the held reference prior to the rnr nak exit. This is the only sequence the requires both rkey validation and the buffer allocation on the same packet. Tested-by: Tadeusz Struk Reviewed-by: Dennis Dalessandro Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/hw/hfi1/rc.c | 5 ++++- drivers/infiniband/hw/qib/qib_rc.c | 4 +++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 83198a8a8797..4bd5b5caa243 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -2366,8 +2366,11 @@ send_last: ret = hfi1_rvt_get_rwqe(qp, 1); if (ret < 0) goto nack_op_err; - if (!ret) + if (!ret) { + /* peer will send again */ + rvt_put_ss(&qp->r_sge); goto rnr_nak; + } wc.ex.imm_data = ohdr->u.rc.imm_data; wc.wc_flags = IB_WC_WITH_IMM; goto send_last; diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c index 2097512e75aa..f3fe787c9426 100644 --- a/drivers/infiniband/hw/qib/qib_rc.c +++ b/drivers/infiniband/hw/qib/qib_rc.c @@ -2067,8 +2067,10 @@ send_last: ret = qib_get_rwqe(qp, 1); if (ret < 0) goto nack_op_err; - if (!ret) + if (!ret) { + rvt_put_ss(&qp->r_sge); goto rnr_nak; + } wc.ex.imm_data = ohdr->u.rc.imm_data; hdrsize += 4; wc.wc_flags = IB_WC_WITH_IMM; From d5ecb4ca0da7528c0f1ebc676d831b00efd37efa Mon Sep 17 00:00:00 2001 From: Rob Landley Date: Sat, 20 May 2017 15:03:29 -0500 Subject: [PATCH 075/235] x86/boot: Use CROSS_COMPILE prefix for readelf commit 3780578761921f094179c6289072a74b2228c602 upstream. The boot code Makefile contains a straight 'readelf' invocation. This causes build warnings in cross compile environments, when there is no unprefixed readelf accessible via $PATH. Add the missing $(CROSS_COMPILE) prefix. [ tglx: Rewrote changelog ] Fixes: 98f78525371b ("x86/boot: Refuse to build with data relocations") Signed-off-by: Rob Landley Acked-by: Kees Cook Cc: Jiri Kosina Cc: Paul Bolle Cc: "H.J. Lu" Link: http://lkml.kernel.org/r/ced18878-693a-9576-a024-113ef39a22c0@landley.net Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/compressed/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 34d9e15857c3..4669b3a931ed 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -94,7 +94,7 @@ vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o quiet_cmd_check_data_rel = DATAREL $@ define cmd_check_data_rel for obj in $(filter %.o,$^); do \ - readelf -S $$obj | grep -qF .rel.local && { \ + ${CROSS_COMPILE}readelf -S $$obj | grep -qF .rel.local && { \ echo "error: $$obj has data relocations!" >&2; \ exit 1; \ } || true; \ From 873f3b0ebbfe454f092d038a4bd1e0e17622c786 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 2 Jun 2017 14:46:11 -0700 Subject: [PATCH 076/235] ksm: prevent crash after write_protect_page fails commit a7306c3436e9c8e584a4b9fad5f3dc91be2a6076 upstream. "err" needs to be left set to -EFAULT if split_huge_page succeeds. Otherwise if "err" gets clobbered with zero and write_protect_page fails, try_to_merge_one_page() will succeed instead of returning -EFAULT and then try_to_merge_with_ksm_page() will continue thinking kpage is a PageKsm when in fact it's still an anonymous page. Eventually it'll crash in page_add_anon_rmap. This has been reproduced on Fedora25 kernel but I can reproduce with upstream too. The bug was introduced in commit f765f540598a ("ksm: prepare to new THP semantics") introduced in v4.5. page:fffff67546ce1cc0 count:4 mapcount:2 mapping:ffffa094551e36e1 index:0x7f0f46673 flags: 0x2ffffc0004007c(referenced|uptodate|dirty|lru|active|swapbacked) page dumped because: VM_BUG_ON_PAGE(!PageLocked(page)) page->mem_cgroup:ffffa09674bf0000 ------------[ cut here ]------------ kernel BUG at mm/rmap.c:1222! CPU: 1 PID: 76 Comm: ksmd Not tainted 4.9.3-200.fc25.x86_64 #1 RIP: do_page_add_anon_rmap+0x1c4/0x240 Call Trace: page_add_anon_rmap+0x18/0x20 try_to_merge_with_ksm_page+0x50b/0x780 ksm_scan_thread+0x1211/0x1410 ? prepare_to_wait_event+0x100/0x100 ? try_to_merge_with_ksm_page+0x780/0x780 kthread+0xd9/0xf0 ? kthread_park+0x60/0x60 ret_from_fork+0x25/0x30 Fixes: f765f54059 ("ksm: prepare to new THP semantics") Link: http://lkml.kernel.org/r/20170513131040.21732-1-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Reported-by: Federico Simoncelli Acked-by: Kirill A. Shutemov Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/ksm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 9ae6011a41f8..caa54a55a357 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1002,8 +1002,7 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, goto out; if (PageTransCompound(page)) { - err = split_huge_page(page); - if (err) + if (split_huge_page(page)) goto out_unlock; } From c1bb2a899b5f5edc10cd5b5b9b0496cddb533565 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 2 Jun 2017 14:46:25 -0700 Subject: [PATCH 077/235] slub/memcg: cure the brainless abuse of sysfs attributes commit 478fe3037b2278d276d4cd9cd0ab06c4cb2e9b32 upstream. memcg_propagate_slab_attrs() abuses the sysfs attribute file functions to propagate settings from the root kmem_cache to a newly created kmem_cache. It does that with: attr->show(root, buf); attr->store(new, buf, strlen(bug); Aside of being a lazy and absurd hackery this is broken because it does not check the return value of the show() function. Some of the show() functions return 0 w/o touching the buffer. That means in such a case the store function is called with the stale content of the previous show(). That causes nonsense like invoking kmem_cache_shrink() on a newly created kmem_cache. In the worst case it would cause handing in an uninitialized buffer. This should be rewritten proper by adding a propagate() callback to those slub_attributes which must be propagated and avoid that insane conversion to and from ASCII, but that's too large for a hot fix. Check at least the return value of the show() function, so calling store() with stale content is prevented. Steven said: "It can cause a deadlock with get_online_cpus() that has been uncovered by recent cpu hotplug and lockdep changes that Thomas and Peter have been doing. Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(cpu_hotplug.lock); lock(slab_mutex); lock(cpu_hotplug.lock); lock(slab_mutex); *** DEADLOCK ***" Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1705201244540.2255@nanos Signed-off-by: Thomas Gleixner Reported-by: Steven Rostedt Acked-by: David Rientjes Cc: Johannes Weiner Cc: Michal Hocko Cc: Peter Zijlstra Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/slub.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 58c7526f8de2..912aedda1633 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5452,6 +5452,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) char mbuf[64]; char *buf; struct slab_attribute *attr = to_slab_attr(slab_attrs[i]); + ssize_t len; if (!attr || !attr->store || !attr->show) continue; @@ -5476,8 +5477,9 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) buf = buffer; } - attr->show(root_cache, buf); - attr->store(s, buf, strlen(buf)); + len = attr->show(root_cache, buf); + if (len > 0) + attr->store(s, buf, len); } if (buffer) From 74b416367b4ea3aad7c4ff566345a4cf3cf5a80b Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Tue, 24 Jan 2017 15:18:02 -0800 Subject: [PATCH 078/235] mm/slub.c: trace free objects at KERN_INFO commit aa2efd5ea4041754da4046c3d2e7edaac9526258 upstream. Currently when trace is enabled (e.g. slub_debug=T,kmalloc-128 ) the trace messages are mostly output at KERN_INFO. However the trace code also calls print_section() to hexdump the head of a free object. This is hard coded to use KERN_ERR, meaning the console is deluged with trace messages even if we've asked for quiet. Fix this the obvious way but adding a level parameter to print_section(), allowing calls from the trace code to use the same trace level as other trace messages. Link: http://lkml.kernel.org/r/20170113154850.518-1-daniel.thompson@linaro.org Signed-off-by: Daniel Thompson Acked-by: Christoph Lameter Acked-by: David Rientjes Cc: Pekka Enberg Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- mm/slub.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 912aedda1633..edc79ca3c6d5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -496,10 +496,11 @@ static inline int check_valid_pointer(struct kmem_cache *s, return 1; } -static void print_section(char *text, u8 *addr, unsigned int length) +static void print_section(char *level, char *text, u8 *addr, + unsigned int length) { metadata_access_enable(); - print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr, + print_hex_dump(level, text, DUMP_PREFIX_ADDRESS, 16, 1, addr, length, 1); metadata_access_disable(); } @@ -636,14 +637,15 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) p, p - addr, get_freepointer(s, p)); if (s->flags & SLAB_RED_ZONE) - print_section("Redzone ", p - s->red_left_pad, s->red_left_pad); + print_section(KERN_ERR, "Redzone ", p - s->red_left_pad, + s->red_left_pad); else if (p > addr + 16) - print_section("Bytes b4 ", p - 16, 16); + print_section(KERN_ERR, "Bytes b4 ", p - 16, 16); - print_section("Object ", p, min_t(unsigned long, s->object_size, - PAGE_SIZE)); + print_section(KERN_ERR, "Object ", p, + min_t(unsigned long, s->object_size, PAGE_SIZE)); if (s->flags & SLAB_RED_ZONE) - print_section("Redzone ", p + s->object_size, + print_section(KERN_ERR, "Redzone ", p + s->object_size, s->inuse - s->object_size); if (s->offset) @@ -658,7 +660,8 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) if (off != size_from_object(s)) /* Beginning of the filler is the free pointer */ - print_section("Padding ", p + off, size_from_object(s) - off); + print_section(KERN_ERR, "Padding ", p + off, + size_from_object(s) - off); dump_stack(); } @@ -820,7 +823,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) end--; slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); - print_section("Padding ", end - remainder, remainder); + print_section(KERN_ERR, "Padding ", end - remainder, remainder); restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end); return 0; @@ -973,7 +976,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, page->freelist); if (!alloc) - print_section("Object ", (void *)object, + print_section(KERN_INFO, "Object ", (void *)object, s->object_size); dump_stack(); From 670821b9482d037915e75bdd307ec87f231dd70a Mon Sep 17 00:00:00 2001 From: Patrik Jakobsson Date: Tue, 18 Apr 2017 13:43:32 +0200 Subject: [PATCH 079/235] drm/gma500/psb: Actually use VBT mode when it is found commit 82bc9a42cf854fdf63155759c0aa790bd1f361b0 upstream. With LVDS we were incorrectly picking the pre-programmed mode instead of the prefered mode provided by VBT. Make sure we pick the VBT mode if one is provided. It is likely that the mode read-out code is still wrong but this patch fixes the immediate problem on most machines. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=78562 Signed-off-by: Patrik Jakobsson Link: http://patchwork.freedesktop.org/patch/msgid/20170418114332.12183-1-patrik.r.jakobsson@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/gma500/psb_intel_lvds.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/gma500/psb_intel_lvds.c b/drivers/gpu/drm/gma500/psb_intel_lvds.c index fd7c91254841..79e9d3690667 100644 --- a/drivers/gpu/drm/gma500/psb_intel_lvds.c +++ b/drivers/gpu/drm/gma500/psb_intel_lvds.c @@ -774,20 +774,23 @@ void psb_intel_lvds_init(struct drm_device *dev, if (scan->type & DRM_MODE_TYPE_PREFERRED) { mode_dev->panel_fixed_mode = drm_mode_duplicate(dev, scan); + DRM_DEBUG_KMS("Using mode from DDC\n"); goto out; /* FIXME: check for quirks */ } } /* Failed to get EDID, what about VBT? do we need this? */ - if (mode_dev->vbt_mode) + if (dev_priv->lfp_lvds_vbt_mode) { mode_dev->panel_fixed_mode = - drm_mode_duplicate(dev, mode_dev->vbt_mode); + drm_mode_duplicate(dev, dev_priv->lfp_lvds_vbt_mode); - if (!mode_dev->panel_fixed_mode) - if (dev_priv->lfp_lvds_vbt_mode) - mode_dev->panel_fixed_mode = - drm_mode_duplicate(dev, - dev_priv->lfp_lvds_vbt_mode); + if (mode_dev->panel_fixed_mode) { + mode_dev->panel_fixed_mode->type |= + DRM_MODE_TYPE_PREFERRED; + DRM_DEBUG_KMS("Using mode from VBT\n"); + goto out; + } + } /* * If we didn't get EDID, try checking if the panel is already turned @@ -804,6 +807,7 @@ void psb_intel_lvds_init(struct drm_device *dev, if (mode_dev->panel_fixed_mode) { mode_dev->panel_fixed_mode->type |= DRM_MODE_TYPE_PREFERRED; + DRM_DEBUG_KMS("Using pre-programmed mode\n"); goto out; /* FIXME: check for quirks */ } } From c9eab63b9e6210a9bd3b124e43ad575f5f52a130 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 18 May 2017 16:36:22 -0700 Subject: [PATCH 080/235] xfs: Fix missed holes in SEEK_HOLE implementation commit 5375023ae1266553a7baa0845e82917d8803f48c upstream. XFS SEEK_HOLE implementation could miss a hole in an unwritten extent as can be seen by the following command: xfs_io -c "falloc 0 256k" -c "pwrite 0 56k" -c "pwrite 128k 8k" -c "seek -h 0" file wrote 57344/57344 bytes at offset 0 56 KiB, 14 ops; 0.0000 sec (49.312 MiB/sec and 12623.9856 ops/sec) wrote 8192/8192 bytes at offset 131072 8 KiB, 2 ops; 0.0000 sec (70.383 MiB/sec and 18018.0180 ops/sec) Whence Result HOLE 139264 Where we can see that hole at offset 56k was just ignored by SEEK_HOLE implementation. The bug is in xfs_find_get_desired_pgoff() which does not properly detect the case when pages are not contiguous. Fix the problem by properly detecting when found page has larger offset than expected. Fixes: d126d43f631f996daeee5006714fed914be32368 Signed-off-by: Jan Kara Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_file.c | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 1209ad29e902..438390f2e7c4 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1163,17 +1163,6 @@ xfs_find_get_desired_pgoff( break; } - /* - * At lease we found one page. If this is the first time we - * step into the loop, and if the first page index offset is - * greater than the given search offset, a hole was found. - */ - if (type == HOLE_OFF && lastoff == startoff && - lastoff < page_offset(pvec.pages[0])) { - found = true; - break; - } - for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; loff_t b_offset; @@ -1185,18 +1174,18 @@ xfs_find_get_desired_pgoff( * file mapping. However, page->index will not change * because we have a reference on the page. * - * Searching done if the page index is out of range. - * If the current offset is not reaches the end of - * the specified search range, there should be a hole - * between them. + * If current page offset is beyond where we've ended, + * we've found a hole. */ - if (page->index > end) { - if (type == HOLE_OFF && lastoff < endoff) { - *offset = lastoff; - found = true; - } + if (type == HOLE_OFF && lastoff < endoff && + lastoff < page_offset(pvec.pages[i])) { + found = true; + *offset = lastoff; goto out; } + /* Searching done if the page index is out of range. */ + if (page->index > end) + goto out; lock_page(page); /* From 9c795fff53f934952c84e80e568b488492d8f805 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 31 May 2017 08:22:52 -0700 Subject: [PATCH 081/235] xfs: use ->b_state to fix buffer I/O accounting release race commit 63db7c815bc0997c29e484d2409684fdd9fcd93b upstream. We've had user reports of unmount hangs in xfs_wait_buftarg() that analysis shows is due to btp->bt_io_count == -1. bt_io_count represents the count of in-flight asynchronous buffers and thus should always be >= 0. xfs_wait_buftarg() waits for this value to stabilize to zero in order to ensure that all untracked (with respect to the lru) buffers have completed I/O processing before unmount proceeds to tear down in-core data structures. The value of -1 implies an I/O accounting decrement race. Indeed, the fact that xfs_buf_ioacct_dec() is called from xfs_buf_rele() (where the buffer lock is no longer held) means that bp->b_flags can be updated from an unsafe context. While a user-level reproducer is currently not available, some intrusive hacks to run racing buffer lookups/ioacct/releases from multiple threads was used to successfully manufacture this problem. Existing callers do not expect to acquire the buffer lock from xfs_buf_rele(). Therefore, we can not safely update ->b_flags from this context. It turns out that we already have separate buffer state bits and associated serialization for dealing with buffer LRU state in the form of ->b_state and ->b_lock. Therefore, replace the _XBF_IN_FLIGHT flag with a ->b_state variant, update the I/O accounting wrappers appropriately and make sure they are used with the correct locking. This ensures that buffer in-flight state can be modified at buffer release time without racing with modifications from a buffer lock holder. Fixes: 9c7504aa72b6 ("xfs: track and serialize in-flight async buffers against unmount") Signed-off-by: Brian Foster Reviewed-by: Nikolay Borisov Tested-by: Libor Pechacek Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_buf.c | 38 ++++++++++++++++++++++++++------------ fs/xfs/xfs_buf.h | 5 ++--- 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index d7a67d7fbc7f..93a00c925eb9 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -96,27 +96,41 @@ static inline void xfs_buf_ioacct_inc( struct xfs_buf *bp) { - if (bp->b_flags & (XBF_NO_IOACCT|_XBF_IN_FLIGHT)) + if (bp->b_flags & XBF_NO_IOACCT) return; ASSERT(bp->b_flags & XBF_ASYNC); - bp->b_flags |= _XBF_IN_FLIGHT; - percpu_counter_inc(&bp->b_target->bt_io_count); + spin_lock(&bp->b_lock); + if (!(bp->b_state & XFS_BSTATE_IN_FLIGHT)) { + bp->b_state |= XFS_BSTATE_IN_FLIGHT; + percpu_counter_inc(&bp->b_target->bt_io_count); + } + spin_unlock(&bp->b_lock); } /* * Clear the in-flight state on a buffer about to be released to the LRU or * freed and unaccount from the buftarg. */ +static inline void +__xfs_buf_ioacct_dec( + struct xfs_buf *bp) +{ + ASSERT(spin_is_locked(&bp->b_lock)); + + if (bp->b_state & XFS_BSTATE_IN_FLIGHT) { + bp->b_state &= ~XFS_BSTATE_IN_FLIGHT; + percpu_counter_dec(&bp->b_target->bt_io_count); + } +} + static inline void xfs_buf_ioacct_dec( struct xfs_buf *bp) { - if (!(bp->b_flags & _XBF_IN_FLIGHT)) - return; - - bp->b_flags &= ~_XBF_IN_FLIGHT; - percpu_counter_dec(&bp->b_target->bt_io_count); + spin_lock(&bp->b_lock); + __xfs_buf_ioacct_dec(bp); + spin_unlock(&bp->b_lock); } /* @@ -148,9 +162,9 @@ xfs_buf_stale( * unaccounted (released to LRU) before that occurs. Drop in-flight * status now to preserve accounting consistency. */ - xfs_buf_ioacct_dec(bp); - spin_lock(&bp->b_lock); + __xfs_buf_ioacct_dec(bp); + atomic_set(&bp->b_lru_ref, 0); if (!(bp->b_state & XFS_BSTATE_DISPOSE) && (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru))) @@ -953,12 +967,12 @@ xfs_buf_rele( * ensures the decrement occurs only once per-buf. */ if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru)) - xfs_buf_ioacct_dec(bp); + __xfs_buf_ioacct_dec(bp); goto out_unlock; } /* the last reference has been dropped ... */ - xfs_buf_ioacct_dec(bp); + __xfs_buf_ioacct_dec(bp); if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { /* * If the buffer is added to the LRU take a new reference to the diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 1c2e52b2d926..2768b508280f 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -63,7 +63,6 @@ typedef enum { #define _XBF_KMEM (1 << 21)/* backed by heap memory */ #define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ #define _XBF_COMPOUND (1 << 23)/* compound buffer */ -#define _XBF_IN_FLIGHT (1 << 25) /* I/O in flight, for accounting purposes */ typedef unsigned int xfs_buf_flags_t; @@ -83,14 +82,14 @@ typedef unsigned int xfs_buf_flags_t; { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ - { _XBF_COMPOUND, "COMPOUND" }, \ - { _XBF_IN_FLIGHT, "IN_FLIGHT" } + { _XBF_COMPOUND, "COMPOUND" } /* * Internal state flags. */ #define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */ +#define XFS_BSTATE_IN_FLIGHT (1 << 1) /* I/O in flight */ /* * The xfs_buftarg contains 2 notions of "sector size" - From 11b4854772859d5a755cd575f68e2e2e3234ef43 Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Tue, 23 May 2017 08:30:46 -0700 Subject: [PATCH 082/235] xfs: fix off-by-one on max nr_pages in xfs_find_get_desired_pgoff() commit 8affebe16d79ebefb1d9d6d56a46dc89716f9453 upstream. xfs_find_get_desired_pgoff() is used to search for offset of hole or data in page range [index, end] (both inclusive), and the max number of pages to search should be at least one, if end == index. Otherwise the only page is missed and no hole or data is found, which is not correct. When block size is smaller than page size, this can be demonstrated by preallocating a file with size smaller than page size and writing data to the last block. E.g. run this xfs_io command on a 1k block size XFS on x86_64 host. # xfs_io -fc "falloc 0 3k" -c "pwrite 2k 1k" \ -c "seek -d 0" /mnt/xfs/testfile wrote 1024/1024 bytes at offset 2048 1 KiB, 1 ops; 0.0000 sec (33.675 MiB/sec and 34482.7586 ops/sec) Whence Result DATA EOF Data at offset 2k was missed, and lseek(2) returned ENXIO. This is uncovered by generic/285 subtest 07 and 08 on ppc64 host, where pagesize is 64k. Because a recent change to generic/285 reduced the preallocated file size to smaller than 64k. Signed-off-by: Eryu Guan Reviewed-by: Jan Kara Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 438390f2e7c4..9292a59efbfa 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1136,7 +1136,7 @@ xfs_find_get_desired_pgoff( unsigned nr_pages; unsigned int i; - want = min_t(pgoff_t, end - index, PAGEVEC_SIZE); + want = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1; nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, want); /* From 815414e7648b6d94cb81542dfb15cb71cbbd2ae9 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 15 Mar 2017 00:24:25 -0700 Subject: [PATCH 083/235] xfs: verify inline directory data forks commit 630a04e79dd41ff746b545d4fc052e0abb836120 upstream. When we're reading or writing the data fork of an inline directory, check the contents to make sure we're not overflowing buffers or eating garbage data. xfs/348 corrupts an inline symlink into an inline directory, triggering a buffer overflow bug. v2: add more checks consistent with _dir2_sf_check and make the verifier usable from anywhere. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Greg Kroah-Hartman --- fs/xfs/libxfs/xfs_dir2_priv.h | 2 + fs/xfs/libxfs/xfs_dir2_sf.c | 87 ++++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_inode_fork.c | 26 ++++++++-- fs/xfs/libxfs/xfs_inode_fork.h | 2 +- fs/xfs/xfs_dir2_readdir.c | 11 ----- fs/xfs/xfs_inode.c | 12 +++-- 6 files changed, 122 insertions(+), 18 deletions(-) diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index ef9f6ead96a4..860be99407bf 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -126,6 +126,8 @@ extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino); extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); extern int xfs_dir2_sf_removename(struct xfs_da_args *args); extern int xfs_dir2_sf_replace(struct xfs_da_args *args); +extern int xfs_dir2_sf_verify(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *sfp, + int size); /* xfs_dir2_readdir.c */ extern int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx, diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index c6809ff41197..96b45cd6c63f 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -629,6 +629,93 @@ xfs_dir2_sf_check( } #endif /* DEBUG */ +/* Verify the consistency of an inline directory. */ +int +xfs_dir2_sf_verify( + struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *sfp, + int size) +{ + struct xfs_dir2_sf_entry *sfep; + struct xfs_dir2_sf_entry *next_sfep; + char *endp; + const struct xfs_dir_ops *dops; + xfs_ino_t ino; + int i; + int i8count; + int offset; + __uint8_t filetype; + + dops = xfs_dir_get_ops(mp, NULL); + + /* + * Give up if the directory is way too short. + */ + XFS_WANT_CORRUPTED_RETURN(mp, size > + offsetof(struct xfs_dir2_sf_hdr, parent)); + XFS_WANT_CORRUPTED_RETURN(mp, size >= + xfs_dir2_sf_hdr_size(sfp->i8count)); + + endp = (char *)sfp + size; + + /* Check .. entry */ + ino = dops->sf_get_parent_ino(sfp); + i8count = ino > XFS_DIR2_MAX_SHORT_INUM; + XFS_WANT_CORRUPTED_RETURN(mp, !xfs_dir_ino_validate(mp, ino)); + offset = dops->data_first_offset; + + /* Check all reported entries */ + sfep = xfs_dir2_sf_firstentry(sfp); + for (i = 0; i < sfp->count; i++) { + /* + * struct xfs_dir2_sf_entry has a variable length. + * Check the fixed-offset parts of the structure are + * within the data buffer. + */ + XFS_WANT_CORRUPTED_RETURN(mp, + ((char *)sfep + sizeof(*sfep)) < endp); + + /* Don't allow names with known bad length. */ + XFS_WANT_CORRUPTED_RETURN(mp, sfep->namelen > 0); + XFS_WANT_CORRUPTED_RETURN(mp, sfep->namelen < MAXNAMELEN); + + /* + * Check that the variable-length part of the structure is + * within the data buffer. The next entry starts after the + * name component, so nextentry is an acceptable test. + */ + next_sfep = dops->sf_nextentry(sfp, sfep); + XFS_WANT_CORRUPTED_RETURN(mp, endp >= (char *)next_sfep); + + /* Check that the offsets always increase. */ + XFS_WANT_CORRUPTED_RETURN(mp, + xfs_dir2_sf_get_offset(sfep) >= offset); + + /* Check the inode number. */ + ino = dops->sf_get_ino(sfp, sfep); + i8count += ino > XFS_DIR2_MAX_SHORT_INUM; + XFS_WANT_CORRUPTED_RETURN(mp, !xfs_dir_ino_validate(mp, ino)); + + /* Check the file type. */ + filetype = dops->sf_get_ftype(sfep); + XFS_WANT_CORRUPTED_RETURN(mp, filetype < XFS_DIR3_FT_MAX); + + offset = xfs_dir2_sf_get_offset(sfep) + + dops->data_entsize(sfep->namelen); + + sfep = next_sfep; + } + XFS_WANT_CORRUPTED_RETURN(mp, i8count == sfp->i8count); + XFS_WANT_CORRUPTED_RETURN(mp, (void *)sfep == (void *)endp); + + /* Make sure this whole thing ought to be in local format. */ + XFS_WANT_CORRUPTED_RETURN(mp, offset + + (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) + + (uint)sizeof(xfs_dir2_block_tail_t) <= mp->m_dir_geo->blksize); + + return 0; +} + /* * Create a new (shortform) directory. */ diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 25c1e078aef6..9653e964eda4 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -33,6 +33,8 @@ #include "xfs_trace.h" #include "xfs_attr_sf.h" #include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2_priv.h" kmem_zone_t *xfs_ifork_zone; @@ -320,6 +322,7 @@ xfs_iformat_local( int whichfork, int size) { + int error; /* * If the size is unreasonable, then something @@ -336,6 +339,14 @@ xfs_iformat_local( return -EFSCORRUPTED; } + if (S_ISDIR(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK) { + error = xfs_dir2_sf_verify(ip->i_mount, + (struct xfs_dir2_sf_hdr *)XFS_DFORK_DPTR(dip), + size); + if (error) + return error; + } + xfs_init_local_fork(ip, whichfork, XFS_DFORK_PTR(dip, whichfork), size); return 0; } @@ -856,7 +867,7 @@ xfs_iextents_copy( * In these cases, the format always takes precedence, because the * format indicates the current state of the fork. */ -void +int xfs_iflush_fork( xfs_inode_t *ip, xfs_dinode_t *dip, @@ -866,6 +877,7 @@ xfs_iflush_fork( char *cp; xfs_ifork_t *ifp; xfs_mount_t *mp; + int error; static const short brootflag[2] = { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT }; static const short dataflag[2] = @@ -874,7 +886,7 @@ xfs_iflush_fork( { XFS_ILOG_DEXT, XFS_ILOG_AEXT }; if (!iip) - return; + return 0; ifp = XFS_IFORK_PTR(ip, whichfork); /* * This can happen if we gave up in iformat in an error path, @@ -882,12 +894,19 @@ xfs_iflush_fork( */ if (!ifp) { ASSERT(whichfork == XFS_ATTR_FORK); - return; + return 0; } cp = XFS_DFORK_PTR(dip, whichfork); mp = ip->i_mount; switch (XFS_IFORK_FORMAT(ip, whichfork)) { case XFS_DINODE_FMT_LOCAL: + if (S_ISDIR(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK) { + error = xfs_dir2_sf_verify(mp, + (struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data, + ifp->if_bytes); + if (error) + return error; + } if ((iip->ili_fields & dataflag[whichfork]) && (ifp->if_bytes > 0)) { ASSERT(ifp->if_u1.if_data != NULL); @@ -940,6 +959,7 @@ xfs_iflush_fork( ASSERT(0); break; } + return 0; } /* diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 7fb8365326d1..132dc59fdde6 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -140,7 +140,7 @@ typedef struct xfs_ifork { struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state); int xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *); -void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, +int xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, struct xfs_inode_log_item *, int); void xfs_idestroy_fork(struct xfs_inode *, int); void xfs_idata_realloc(struct xfs_inode *, int, int); diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 29816981b50a..6645a221bf8e 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -71,22 +71,11 @@ xfs_dir2_sf_getdents( struct xfs_da_geometry *geo = args->geo; ASSERT(dp->i_df.if_flags & XFS_IFINLINE); - /* - * Give up if the directory is way too short. - */ - if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { - ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); - return -EIO; - } - ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); ASSERT(dp->i_df.if_u1.if_data != NULL); sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - if (dp->i_d.di_size < xfs_dir2_sf_hdr_size(sfp->i8count)) - return -EFSCORRUPTED; - /* * If the block number in the offset is out of range, we're done. */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index e50636c9a89c..5af5e76db7a3 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3491,6 +3491,7 @@ xfs_iflush_int( struct xfs_inode_log_item *iip = ip->i_itemp; struct xfs_dinode *dip; struct xfs_mount *mp = ip->i_mount; + int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(xfs_isiflocked(ip)); @@ -3573,9 +3574,14 @@ xfs_iflush_int( if (ip->i_d.di_flushiter == DI_MAX_FLUSH) ip->i_d.di_flushiter = 0; - xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); - if (XFS_IFORK_Q(ip)) - xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); + error = xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); + if (error) + return error; + if (XFS_IFORK_Q(ip)) { + error = xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); + if (error) + return error; + } xfs_inobp_check(mp, bp); /* From ca659e086fb7c1584d5e665ea51bd1abd687ea00 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 3 Apr 2017 12:22:20 -0700 Subject: [PATCH 084/235] xfs: rework the inline directory verifiers commit 78420281a9d74014af7616958806c3aba056319e upstream. The inline directory verifiers should be called on the inode fork data, which means after iformat_local on the read side, and prior to ifork_flush on the write side. This makes the fork verifier more consistent with the way buffer verifiers work -- i.e. they will operate on the memory buffer that the code will be reading and writing directly. Furthermore, revise the verifier function to return -EFSCORRUPTED so that we don't flood the logs with corruption messages and assert notices. This has been a particular problem with xfs/348, which triggers the XFS_WANT_CORRUPTED_RETURN assertions, which halts the kernel when CONFIG_XFS_DEBUG=y. Disk corruption isn't supposed to do that, at least not in a verifier. Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/libxfs/xfs_dir2_priv.h | 3 +- fs/xfs/libxfs/xfs_dir2_sf.c | 63 ++++++++++++++++++++++------------ fs/xfs/libxfs/xfs_inode_fork.c | 35 +++++++------------ fs/xfs/libxfs/xfs_inode_fork.h | 2 +- fs/xfs/xfs_inode.c | 19 +++++----- 5 files changed, 66 insertions(+), 56 deletions(-) diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index 860be99407bf..699a51bb2cbe 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -126,8 +126,7 @@ extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino); extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); extern int xfs_dir2_sf_removename(struct xfs_da_args *args); extern int xfs_dir2_sf_replace(struct xfs_da_args *args); -extern int xfs_dir2_sf_verify(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *sfp, - int size); +extern int xfs_dir2_sf_verify(struct xfs_inode *ip); /* xfs_dir2_readdir.c */ extern int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx, diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 96b45cd6c63f..e84af093b2ab 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -632,36 +632,49 @@ xfs_dir2_sf_check( /* Verify the consistency of an inline directory. */ int xfs_dir2_sf_verify( - struct xfs_mount *mp, - struct xfs_dir2_sf_hdr *sfp, - int size) + struct xfs_inode *ip) { + struct xfs_mount *mp = ip->i_mount; + struct xfs_dir2_sf_hdr *sfp; struct xfs_dir2_sf_entry *sfep; struct xfs_dir2_sf_entry *next_sfep; char *endp; const struct xfs_dir_ops *dops; + struct xfs_ifork *ifp; xfs_ino_t ino; int i; int i8count; int offset; + int size; + int error; __uint8_t filetype; + ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_LOCAL); + /* + * xfs_iread calls us before xfs_setup_inode sets up ip->d_ops, + * so we can only trust the mountpoint to have the right pointer. + */ dops = xfs_dir_get_ops(mp, NULL); + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + sfp = (struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data; + size = ifp->if_bytes; + /* * Give up if the directory is way too short. */ - XFS_WANT_CORRUPTED_RETURN(mp, size > - offsetof(struct xfs_dir2_sf_hdr, parent)); - XFS_WANT_CORRUPTED_RETURN(mp, size >= - xfs_dir2_sf_hdr_size(sfp->i8count)); + if (size <= offsetof(struct xfs_dir2_sf_hdr, parent) || + size < xfs_dir2_sf_hdr_size(sfp->i8count)) + return -EFSCORRUPTED; endp = (char *)sfp + size; /* Check .. entry */ ino = dops->sf_get_parent_ino(sfp); i8count = ino > XFS_DIR2_MAX_SHORT_INUM; - XFS_WANT_CORRUPTED_RETURN(mp, !xfs_dir_ino_validate(mp, ino)); + error = xfs_dir_ino_validate(mp, ino); + if (error) + return error; offset = dops->data_first_offset; /* Check all reported entries */ @@ -672,12 +685,12 @@ xfs_dir2_sf_verify( * Check the fixed-offset parts of the structure are * within the data buffer. */ - XFS_WANT_CORRUPTED_RETURN(mp, - ((char *)sfep + sizeof(*sfep)) < endp); + if (((char *)sfep + sizeof(*sfep)) >= endp) + return -EFSCORRUPTED; /* Don't allow names with known bad length. */ - XFS_WANT_CORRUPTED_RETURN(mp, sfep->namelen > 0); - XFS_WANT_CORRUPTED_RETURN(mp, sfep->namelen < MAXNAMELEN); + if (sfep->namelen == 0) + return -EFSCORRUPTED; /* * Check that the variable-length part of the structure is @@ -685,33 +698,39 @@ xfs_dir2_sf_verify( * name component, so nextentry is an acceptable test. */ next_sfep = dops->sf_nextentry(sfp, sfep); - XFS_WANT_CORRUPTED_RETURN(mp, endp >= (char *)next_sfep); + if (endp < (char *)next_sfep) + return -EFSCORRUPTED; /* Check that the offsets always increase. */ - XFS_WANT_CORRUPTED_RETURN(mp, - xfs_dir2_sf_get_offset(sfep) >= offset); + if (xfs_dir2_sf_get_offset(sfep) < offset) + return -EFSCORRUPTED; /* Check the inode number. */ ino = dops->sf_get_ino(sfp, sfep); i8count += ino > XFS_DIR2_MAX_SHORT_INUM; - XFS_WANT_CORRUPTED_RETURN(mp, !xfs_dir_ino_validate(mp, ino)); + error = xfs_dir_ino_validate(mp, ino); + if (error) + return error; /* Check the file type. */ filetype = dops->sf_get_ftype(sfep); - XFS_WANT_CORRUPTED_RETURN(mp, filetype < XFS_DIR3_FT_MAX); + if (filetype >= XFS_DIR3_FT_MAX) + return -EFSCORRUPTED; offset = xfs_dir2_sf_get_offset(sfep) + dops->data_entsize(sfep->namelen); sfep = next_sfep; } - XFS_WANT_CORRUPTED_RETURN(mp, i8count == sfp->i8count); - XFS_WANT_CORRUPTED_RETURN(mp, (void *)sfep == (void *)endp); + if (i8count != sfp->i8count) + return -EFSCORRUPTED; + if ((void *)sfep != (void *)endp) + return -EFSCORRUPTED; /* Make sure this whole thing ought to be in local format. */ - XFS_WANT_CORRUPTED_RETURN(mp, offset + - (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) + - (uint)sizeof(xfs_dir2_block_tail_t) <= mp->m_dir_geo->blksize); + if (offset + (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) + + (uint)sizeof(xfs_dir2_block_tail_t) > mp->m_dir_geo->blksize) + return -EFSCORRUPTED; return 0; } diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 9653e964eda4..8a37efe04de3 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -212,6 +212,16 @@ xfs_iformat_fork( if (error) return error; + /* Check inline dir contents. */ + if (S_ISDIR(VFS_I(ip)->i_mode) && + dip->di_format == XFS_DINODE_FMT_LOCAL) { + error = xfs_dir2_sf_verify(ip); + if (error) { + xfs_idestroy_fork(ip, XFS_DATA_FORK); + return error; + } + } + if (xfs_is_reflink_inode(ip)) { ASSERT(ip->i_cowfp == NULL); xfs_ifork_init_cow(ip); @@ -322,8 +332,6 @@ xfs_iformat_local( int whichfork, int size) { - int error; - /* * If the size is unreasonable, then something * is wrong and we just bail out rather than crash in @@ -339,14 +347,6 @@ xfs_iformat_local( return -EFSCORRUPTED; } - if (S_ISDIR(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK) { - error = xfs_dir2_sf_verify(ip->i_mount, - (struct xfs_dir2_sf_hdr *)XFS_DFORK_DPTR(dip), - size); - if (error) - return error; - } - xfs_init_local_fork(ip, whichfork, XFS_DFORK_PTR(dip, whichfork), size); return 0; } @@ -867,7 +867,7 @@ xfs_iextents_copy( * In these cases, the format always takes precedence, because the * format indicates the current state of the fork. */ -int +void xfs_iflush_fork( xfs_inode_t *ip, xfs_dinode_t *dip, @@ -877,7 +877,6 @@ xfs_iflush_fork( char *cp; xfs_ifork_t *ifp; xfs_mount_t *mp; - int error; static const short brootflag[2] = { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT }; static const short dataflag[2] = @@ -886,7 +885,7 @@ xfs_iflush_fork( { XFS_ILOG_DEXT, XFS_ILOG_AEXT }; if (!iip) - return 0; + return; ifp = XFS_IFORK_PTR(ip, whichfork); /* * This can happen if we gave up in iformat in an error path, @@ -894,19 +893,12 @@ xfs_iflush_fork( */ if (!ifp) { ASSERT(whichfork == XFS_ATTR_FORK); - return 0; + return; } cp = XFS_DFORK_PTR(dip, whichfork); mp = ip->i_mount; switch (XFS_IFORK_FORMAT(ip, whichfork)) { case XFS_DINODE_FMT_LOCAL: - if (S_ISDIR(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK) { - error = xfs_dir2_sf_verify(mp, - (struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data, - ifp->if_bytes); - if (error) - return error; - } if ((iip->ili_fields & dataflag[whichfork]) && (ifp->if_bytes > 0)) { ASSERT(ifp->if_u1.if_data != NULL); @@ -959,7 +951,6 @@ xfs_iflush_fork( ASSERT(0); break; } - return 0; } /* diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 132dc59fdde6..7fb8365326d1 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -140,7 +140,7 @@ typedef struct xfs_ifork { struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state); int xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *); -int xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, +void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, struct xfs_inode_log_item *, int); void xfs_idestroy_fork(struct xfs_inode *, int); void xfs_idata_realloc(struct xfs_inode *, int, int); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 5af5e76db7a3..eded851cbd85 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -50,6 +50,7 @@ #include "xfs_log.h" #include "xfs_bmap_btree.h" #include "xfs_reflink.h" +#include "xfs_dir2_priv.h" kmem_zone_t *xfs_inode_zone; @@ -3491,7 +3492,6 @@ xfs_iflush_int( struct xfs_inode_log_item *iip = ip->i_itemp; struct xfs_dinode *dip; struct xfs_mount *mp = ip->i_mount; - int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(xfs_isiflocked(ip)); @@ -3563,6 +3563,12 @@ xfs_iflush_int( if (ip->i_d.di_version < 3) ip->i_d.di_flushiter++; + /* Check the inline directory data. */ + if (S_ISDIR(VFS_I(ip)->i_mode) && + ip->i_d.di_format == XFS_DINODE_FMT_LOCAL && + xfs_dir2_sf_verify(ip)) + goto corrupt_out; + /* * Copy the dirty parts of the inode into the on-disk inode. We always * copy out the core of the inode, because if the inode is dirty at all @@ -3574,14 +3580,9 @@ xfs_iflush_int( if (ip->i_d.di_flushiter == DI_MAX_FLUSH) ip->i_d.di_flushiter = 0; - error = xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); - if (error) - return error; - if (XFS_IFORK_Q(ip)) { - error = xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); - if (error) - return error; - } + xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); + if (XFS_IFORK_Q(ip)) + xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); xfs_inobp_check(mp, bp); /* From 3890d83805fe0f9c854fa3445358714aaa71d1ca Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 3 Apr 2017 12:22:39 -0700 Subject: [PATCH 085/235] xfs: fix kernel memory exposure problems commit bf9216f922612d2db7666aae01e65064da2ffb3a upstream. Fix a memory exposure problems in inumbers where we allocate an array of structures with holes, fail to zero the holes, then blindly copy the kernel memory contents (junk and all) into userspace. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_itable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 66e881790c17..d8a77dbf4e3a 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -585,7 +585,7 @@ xfs_inumbers( return error; bcount = MIN(left, (int)(PAGE_SIZE / sizeof(*buffer))); - buffer = kmem_alloc(bcount * sizeof(*buffer), KM_SLEEP); + buffer = kmem_zalloc(bcount * sizeof(*buffer), KM_SLEEP); do { struct xfs_inobt_rec_incore r; int stat; From c2ad2dc3d2648d603e7ab7dff4c6b38122206673 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 28 Mar 2017 14:51:44 -0700 Subject: [PATCH 086/235] xfs: use dedicated log worker wq to avoid deadlock with cil wq commit 696a562072e3c14bcd13ae5acc19cdf27679e865 upstream. The log covering background task used to be part of the xfssyncd workqueue. That workqueue was removed as of commit 5889608df ("xfs: syncd workqueue is no more") and the associated work item scheduled to the xfs-log wq. The latter is used for log buffer I/O completion. Since xfs_log_worker() can invoke a log flush, a deadlock is possible between the xfs-log and xfs-cil workqueues. Consider the following codepath from xfs_log_worker(): xfs_log_worker() xfs_log_force() _xfs_log_force() xlog_cil_force() xlog_cil_force_lsn() xlog_cil_push_now() flush_work() The above is in xfs-log wq context and blocked waiting on the completion of an xfs-cil work item. Concurrently, the cil push in progress can end up blocked here: xlog_cil_push_work() xlog_cil_push() xlog_write() xlog_state_get_iclog_space() xlog_wait(&log->l_flush_wait, ...) The above is in xfs-cil context waiting on log buffer I/O completion, which executes in xfs-log wq context. In this scenario both workqueues are deadlocked waiting on eachother. Add a new workqueue specifically for the high level log covering and ail pushing worker, as was the case prior to commit 5889608df. Diagnosed-by: David Jeffery Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_log.c | 2 +- fs/xfs/xfs_mount.h | 1 + fs/xfs/xfs_super.c | 8 ++++++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 4017aa967331..b57ab34fbf3c 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1293,7 +1293,7 @@ void xfs_log_work_queue( struct xfs_mount *mp) { - queue_delayed_work(mp->m_log_workqueue, &mp->m_log->l_work, + queue_delayed_work(mp->m_sync_workqueue, &mp->m_log->l_work, msecs_to_jiffies(xfs_syncd_centisecs * 10)); } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 1bf878b0492c..5415f9031ef8 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -183,6 +183,7 @@ typedef struct xfs_mount { struct workqueue_struct *m_reclaim_workqueue; struct workqueue_struct *m_log_workqueue; struct workqueue_struct *m_eofblocks_workqueue; + struct workqueue_struct *m_sync_workqueue; /* * Generation of the filesysyem layout. This is incremented by each diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index dbbd3f1fd2b7..882fb8524fcb 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -872,8 +872,15 @@ xfs_init_mount_workqueues( if (!mp->m_eofblocks_workqueue) goto out_destroy_log; + mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", WQ_FREEZABLE, 0, + mp->m_fsname); + if (!mp->m_sync_workqueue) + goto out_destroy_eofb; + return 0; +out_destroy_eofb: + destroy_workqueue(mp->m_eofblocks_workqueue); out_destroy_log: destroy_workqueue(mp->m_log_workqueue); out_destroy_reclaim: @@ -894,6 +901,7 @@ STATIC void xfs_destroy_mount_workqueues( struct xfs_mount *mp) { + destroy_workqueue(mp->m_sync_workqueue); destroy_workqueue(mp->m_eofblocks_workqueue); destroy_workqueue(mp->m_log_workqueue); destroy_workqueue(mp->m_reclaim_workqueue); From de417ea6b0a607e79b765838d397c0eb0bcb62d9 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 3 Apr 2017 15:17:57 -0700 Subject: [PATCH 087/235] xfs: fix over-copying of getbmap parameters from userspace commit be6324c00c4d1e0e665f03ed1fc18863a88da119 upstream. In xfs_ioc_getbmap, we should only copy the fields of struct getbmap from userspace, or else we end up copying random stack contents into the kernel. struct getbmap is a strict subset of getbmapx, so a partial structure copy should work fine. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_ioctl.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index a39197501a7c..73cfc7179124 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1542,10 +1542,11 @@ xfs_ioc_getbmap( unsigned int cmd, void __user *arg) { - struct getbmapx bmx; + struct getbmapx bmx = { 0 }; int error; - if (copy_from_user(&bmx, arg, sizeof(struct getbmapx))) + /* struct getbmap is a strict subset of struct getbmapx. */ + if (copy_from_user(&bmx, arg, offsetof(struct getbmapx, bmv_iflags))) return -EFAULT; if (bmx.bmv_count < 2) From 4e8163fc8159590c031783bfb0d548ca45f4ebdf Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 6 Apr 2017 16:00:39 -0700 Subject: [PATCH 088/235] xfs: actually report xattr extents via iomap commit 84358536dc355a9c8978ee425f87e116186bed16 upstream. Apparently FIEMAP for xattrs has been broken since we switched to the iomap backend because of an incorrect check for xattr presence. Also fix the broken locking. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_iomap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 360562484e7b..65740d1cbd92 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1151,10 +1151,10 @@ xfs_xattr_iomap_begin( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - lockmode = xfs_ilock_data_map_shared(ip); + lockmode = xfs_ilock_attr_map_shared(ip); /* if there are no attribute fork or extents, return ENOENT */ - if (XFS_IFORK_Q(ip) || !ip->i_d.di_anextents) { + if (!XFS_IFORK_Q(ip) || !ip->i_d.di_anextents) { error = -ENOENT; goto out_unlock; } From 4e2762878a59072f2ba1e02987d22ef77921791e Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 11 Apr 2017 10:50:05 -0700 Subject: [PATCH 089/235] xfs: drop iolock from reclaim context to appease lockdep commit 3b4683c294095b5f777c03307ef8c60f47320e12 upstream. Lockdep complains about use of the iolock in inode reclaim context because it doesn't understand that reclaim has the last reference to the inode, and thus an iolock->reclaim->iolock deadlock is not possible. The iolock is technically not necessary in xfs_inactive() and was only added to appease an assert in xfs_free_eofblocks(), which can be called from other non-reclaim contexts. Therefore, just kill the assert and drop the use of the iolock from reclaim context to quiet lockdep. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_bmap_util.c | 8 +++----- fs/xfs/xfs_inode.c | 9 +++++---- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 5328ecdd03d4..8a1a62ee84ad 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -911,9 +911,9 @@ xfs_can_free_eofblocks(struct xfs_inode *ip, bool force) } /* - * This is called by xfs_inactive to free any blocks beyond eof - * when the link count isn't zero and by xfs_dm_punch_hole() when - * punching a hole to EOF. + * This is called to free any blocks beyond eof. The caller must hold + * IOLOCK_EXCL unless we are in the inode reclaim path and have the only + * reference to the inode. */ int xfs_free_eofblocks( @@ -928,8 +928,6 @@ xfs_free_eofblocks( struct xfs_bmbt_irec imap; struct xfs_mount *mp = ip->i_mount; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); - /* * Figure out if there are any blocks beyond the end * of the file. If not, then there is nothing to do. diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index eded851cbd85..7a0b4eeb99e4 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1915,12 +1915,13 @@ xfs_inactive( * force is true because we are evicting an inode from the * cache. Post-eof blocks must be freed, lest we end up with * broken free space accounting. + * + * Note: don't bother with iolock here since lockdep complains + * about acquiring it in reclaim context. We have the only + * reference to the inode at this point anyways. */ - if (xfs_can_free_eofblocks(ip, true)) { - xfs_ilock(ip, XFS_IOLOCK_EXCL); + if (xfs_can_free_eofblocks(ip, true)) xfs_free_eofblocks(ip); - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - } return; } From 99226b890d63c5aaf8b5ccc95b17d1a8087f45c9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 11 Apr 2017 16:45:52 -0700 Subject: [PATCH 090/235] xfs: fix integer truncation in xfs_bmap_remap_alloc commit 52813fb13ff90bd9c39a93446cbf1103c290b6e9 upstream. bno should be a xfs_fsblock_t, which is 64-bit wides instead of a xfs_aglock_t, which truncates the value to 32 bits. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/libxfs/xfs_bmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 5a508b011e27..23f2b4b36f38 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3964,7 +3964,7 @@ xfs_bmap_remap_alloc( { struct xfs_trans *tp = ap->tp; struct xfs_mount *mp = tp->t_mountp; - xfs_agblock_t bno; + xfs_fsblock_t bno; struct xfs_alloc_arg args; int error; From 93bd169845e5d5911e51509d7c7e4cba4a3a2d5a Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 13 Apr 2017 15:15:47 -0700 Subject: [PATCH 091/235] xfs: handle array index overrun in xfs_dir2_leaf_readbuf() commit 023cc840b40fad95c6fe26fff1d380a8c9d45939 upstream. Carlos had a case where "find" seemed to start spinning forever and never return. This was on a filesystem with non-default multi-fsb (8k) directory blocks, and a fragmented directory with extents like this: 0:[0,133646,2,0] 1:[2,195888,1,0] 2:[3,195890,1,0] 3:[4,195892,1,0] 4:[5,195894,1,0] 5:[6,195896,1,0] 6:[7,195898,1,0] 7:[8,195900,1,0] 8:[9,195902,1,0] 9:[10,195908,1,0] 10:[11,195910,1,0] 11:[12,195912,1,0] 12:[13,195914,1,0] ... i.e. the first extent is a contiguous 2-fsb dir block, but after that it is fragmented into 1 block extents. At the top of the readdir path, we allocate a mapping array which (for this filesystem geometry) can hold 10 extents; see the assignment to map_info->map_size. During readdir, we are therefore able to map extents 0 through 9 above into the array for readahead purposes. If we count by 2, we see that the last mapped index (9) is the first block of a 2-fsb directory block. At the end of xfs_dir2_leaf_readbuf() we have 2 loops to fill more readahead; the outer loop assumes one full dir block is processed each loop iteration, and an inner loop that ensures that this is so by advancing to the next extent until a full directory block is mapped. The problem is that this inner loop may step past the last extent in the mapping array as it tries to reach the end of the directory block. This will read garbage for the extent length, and as a result the loop control variable 'j' may become corrupted and never fail the loop conditional. The number of valid mappings we have in our array is stored in map->map_valid, so stop this inner loop based on that limit. There is an ASSERT at the top of the outer loop for this same condition, but we never made it out of the inner loop, so the ASSERT never fired. Huge appreciation for Carlos for debugging and isolating the problem. Debugged-and-analyzed-by: Carlos Maiolino Signed-off-by: Eric Sandeen Tested-by: Carlos Maiolino Reviewed-by: Carlos Maiolino Reviewed-by: Bill O'Donnell Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_dir2_readdir.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 6645a221bf8e..ba95846f8f8a 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -394,6 +394,7 @@ xfs_dir2_leaf_readbuf( /* * Do we need more readahead? + * Each loop tries to process 1 full dir blk; last may be partial. */ blk_start_plug(&plug); for (mip->ra_index = mip->ra_offset = i = 0; @@ -425,9 +426,14 @@ xfs_dir2_leaf_readbuf( } /* - * Advance offset through the mapping table. + * Advance offset through the mapping table, processing a full + * dir block even if it is fragmented into several extents. + * But stop if we have consumed all valid mappings, even if + * it's not yet a full directory block. */ - for (j = 0; j < geo->fsbcount; j += length ) { + for (j = 0; + j < geo->fsbcount && mip->ra_index < mip->map_valid; + j += length ) { /* * The rest of this extent but not more than a dir * block. From 95487d4be1e9eb7a43120d852c86c80803cfe51d Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 20 Apr 2017 08:06:47 -0700 Subject: [PATCH 092/235] xfs: prevent multi-fsb dir readahead from reading random blocks commit cb52ee334a45ae6c78a3999e4b473c43ddc528f4 upstream. Directory block readahead uses a complex iteration mechanism to map between high-level directory blocks and underlying physical extents. This mechanism attempts to traverse the higher-level dir blocks in a manner that handles multi-fsb directory blocks and simultaneously maintains a reference to the corresponding physical blocks. This logic doesn't handle certain (discontiguous) physical extent layouts correctly with multi-fsb directory blocks. For example, consider the case of a 4k FSB filesystem with a 2 FSB (8k) directory block size and a directory with the following extent layout: EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL 0: [0..7]: 88..95 0 (88..95) 8 1: [8..15]: 80..87 0 (80..87) 8 2: [16..39]: 168..191 0 (168..191) 24 3: [40..63]: 5242952..5242975 1 (72..95) 24 Directory block 0 spans physical extents 0 and 1, dirblk 1 lies entirely within extent 2 and dirblk 2 spans extents 2 and 3. Because extent 2 is larger than the directory block size, the readahead code erroneously assumes the block is contiguous and issues a readahead based on the physical mapping of the first fsb of the dirblk. This results in read verifier failure and a spurious corruption or crc failure, depending on the filesystem format. Further, the subsequent readahead code responsible for walking through the physical table doesn't correctly advance the physical block reference for dirblk 2. Instead of advancing two physical filesystem blocks, the first iteration of the loop advances 1 block (correctly), but the subsequent iteration advances 2 more physical blocks because the next physical extent (extent 3, above) happens to cover more than dirblk 2. At this point, the higher-level directory block walking is completely off the rails of the actual physical layout of the directory for the respective mapping table. Update the contiguous dirblock logic to consider the current offset in the physical extent to avoid issuing directory readahead to unrelated blocks. Also, update the mapping table advancing code to consider the current offset within the current dirblock to avoid advancing the mapping reference too far beyond the dirblock. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_dir2_readdir.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index ba95846f8f8a..eba63160be6c 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -405,7 +405,8 @@ xfs_dir2_leaf_readbuf( * Read-ahead a contiguous directory block. */ if (i > mip->ra_current && - map[mip->ra_index].br_blockcount >= geo->fsbcount) { + (map[mip->ra_index].br_blockcount - mip->ra_offset) >= + geo->fsbcount) { xfs_dir3_data_readahead(dp, map[mip->ra_index].br_startoff + mip->ra_offset, XFS_FSB_TO_DADDR(dp->i_mount, @@ -438,7 +439,7 @@ xfs_dir2_leaf_readbuf( * The rest of this extent but not more than a dir * block. */ - length = min_t(int, geo->fsbcount, + length = min_t(int, geo->fsbcount - j, map[mip->ra_index].br_blockcount - mip->ra_offset); mip->ra_offset += length; From 10f0b2c3c225bae7eddf88a18466b9133a716c72 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 21 Apr 2017 12:40:44 -0700 Subject: [PATCH 093/235] xfs: fix up quotacheck buffer list error handling commit 20e8a063786050083fe05b4f45be338c60b49126 upstream. The quotacheck error handling of the delwri buffer list assumes the resident buffers are locked and doesn't clear the _XBF_DELWRI_Q flag on the buffers that are dequeued. This can lead to assert failures on buffer release and possibly other locking problems. Move this code to a delwri queue cancel helper function to encapsulate the logic required to properly release buffers from a delwri queue. Update the helper to clear the delwri queue flag and call it from quotacheck. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_buf.c | 24 ++++++++++++++++++++++++ fs/xfs/xfs_buf.h | 1 + fs/xfs/xfs_qm.c | 7 +------ 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 93a00c925eb9..16269271ebd6 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1066,6 +1066,8 @@ void xfs_buf_unlock( struct xfs_buf *bp) { + ASSERT(xfs_buf_islocked(bp)); + XB_CLEAR_OWNER(bp); up(&bp->b_sema); @@ -1803,6 +1805,28 @@ error: return NULL; } +/* + * Cancel a delayed write list. + * + * Remove each buffer from the list, clear the delwri queue flag and drop the + * associated buffer reference. + */ +void +xfs_buf_delwri_cancel( + struct list_head *list) +{ + struct xfs_buf *bp; + + while (!list_empty(list)) { + bp = list_first_entry(list, struct xfs_buf, b_list); + + xfs_buf_lock(bp); + bp->b_flags &= ~_XBF_DELWRI_Q; + list_del_init(&bp->b_list); + xfs_buf_relse(bp); + } +} + /* * Add a buffer to the delayed write list. * diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 2768b508280f..ad514a8025dd 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -329,6 +329,7 @@ extern void *xfs_buf_offset(struct xfs_buf *, size_t); extern void xfs_buf_stale(struct xfs_buf *bp); /* Delayed Write Buffer Routines */ +extern void xfs_buf_delwri_cancel(struct list_head *); extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); extern int xfs_buf_delwri_submit(struct list_head *); extern int xfs_buf_delwri_submit_nowait(struct list_head *); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index b669b123287b..8b9a9f15f022 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1384,12 +1384,7 @@ xfs_qm_quotacheck( mp->m_qflags |= flags; error_return: - while (!list_empty(&buffer_list)) { - struct xfs_buf *bp = - list_first_entry(&buffer_list, struct xfs_buf, b_list); - list_del_init(&bp->b_list); - xfs_buf_relse(bp); - } + xfs_buf_delwri_cancel(&buffer_list); if (error) { xfs_warn(mp, From e86b616b5b9ed5023ceef154764e327507e30007 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 26 Apr 2017 08:30:39 -0700 Subject: [PATCH 094/235] xfs: support ability to wait on new inodes commit 756baca27fff3ecaeab9dbc7a5ee35a1d7bc0c7f upstream. Inodes that are inserted into the perag tree but still under construction are flagged with the XFS_INEW bit. Most contexts either skip such inodes when they are encountered or have the ability to handle them. The runtime quotaoff sequence introduces a context that must wait for construction of such inodes to correctly ensure that all dquots in the fs are released. In anticipation of this, support the ability to wait on new inodes. Wake the appropriate bit when XFS_INEW is cleared. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_icache.c | 5 ++++- fs/xfs/xfs_inode.h | 4 +++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 3fb1f3fb8efe..027cf8ba2235 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -368,14 +368,17 @@ xfs_iget_cache_hit( error = xfs_reinit_inode(mp, inode); if (error) { + bool wake; /* * Re-initializing the inode failed, and we are in deep * trouble. Try to re-add it to the reclaim list. */ rcu_read_lock(); spin_lock(&ip->i_flags_lock); - + wake = !!__xfs_iflags_test(ip, XFS_INEW); ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); + if (wake) + wake_up_bit(&ip->i_flags, __XFS_INEW_BIT); ASSERT(ip->i_flags & XFS_IRECLAIMABLE); trace_xfs_iget_reclaim_fail(ip); goto out_error; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 71e8a81c91a3..c038f6eecc28 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -217,7 +217,8 @@ static inline bool xfs_is_reflink_inode(struct xfs_inode *ip) #define XFS_IRECLAIM (1 << 0) /* started reclaiming this inode */ #define XFS_ISTALE (1 << 1) /* inode has been staled */ #define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */ -#define XFS_INEW (1 << 3) /* inode has just been allocated */ +#define __XFS_INEW_BIT 3 /* inode has just been allocated */ +#define XFS_INEW (1 << __XFS_INEW_BIT) #define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */ #define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */ #define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */ @@ -467,6 +468,7 @@ static inline void xfs_finish_inode_setup(struct xfs_inode *ip) xfs_iflags_clear(ip, XFS_INEW); barrier(); unlock_new_inode(VFS_I(ip)); + wake_up_bit(&ip->i_flags, __XFS_INEW_BIT); } static inline void xfs_setup_existing_inode(struct xfs_inode *ip) From 2ea882d8ebc7cd6e95b09479af9f5f5692818fdb Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 26 Apr 2017 08:30:39 -0700 Subject: [PATCH 095/235] xfs: update ag iterator to support wait on new inodes commit ae2c4ac2dd39b23a87ddb14ceddc3f2872c6aef5 upstream. The AG inode iterator currently skips new inodes as such inodes are inserted into the inode radix tree before they are fully constructed. Certain contexts require the ability to wait on the construction of new inodes, however. The fs-wide dquot release from the quotaoff sequence is an example of this. Update the AG inode iterator to support the ability to wait on inodes flagged with XFS_INEW upon request. Create a new xfs_inode_ag_iterator_flags() interface and support a set of iteration flags to modify the iteration behavior. When the XFS_AGITER_INEW_WAIT flag is set, include XFS_INEW flags in the radix tree inode lookup and wait on them before the callback is executed. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_icache.c | 53 ++++++++++++++++++++++++++++++++++++++------- fs/xfs/xfs_icache.h | 8 +++++++ 2 files changed, 53 insertions(+), 8 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 027cf8ba2235..74304b6ce84b 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -264,6 +264,22 @@ xfs_inode_clear_reclaim_tag( xfs_perag_clear_reclaim_tag(pag); } +static void +xfs_inew_wait( + struct xfs_inode *ip) +{ + wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_INEW_BIT); + DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT); + + do { + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + if (!xfs_iflags_test(ip, XFS_INEW)) + break; + schedule(); + } while (true); + finish_wait(wq, &wait.wait); +} + /* * When we recycle a reclaimable inode, we need to re-initialise the VFS inode * part of the structure. This is made more complex by the fact we store @@ -628,9 +644,11 @@ out_error_or_again: STATIC int xfs_inode_ag_walk_grab( - struct xfs_inode *ip) + struct xfs_inode *ip, + int flags) { struct inode *inode = VFS_I(ip); + bool newinos = !!(flags & XFS_AGITER_INEW_WAIT); ASSERT(rcu_read_lock_held()); @@ -648,7 +666,8 @@ xfs_inode_ag_walk_grab( goto out_unlock_noent; /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ - if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) + if ((!newinos && __xfs_iflags_test(ip, XFS_INEW)) || + __xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM)) goto out_unlock_noent; spin_unlock(&ip->i_flags_lock); @@ -676,7 +695,8 @@ xfs_inode_ag_walk( void *args), int flags, void *args, - int tag) + int tag, + int iter_flags) { uint32_t first_index; int last_error = 0; @@ -718,7 +738,7 @@ restart: for (i = 0; i < nr_found; i++) { struct xfs_inode *ip = batch[i]; - if (done || xfs_inode_ag_walk_grab(ip)) + if (done || xfs_inode_ag_walk_grab(ip, iter_flags)) batch[i] = NULL; /* @@ -746,6 +766,9 @@ restart: for (i = 0; i < nr_found; i++) { if (!batch[i]) continue; + if ((iter_flags & XFS_AGITER_INEW_WAIT) && + xfs_iflags_test(batch[i], XFS_INEW)) + xfs_inew_wait(batch[i]); error = execute(batch[i], flags, args); IRELE(batch[i]); if (error == -EAGAIN) { @@ -825,12 +848,13 @@ xfs_cowblocks_worker( } int -xfs_inode_ag_iterator( +xfs_inode_ag_iterator_flags( struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, int flags, void *args), int flags, - void *args) + void *args, + int iter_flags) { struct xfs_perag *pag; int error = 0; @@ -840,7 +864,8 @@ xfs_inode_ag_iterator( ag = 0; while ((pag = xfs_perag_get(mp, ag))) { ag = pag->pag_agno + 1; - error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1); + error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1, + iter_flags); xfs_perag_put(pag); if (error) { last_error = error; @@ -851,6 +876,17 @@ xfs_inode_ag_iterator( return last_error; } +int +xfs_inode_ag_iterator( + struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, int flags, + void *args), + int flags, + void *args) +{ + return xfs_inode_ag_iterator_flags(mp, execute, flags, args, 0); +} + int xfs_inode_ag_iterator_tag( struct xfs_mount *mp, @@ -868,7 +904,8 @@ xfs_inode_ag_iterator_tag( ag = 0; while ((pag = xfs_perag_get_tag(mp, ag, tag))) { ag = pag->pag_agno + 1; - error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag); + error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag, + 0); xfs_perag_put(pag); if (error) { last_error = error; diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 8a7c849b4dea..9183f77958ef 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -48,6 +48,11 @@ struct xfs_eofblocks { #define XFS_IGET_UNTRUSTED 0x2 #define XFS_IGET_DONTCACHE 0x4 +/* + * flags for AG inode iterator + */ +#define XFS_AGITER_INEW_WAIT 0x1 /* wait on new inodes */ + int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, uint flags, uint lock_flags, xfs_inode_t **ipp); @@ -79,6 +84,9 @@ void xfs_cowblocks_worker(struct work_struct *); int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, int flags, void *args), int flags, void *args); +int xfs_inode_ag_iterator_flags(struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, int flags, void *args), + int flags, void *args, int iter_flags); int xfs_inode_ag_iterator_tag(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, int flags, void *args), int flags, void *args, int tag); From 0ba833fe73d246b0bfb76403b7b67a890cf87b5e Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 26 Apr 2017 08:30:40 -0700 Subject: [PATCH 096/235] xfs: wait on new inodes during quotaoff dquot release commit e20c8a517f259cb4d258e10b0cd5d4b30d4167a0 upstream. The quotaoff operation has a race with inode allocation that results in a livelock. An inode allocation that occurs before the quota status flags are updated acquires the appropriate dquots for the inode via xfs_qm_vop_dqalloc(). It then inserts the XFS_INEW inode into the perag radix tree, sometime later attaches the dquots to the inode and finally clears the XFS_INEW flag. Quotaoff expects to release the dquots from all inodes in the filesystem via xfs_qm_dqrele_all_inodes(). This invokes the AG inode iterator, which skips inodes in the XFS_INEW state because they are not fully constructed. If the scan occurs after dquots have been attached to an inode, but before XFS_INEW is cleared, the newly allocated inode will continue to hold a reference to the applicable dquots. When quotaoff invokes xfs_qm_dqpurge_all(), the reference count of those dquot(s) remain elevated and the dqpurge scan spins indefinitely. To address this problem, update the xfs_qm_dqrele_all_inodes() scan to wait on inodes marked on the XFS_INEW state. We wait on the inodes explicitly rather than skip and retry to avoid continuous retry loops due to a parallel inode allocation workload. Since quotaoff updates the quota state flags and uses a synchronous transaction before the dqrele scan, and dquots are attached to inodes after radix tree insertion iff quota is enabled, one INEW waiting pass through the AG guarantees that the scan has processed all inodes that could possibly hold dquot references. Reported-by: Eryu Guan Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_qm_syscalls.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 475a3882a81f..9cb5c381b01c 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -759,5 +759,6 @@ xfs_qm_dqrele_all_inodes( uint flags) { ASSERT(mp->m_quotainfo); - xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, NULL); + xfs_inode_ag_iterator_flags(mp, xfs_dqrele_inode, flags, NULL, + XFS_AGITER_INEW_WAIT); } From d457f822817f9ab3a28b89b090dee5e4c5fb7ddf Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 12 Apr 2017 12:26:07 -0700 Subject: [PATCH 097/235] xfs: reserve enough blocks to handle btree splits when remapping commit fe0be23e68200573de027de9b8cc2b27e7fce35e upstream. In xfs_reflink_end_cow, we erroneously reserve only enough blocks to handle adding 1 extent. This is problematic if we fragment free space, have to do CoW, and then have to perform multiple bmap btree expansions. Furthermore, the BUI recovery routine doesn't reserve /any/ blocks to handle btree splits, so log recovery fails after our first error causes the filesystem to go down. Therefore, refactor the transaction block reservation macros until we have a macro that works for our deferred (re)mapping activities, and fix both problems by using that macro. With 1k blocks we can hit this fairly often in g/187 if the scratch fs is big enough. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Greg Kroah-Hartman --- fs/xfs/libxfs/xfs_trans_space.h | 23 +++++++++++++++++------ fs/xfs/xfs_bmap_item.c | 5 ++++- fs/xfs/xfs_reflink.c | 18 ++++++++++++++++-- 3 files changed, 37 insertions(+), 9 deletions(-) diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h index 7917f6e44286..d787c677d2a3 100644 --- a/fs/xfs/libxfs/xfs_trans_space.h +++ b/fs/xfs/libxfs/xfs_trans_space.h @@ -21,8 +21,20 @@ /* * Components of space reservations. */ + +/* Worst case number of rmaps that can be held in a block. */ #define XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) \ (((mp)->m_rmap_mxr[0]) - ((mp)->m_rmap_mnr[0])) + +/* Adding one rmap could split every level up to the top of the tree. */ +#define XFS_RMAPADD_SPACE_RES(mp) ((mp)->m_rmap_maxlevels) + +/* Blocks we might need to add "b" rmaps to a tree. */ +#define XFS_NRMAPADD_SPACE_RES(mp, b)\ + (((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \ + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * \ + XFS_RMAPADD_SPACE_RES(mp)) + #define XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) \ (((mp)->m_alloc_mxr[0]) - ((mp)->m_alloc_mnr[0])) #define XFS_EXTENTADD_SPACE_RES(mp,w) (XFS_BM_MAXLEVELS(mp,w) - 1) @@ -30,13 +42,12 @@ (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \ XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \ XFS_EXTENTADD_SPACE_RES(mp,w)) + +/* Blocks we might need to add "b" mappings & rmappings to a file. */ #define XFS_SWAP_RMAP_SPACE_RES(mp,b,w)\ - (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \ - XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \ - XFS_EXTENTADD_SPACE_RES(mp,w) + \ - ((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \ - XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * \ - (mp)->m_rmap_maxlevels) + (XFS_NEXTENTADD_SPACE_RES((mp), (b), (w)) + \ + XFS_NRMAPADD_SPACE_RES((mp), (b))) + #define XFS_DAENTER_1B(mp,w) \ ((w) == XFS_DATA_FORK ? (mp)->m_dir_geo->fsbcount : 1) #define XFS_DAENTER_DBS(mp,w) \ diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 9bf57c76623b..c4b90e794e41 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -34,6 +34,8 @@ #include "xfs_bmap.h" #include "xfs_icache.h" #include "xfs_trace.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" kmem_zone_t *xfs_bui_zone; @@ -446,7 +448,8 @@ xfs_bui_recover( return -EIO; } - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp); if (error) return error; budp = xfs_trans_get_bud(tp, buip); diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 2252f163c38f..29a75ecb2425 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -736,8 +736,22 @@ xfs_reflink_end_cow( offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); - /* Start a rolling transaction to switch the mappings */ - resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); + /* + * Start a rolling transaction to switch the mappings. We're + * unlikely ever to have to remap 16T worth of single-block + * extents, so just cap the worst case extent count to 2^32-1. + * Stick a warning in just in case, and avoid 64-bit division. + */ + BUILD_BUG_ON(MAX_RW_COUNT > UINT_MAX); + if (end_fsb - offset_fsb > UINT_MAX) { + error = -EFSCORRUPTED; + xfs_force_shutdown(ip->i_mount, SHUTDOWN_CORRUPT_INCORE); + ASSERT(0); + goto out; + } + resblks = XFS_NEXTENTADD_SPACE_RES(ip->i_mount, + (unsigned int)(end_fsb - offset_fsb), + XFS_DATA_FORK); error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write, resblks, 0, 0, &tp); if (error) From 54894ea3c5420170597d1f6a4343637e68b7a3e2 Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Tue, 2 May 2017 13:54:47 -0700 Subject: [PATCH 098/235] xfs: fix use-after-free in xfs_finish_page_writeback commit 161f55efba5ddccc690139fae9373cafc3447a97 upstream. Commit 28b783e47ad7 ("xfs: bufferhead chains are invalid after end_page_writeback") fixed one use-after-free issue by pre-calculating the loop conditionals before calling bh->b_end_io() in the end_io processing loop, but it assigned 'next' pointer before checking end offset boundary & breaking the loop, at which point the bh might be freed already, and caused use-after-free. This is caught by KASAN when running fstests generic/127 on sub-page block size XFS. [ 2517.244502] run fstests generic/127 at 2017-04-27 07:30:50 [ 2747.868840] ================================================================== [ 2747.876949] BUG: KASAN: use-after-free in xfs_destroy_ioend+0x3d3/0x4e0 [xfs] at addr ffff8801395ae698 ... [ 2747.918245] Call Trace: [ 2747.920975] dump_stack+0x63/0x84 [ 2747.924673] kasan_object_err+0x21/0x70 [ 2747.928950] kasan_report+0x271/0x530 [ 2747.933064] ? xfs_destroy_ioend+0x3d3/0x4e0 [xfs] [ 2747.938409] ? end_page_writeback+0xce/0x110 [ 2747.943171] __asan_report_load8_noabort+0x19/0x20 [ 2747.948545] xfs_destroy_ioend+0x3d3/0x4e0 [xfs] [ 2747.953724] xfs_end_io+0x1af/0x2b0 [xfs] [ 2747.958197] process_one_work+0x5ff/0x1000 [ 2747.962766] worker_thread+0xe4/0x10e0 [ 2747.966946] kthread+0x2d3/0x3d0 [ 2747.970546] ? process_one_work+0x1000/0x1000 [ 2747.975405] ? kthread_create_on_node+0xc0/0xc0 [ 2747.980457] ? syscall_return_slowpath+0xe6/0x140 [ 2747.985706] ? do_page_fault+0x30/0x80 [ 2747.989887] ret_from_fork+0x2c/0x40 [ 2747.993874] Object at ffff8801395ae690, in cache buffer_head size: 104 [ 2748.001155] Allocated: [ 2748.003782] PID = 8327 [ 2748.006411] save_stack_trace+0x1b/0x20 [ 2748.010688] save_stack+0x46/0xd0 [ 2748.014383] kasan_kmalloc+0xad/0xe0 [ 2748.018370] kasan_slab_alloc+0x12/0x20 [ 2748.022648] kmem_cache_alloc+0xb8/0x1b0 [ 2748.027024] alloc_buffer_head+0x22/0xc0 [ 2748.031399] alloc_page_buffers+0xd1/0x250 [ 2748.035968] create_empty_buffers+0x30/0x410 [ 2748.040730] create_page_buffers+0x120/0x1b0 [ 2748.045493] __block_write_begin_int+0x17a/0x1800 [ 2748.050740] iomap_write_begin+0x100/0x2f0 [ 2748.055308] iomap_zero_range_actor+0x253/0x5c0 [ 2748.060362] iomap_apply+0x157/0x270 [ 2748.064347] iomap_zero_range+0x5a/0x80 [ 2748.068624] iomap_truncate_page+0x6b/0xa0 [ 2748.073227] xfs_setattr_size+0x1f7/0xa10 [xfs] [ 2748.078312] xfs_vn_setattr_size+0x68/0x140 [xfs] [ 2748.083589] xfs_file_fallocate+0x4ac/0x820 [xfs] [ 2748.088838] vfs_fallocate+0x2cf/0x780 [ 2748.093021] SyS_fallocate+0x48/0x80 [ 2748.097006] do_syscall_64+0x18a/0x430 [ 2748.101186] return_from_SYSCALL_64+0x0/0x6a [ 2748.105948] Freed: [ 2748.108189] PID = 8327 [ 2748.110816] save_stack_trace+0x1b/0x20 [ 2748.115093] save_stack+0x46/0xd0 [ 2748.118788] kasan_slab_free+0x73/0xc0 [ 2748.122969] kmem_cache_free+0x7a/0x200 [ 2748.127247] free_buffer_head+0x41/0x80 [ 2748.131524] try_to_free_buffers+0x178/0x250 [ 2748.136316] xfs_vm_releasepage+0x2e9/0x3d0 [xfs] [ 2748.141563] try_to_release_page+0x100/0x180 [ 2748.146325] invalidate_inode_pages2_range+0x7da/0xcf0 [ 2748.152087] xfs_shift_file_space+0x37d/0x6e0 [xfs] [ 2748.157557] xfs_collapse_file_space+0x49/0x120 [xfs] [ 2748.163223] xfs_file_fallocate+0x2a7/0x820 [xfs] [ 2748.168462] vfs_fallocate+0x2cf/0x780 [ 2748.172642] SyS_fallocate+0x48/0x80 [ 2748.176629] do_syscall_64+0x18a/0x430 [ 2748.180810] return_from_SYSCALL_64+0x0/0x6a Fixed it by checking on offset against end & breaking out first, dereference bh only if there're still bufferheads to process. Signed-off-by: Eryu Guan Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_aops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 0457abe4118a..6df0a7ce3e8a 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -116,11 +116,11 @@ xfs_finish_page_writeback( bsize = bh->b_size; do { + if (off > end) + break; next = bh->b_this_page; if (off < bvec->bv_offset) goto next_bh; - if (off > end) - break; bh->b_end_io(bh, !error); next_bh: off += bsize; From 53c44c236f218fbb813ce45b79da7b7d9938bf68 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 12 May 2017 10:44:08 -0700 Subject: [PATCH 099/235] xfs: fix indlen accounting error on partial delalloc conversion commit 0daaecacb83bc6b656a56393ab77a31c28139bc7 upstream. The delalloc -> real block conversion path uses an incorrect calculation in the case where the middle part of a delalloc extent is being converted. This is documented as a rare situation because XFS generally attempts to maximize contiguity by converting as much of a delalloc extent as possible. If this situation does occur, the indlen reservation for the two new delalloc extents left behind by the conversion of the middle range is calculated and compared with the original reservation. If more blocks are required, the delta is allocated from the global block pool. This delta value can be characterized as the difference between the new total requirement (temp + temp2) and the currently available reservation minus those blocks that have already been allocated (startblockval(PREV.br_startblock) - allocated). The problem is that the current code does not account for previously allocated blocks correctly. It subtracts the current allocation count from the (new - old) delta rather than the old indlen reservation. This means that more indlen blocks than have been allocated end up stashed in the remaining extents and free space accounting is broken as a result. Fix up the calculation to subtract the allocated block count from the original extent indlen and thus correctly allocate the reservation delta based on the difference between the new total requirement and the unused blocks from the original reservation. Also remove a bogus assert that contradicts the fact that the new indlen reservation can be larger than the original indlen reservation. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/libxfs/xfs_bmap.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 23f2b4b36f38..2a8cbd15d5d1 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -2208,8 +2208,10 @@ xfs_bmap_add_extent_delay_real( } temp = xfs_bmap_worst_indlen(bma->ip, temp); temp2 = xfs_bmap_worst_indlen(bma->ip, temp2); - diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - - (bma->cur ? bma->cur->bc_private.b.allocated : 0)); + diff = (int)(temp + temp2 - + (startblockval(PREV.br_startblock) - + (bma->cur ? + bma->cur->bc_private.b.allocated : 0))); if (diff > 0) { error = xfs_mod_fdblocks(bma->ip->i_mount, -((int64_t)diff), false); @@ -2266,7 +2268,6 @@ xfs_bmap_add_extent_delay_real( temp = da_new; if (bma->cur) temp += bma->cur->bc_private.b.allocated; - ASSERT(temp <= da_old); if (temp < da_old) xfs_mod_fdblocks(bma->ip->i_mount, (int64_t)(da_old - temp), false); From f60d76efa91a1450c73589d9c61b3af74bf2cf73 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 12 May 2017 10:44:08 -0700 Subject: [PATCH 100/235] xfs: BMAPX shouldn't barf on inline-format directories commit 6eadbf4c8ba816c10d1c97bed9aa861d9fd17809 upstream. When we're fulfilling a BMAPX request, jump out early if the data fork is in local format. This prevents us from hitting a debugging check in bmapi_read and barfing errors back to userspace. The on-disk extent count check later isn't sufficient for IF_DELALLOC mode because da extents are in memory and not on disk. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_bmap_util.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 8a1a62ee84ad..15be2c973c40 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -588,9 +588,13 @@ xfs_getbmap( } break; default: + /* Local format data forks report no extents. */ + if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + bmv->bmv_entries = 0; + return 0; + } if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_format != XFS_DINODE_FMT_BTREE && - ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) + ip->i_d.di_format != XFS_DINODE_FMT_BTREE) return -EINVAL; if (xfs_get_extsz_hint(ip) || From 0e542792a046c788f39ab13fcd68c7ba65930407 Mon Sep 17 00:00:00 2001 From: Zorro Lang Date: Mon, 15 May 2017 08:40:02 -0700 Subject: [PATCH 101/235] xfs: bad assertion for delalloc an extent that start at i_size commit 892d2a5f705723b2cb488bfb38bcbdcf83273184 upstream. By run fsstress long enough time enough in RHEL-7, I find an assertion failure (harder to reproduce on linux-4.11, but problem is still there): XFS: Assertion failed: (iflags & BMV_IF_DELALLOC) != 0, file: fs/xfs/xfs_bmap_util.c The assertion is in xfs_getbmap() funciton: if (map[i].br_startblock == DELAYSTARTBLOCK && --> map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip))) ASSERT((iflags & BMV_IF_DELALLOC) != 0); When map[i].br_startoff == XFS_B_TO_FSB(mp, XFS_ISIZE(ip)), the startoff is just at EOF. But we only need to make sure delalloc extents that are within EOF, not include EOF. Signed-off-by: Zorro Lang Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_bmap_util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 15be2c973c40..87b495e2f15a 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -722,7 +722,7 @@ xfs_getbmap( * extents. */ if (map[i].br_startblock == DELAYSTARTBLOCK && - map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip))) + map[i].br_startoff < XFS_B_TO_FSB(mp, XFS_ISIZE(ip))) ASSERT((iflags & BMV_IF_DELALLOC) != 0); if (map[i].br_startblock == HOLESTARTBLOCK && From e40c145c023db15a0504e95ac442f0e47c0a816d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 3 Jun 2017 15:18:31 +0200 Subject: [PATCH 102/235] xfs: xfs_trans_alloc_empty This is a partial cherry-pick of commit e89c041338 ("xfs: implement the GETFSMAP ioctl"), which also adds this helper, and a great example of why feature patches should be properly split into their parts. Signed-off-by: Darrick J. Wong [hch: split from the larger patch for -stable] Signed-off-by: Christoph Hellwig --- fs/xfs/xfs_trans.c | 22 ++++++++++++++++++++++ fs/xfs/xfs_trans.h | 2 ++ 2 files changed, 24 insertions(+) diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 70f42ea86dfb..a280e126491f 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -262,6 +262,28 @@ xfs_trans_alloc( return 0; } +/* + * Create an empty transaction with no reservation. This is a defensive + * mechanism for routines that query metadata without actually modifying + * them -- if the metadata being queried is somehow cross-linked (think a + * btree block pointer that points higher in the tree), we risk deadlock. + * However, blocks grabbed as part of a transaction can be re-grabbed. + * The verifiers will notice the corrupt block and the operation will fail + * back to userspace without deadlocking. + * + * Note the zero-length reservation; this transaction MUST be cancelled + * without any dirty data. + */ +int +xfs_trans_alloc_empty( + struct xfs_mount *mp, + struct xfs_trans **tpp) +{ + struct xfs_trans_res resv = {0}; + + return xfs_trans_alloc(mp, &resv, 0, 0, XFS_TRANS_NO_WRITECOUNT, tpp); +} + /* * Record the indicated change to the given field for application * to the file system's superblock when the transaction commits. diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 61b7fbdd3ebd..98024cb933ef 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -159,6 +159,8 @@ typedef struct xfs_trans { int xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp, uint blocks, uint rtextents, uint flags, struct xfs_trans **tpp); +int xfs_trans_alloc_empty(struct xfs_mount *mp, + struct xfs_trans **tpp); void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); struct xfs_buf *xfs_trans_get_buf_map(struct xfs_trans *tp, From 7fb8ab8f0a38adb97cf2cf0738628959db5d840d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 15 May 2017 19:16:15 -0700 Subject: [PATCH 103/235] xfs: avoid mount-time deadlock in CoW extent recovery commit 3ecb3ac7b950ff8f6c6a61e8b7b0d6e3546429a0 upstream. If a malicious user corrupts the refcount btree to cause a cycle between different levels of the tree, the next mount attempt will deadlock in the CoW recovery routine while grabbing buffer locks. We can use the ability to re-grab a buffer that was previous locked to a transaction to avoid deadlocks, so do that here. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Greg Kroah-Hartman --- fs/xfs/libxfs/xfs_refcount.c | 43 ++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index b177ef33cd4c..82a38d86ebad 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1629,13 +1629,28 @@ xfs_refcount_recover_cow_leftovers( if (mp->m_sb.sb_agblocks >= XFS_REFC_COW_START) return -EOPNOTSUPP; - error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + INIT_LIST_HEAD(&debris); + + /* + * In this first part, we use an empty transaction to gather up + * all the leftover CoW extents so that we can subsequently + * delete them. The empty transaction is used to avoid + * a buffer lock deadlock if there happens to be a loop in the + * refcountbt because we're allowed to re-grab a buffer that is + * already attached to our transaction. When we're done + * recording the CoW debris we cancel the (empty) transaction + * and everything goes away cleanly. + */ + error = xfs_trans_alloc_empty(mp, &tp); if (error) return error; - cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL); + + error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); + if (error) + goto out_trans; + cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, NULL); /* Find all the leftover CoW staging extents. */ - INIT_LIST_HEAD(&debris); memset(&low, 0, sizeof(low)); memset(&high, 0, sizeof(high)); low.rc.rc_startblock = XFS_REFC_COW_START; @@ -1645,10 +1660,11 @@ xfs_refcount_recover_cow_leftovers( if (error) goto out_cursor; xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); - xfs_buf_relse(agbp); + xfs_trans_brelse(tp, agbp); + xfs_trans_cancel(tp); /* Now iterate the list to free the leftovers */ - list_for_each_entry(rr, &debris, rr_list) { + list_for_each_entry_safe(rr, n, &debris, rr_list) { /* Set up transaction. */ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp); if (error) @@ -1676,8 +1692,16 @@ xfs_refcount_recover_cow_leftovers( error = xfs_trans_commit(tp); if (error) goto out_free; + + list_del(&rr->rr_list); + kmem_free(rr); } + return error; +out_defer: + xfs_defer_cancel(&dfops); +out_trans: + xfs_trans_cancel(tp); out_free: /* Free the leftover list */ list_for_each_entry_safe(rr, n, &debris, rr_list) { @@ -1688,11 +1712,6 @@ out_free: out_cursor: xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); - xfs_buf_relse(agbp); - goto out_free; - -out_defer: - xfs_defer_cancel(&dfops); - xfs_trans_cancel(tp); - goto out_free; + xfs_trans_brelse(tp, agbp); + goto out_trans; } From 75c5afd58d4696b4dce6ed931c49da9975962f99 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 22 May 2017 19:54:10 -0700 Subject: [PATCH 104/235] xfs: fix unaligned access in xfs_btree_visit_blocks commit a4d768e702de224cc85e0c8eac9311763403b368 upstream. This structure copy was throwing unaligned access warnings on sparc64: Kernel unaligned access at TPC[1043c088] xfs_btree_visit_blocks+0x88/0xe0 [xfs] xfs_btree_copy_ptrs does a memcpy, which avoids it. Signed-off-by: Eric Sandeen Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/libxfs/xfs_btree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 2849d3fa3d0b..91c68913d495 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -4376,7 +4376,7 @@ xfs_btree_visit_blocks( xfs_btree_readahead_ptr(cur, ptr, 1); /* save for the next iteration of the loop */ - lptr = *ptr; + xfs_btree_copy_ptrs(cur, &lptr, ptr, 1); } /* for each buffer in the level */ From 11214bd292ec95c52f636c187b15c9160215f988 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 18 May 2017 16:36:23 -0700 Subject: [PATCH 105/235] xfs: Fix off-by-in in loop termination in xfs_find_get_desired_pgoff() commit d7fd24257aa60316bf81093f7f909dc9475ae974 upstream. There is an off-by-one error in loop termination conditions in xfs_find_get_desired_pgoff() since 'end' may index a page beyond end of desired range if 'endoff' is page aligned. It doesn't have any visible effects but still it is good to fix it. Signed-off-by: Jan Kara Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 9292a59efbfa..a90ec3fad69f 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1130,7 +1130,7 @@ xfs_find_get_desired_pgoff( index = startoff >> PAGE_SHIFT; endoff = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount); - end = endoff >> PAGE_SHIFT; + end = (endoff - 1) >> PAGE_SHIFT; do { int want; unsigned nr_pages; From f1aa865ae5d4608cbfbb02f42baa1ef5ed95fce2 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 7 Jun 2017 12:08:04 +0200 Subject: [PATCH 106/235] Linux 4.9.31 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b78a45bcf9b1..3601995f63f9 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 9 -SUBLEVEL = 30 +SUBLEVEL = 31 EXTRAVERSION = NAME = Roaring Lionus From 3d7ac4ff57119a86e98af3fded4776bc36bbb1d6 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 7 Jun 2017 12:44:50 -0700 Subject: [PATCH 107/235] ANDROID: sdcardfs: d_splice_alias can return error values We must check that d_splice_alias was successful before using its output. Signed-off-by: Daniel Rosenberg Bug: 62390017 Change-Id: Ifda0a052fb3f67e35c635a4e5e907876c5400978 --- fs/sdcardfs/lookup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index 17761c546617..843fcd216116 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -199,7 +199,8 @@ static struct dentry *__sdcardfs_interpose(struct dentry *dentry, ret_dentry = d_splice_alias(inode, dentry); dentry = ret_dentry ?: dentry; - update_derived_permission_lock(dentry); + if (!IS_ERR(dentry)) + update_derived_permission_lock(dentry); out: return ret_dentry; } From 7eed46b1d5bc6b45ad829f90bfde96616e9fa9f0 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Wed, 3 May 2017 10:04:48 -0400 Subject: [PATCH 108/235] UPSTREAM: drm/msm/mdp5: use __drm_atomic_helper_plane_duplicate_state() Somehow the helper was never retrofitted for mdp5. Which meant when plane_state->fence was added, it could get copied into new state in mdp5_plane_duplicate_state(). If an update to disable the plane (for example on rmfb) managed to sneak in after an nonblock update had swapped state, but before it was committed, we'd get a splat: WARNING: CPU: 1 PID: 69 at ../drivers/gpu/drm/drm_atomic_helper.c:1061 drm_atomic_helper_wait_for_fences+0xe0/0xf8 Modules linked in: CPU: 1 PID: 69 Comm: kworker/1:1 Tainted: G W 4.11.0-rc8+ #1187 Hardware name: Qualcomm Technologies, Inc. APQ 8016 SBC (DT) Workqueue: events drm_mode_rmfb_work_fn task: ffffffc036560d00 task.stack: ffffffc036550000 PC is at drm_atomic_helper_wait_for_fences+0xe0/0xf8 LR is at complete_commit.isra.1+0x44/0x1c0 pc : [] lr : [] pstate: 20000145 sp : ffffffc036553b60 x29: ffffffc036553b60 x28: ffffffc0264e6a00 x27: ffffffc035659000 x26: 0000000000000000 x25: ffffffc0240e8000 x24: 0000000000000038 x23: 0000000000000000 x22: ffffff800858f200 x21: ffffffc0240e8000 x20: ffffffc02f56a800 x19: 0000000000000000 x18: 0000000000000000 x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000 x14: ffffffc00a192700 x13: 0000000000000004 x12: 0000000000000000 x11: ffffff80089a1690 x10: 00000000000008f0 x9 : ffffffc036553b20 x8 : ffffffc036561650 x7 : ffffffc03fe6cb40 x6 : 0000000000000000 x5 : 0000000000000001 x4 : 0000000000000002 x3 : ffffffc035659000 x2 : ffffffc0240e8c80 x1 : 0000000000000000 x0 : ffffffc02adbe588 ---[ end trace 13aeec77c3fb55e2 ]--- Call trace: Exception stack(0xffffffc036553990 to 0xffffffc036553ac0) 3980: 0000000000000000 0000008000000000 39a0: ffffffc036553b60 ffffff80084f6040 0000000000004ff0 0000000000000038 39c0: ffffffc0365539d0 ffffff800857e098 ffffffc036553a00 ffffff800857e1b0 39e0: ffffffc036553a10 ffffff800857c554 ffffffc0365e8400 ffffffc0365e8400 3a00: ffffffc036553a20 ffffff8008103358 000000000001aad7 ffffff800851b72c 3a20: ffffffc036553a50 ffffff80080e9228 ffffffc02adbe588 0000000000000000 3a40: ffffffc0240e8c80 ffffffc035659000 0000000000000002 0000000000000001 3a60: 0000000000000000 ffffffc03fe6cb40 ffffffc036561650 ffffffc036553b20 3a80: 00000000000008f0 ffffff80089a1690 0000000000000000 0000000000000004 3aa0: ffffffc00a192700 0000000000000000 0000000000000000 0000000000000000 [] drm_atomic_helper_wait_for_fences+0xe0/0xf8 [] complete_commit.isra.1+0x44/0x1c0 [] msm_atomic_commit+0x32c/0x350 [] drm_atomic_commit+0x50/0x60 [] drm_atomic_remove_fb+0x158/0x250 [] drm_framebuffer_remove+0x50/0x158 [] drm_mode_rmfb_work_fn+0x40/0x58 [] process_one_work+0x1d0/0x378 [] worker_thread+0x244/0x488 [] kthread+0xfc/0x128 [] ret_from_fork+0x10/0x50 Fixes: 9626014 ("drm/fence: add in-fences support") Cc: stable@vger.kernel.org Reviewed-by: Daniel Vetter Reported-by: Stanimir Varbanov Signed-off-by: Rob Clark Fixes: Change-Id: I1a4e7395bc7709b457544c5086611122d4602a61 ("BACKPORT: drm/fence: add in-fences support") (cherry picked from commit 786813c343cb619d23cb0990e152e350b826d810) Signed-off-by: Amit Pundir --- drivers/gpu/drm/msm/mdp/mdp5/mdp5_plane.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_plane.c b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_plane.c index 83bf997dda03..5e67e8b2b685 100644 --- a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_plane.c +++ b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_plane.c @@ -218,9 +218,10 @@ mdp5_plane_duplicate_state(struct drm_plane *plane) mdp5_state = kmemdup(to_mdp5_plane_state(plane->state), sizeof(*mdp5_state), GFP_KERNEL); + if (!mdp5_state) + return NULL; - if (mdp5_state && mdp5_state->base.fb) - drm_framebuffer_reference(mdp5_state->base.fb); + __drm_atomic_helper_plane_duplicate_state(plane, &mdp5_state->base); mdp5_state->mode_changed = false; mdp5_state->pending = false; From 96d145216b5884c3ba3003b2ccd6eb2da625a77d Mon Sep 17 00:00:00 2001 From: "Mintz, Yuval" Date: Thu, 1 Jun 2017 15:57:56 +0300 Subject: [PATCH 109/235] bnx2x: Fix Multi-Cos [ Upstream commit 3968d38917eb9bd0cd391265f6c9c538d9b33ffa ] Apparently multi-cos isn't working for bnx2x quite some time - driver implements ndo_select_queue() to allow queue-selection for FCoE, but the regular L2 flow would cause it to modulo the fallback's result by the number of queues. The fallback would return a queue matching the needed tc [via __skb_tx_hash()], but since the modulo is by the number of TSS queues where number of TCs is not accounted, transmission would always be done by a queue configured into using TC0. Fixes: ada7c19e6d27 ("bnx2x: use XPS if possible for bnx2x_select_queue instead of pure hash") Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index 0a9108cd4c45..0a5ee1d973ac 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -1931,7 +1931,7 @@ u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb, } /* select a non-FCoE queue */ - return fallback(dev, skb) % BNX2X_NUM_ETH_QUEUES(bp); + return fallback(dev, skb) % (BNX2X_NUM_ETH_QUEUES(bp) * bp->max_cos); } void bnx2x_set_num_queues(struct bnx2x *bp) From b5e9b7ad0dd449b9296e16f4c0da9d4bb3d36d6d Mon Sep 17 00:00:00 2001 From: Lance Richardson Date: Mon, 29 May 2017 13:25:57 -0400 Subject: [PATCH 110/235] vxlan: eliminate cached dst leak [ Upstream commit 35cf2845563c1aaa01d27bd34d64795c4ae72700 ] After commit 0c1d70af924b ("net: use dst_cache for vxlan device"), cached dst entries could be leaked when more than one remote was present for a given vxlan_fdb entry, causing subsequent netns operations to block indefinitely and "unregister_netdevice: waiting for lo to become free." messages to appear in the kernel log. Fix by properly releasing cached dst and freeing resources in this case. Fixes: 0c1d70af924b ("net: use dst_cache for vxlan device") Signed-off-by: Lance Richardson Acked-by: Paolo Abeni Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/vxlan.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 3c4c2cf6d444..3331b5e5b4ce 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -717,6 +717,22 @@ static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f) call_rcu(&f->rcu, vxlan_fdb_free); } +static void vxlan_dst_free(struct rcu_head *head) +{ + struct vxlan_rdst *rd = container_of(head, struct vxlan_rdst, rcu); + + dst_cache_destroy(&rd->dst_cache); + kfree(rd); +} + +static void vxlan_fdb_dst_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f, + struct vxlan_rdst *rd) +{ + list_del_rcu(&rd->list); + vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH); + call_rcu(&rd->rcu, vxlan_dst_free); +} + static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan, union vxlan_addr *ip, __be16 *port, __be32 *vni, u32 *ifindex) @@ -847,9 +863,7 @@ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], * otherwise destroy the fdb entry */ if (rd && !list_is_singular(&f->remotes)) { - list_del_rcu(&rd->list); - vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH); - kfree_rcu(rd, rcu); + vxlan_fdb_dst_destroy(vxlan, f, rd); goto out; } From abbcb731d69cfa7fbcd7b77514ae04764827fdbf Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 31 May 2017 13:15:41 +0100 Subject: [PATCH 111/235] ipv6: xfrm: Handle errors reported by xfrm6_find_1stfragopt() [ Upstream commit 6e80ac5cc992ab6256c3dae87f7e57db15e1a58c ] xfrm6_find_1stfragopt() may now return an error code and we must not treat it as a length. Fixes: 2423496af35d ("ipv6: Prevent overrun when parsing v6 header options") Signed-off-by: Ben Hutchings Acked-by: Craig Gallek Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/xfrm6_mode_ro.c | 2 ++ net/ipv6/xfrm6_mode_transport.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/net/ipv6/xfrm6_mode_ro.c b/net/ipv6/xfrm6_mode_ro.c index 0e015906f9ca..07d36573f50b 100644 --- a/net/ipv6/xfrm6_mode_ro.c +++ b/net/ipv6/xfrm6_mode_ro.c @@ -47,6 +47,8 @@ static int xfrm6_ro_output(struct xfrm_state *x, struct sk_buff *skb) iph = ipv6_hdr(skb); hdr_len = x->type->hdr_offset(x, skb, &prevhdr); + if (hdr_len < 0) + return hdr_len; skb_set_mac_header(skb, (prevhdr - x->props.header_len) - skb->data); skb_set_network_header(skb, -x->props.header_len); skb->transport_header = skb->network_header + hdr_len; diff --git a/net/ipv6/xfrm6_mode_transport.c b/net/ipv6/xfrm6_mode_transport.c index 4e344105b3fd..1d3bbe6e1183 100644 --- a/net/ipv6/xfrm6_mode_transport.c +++ b/net/ipv6/xfrm6_mode_transport.c @@ -28,6 +28,8 @@ static int xfrm6_transport_output(struct xfrm_state *x, struct sk_buff *skb) iph = ipv6_hdr(skb); hdr_len = x->type->hdr_offset(x, skb, &prevhdr); + if (hdr_len < 0) + return hdr_len; skb_set_mac_header(skb, (prevhdr - x->props.header_len) - skb->data); skb_set_network_header(skb, -x->props.header_len); skb->transport_header = skb->network_header + hdr_len; From 61c92d5a533c884d93b129cd4ad412a9128db9b7 Mon Sep 17 00:00:00 2001 From: Ganesh Goudar Date: Wed, 31 May 2017 18:26:28 +0530 Subject: [PATCH 112/235] cxgb4: avoid enabling napi twice to the same queue [ Upstream commit e7519f9926f1d0d11c776eb0475eb098c7760f68 ] Take uld mutex to avoid race between cxgb_up() and cxgb4_register_uld() to enable napi for the same uld queue. Signed-off-by: Ganesh Goudar Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 19dc9e25aa72..f9c2feb4a4e7 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -2226,10 +2226,14 @@ static int cxgb_up(struct adapter *adap) if (err) goto irq_err; } + + mutex_lock(&uld_mutex); enable_rx(adap); t4_sge_start(adap); t4_intr_enable(adap); adap->flags |= FULL_INIT_DONE; + mutex_unlock(&uld_mutex); + notify_ulds(adap, CXGB4_STATE_UP); #if IS_ENABLED(CONFIG_IPV6) update_clip(adap); From 3ee35b96825e063defe6eb333ffd3124aab5061b Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 31 May 2017 11:21:27 -0700 Subject: [PATCH 113/235] tcp: disallow cwnd undo when switching congestion control [ Upstream commit 44abafc4cc094214a99f860f778c48ecb23422fc ] When the sender switches its congestion control during loss recovery, if the recovery is spurious then it may incorrectly revert cwnd and ssthresh to the older values set by a previous congestion control. Consider a congestion control (like BBR) that does not use ssthresh and keeps it infinite: the connection may incorrectly revert cwnd to an infinite value when switching from BBR to another congestion control. This patch fixes it by disallowing such cwnd undo operation upon switching congestion control. Note that undo_marker is not reset s.t. the packets that were incorrectly marked lost would be corrected. We only avoid undoing the cwnd in tcp_undo_cwnd_reduction(). Signed-off-by: Yuchung Cheng Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_cong.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index baea5df43598..0cdbea9b9288 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -179,6 +179,7 @@ void tcp_init_congestion_control(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); + tcp_sk(sk)->prior_ssthresh = 0; if (icsk->icsk_ca_ops->init) icsk->icsk_ca_ops->init(sk); if (tcp_ca_needs_ecn(sk)) From c242e1a8140e56ff0f673ff82c4c5db3df7d4d4a Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Fri, 2 Jun 2017 03:24:08 +0300 Subject: [PATCH 114/235] vxlan: fix use-after-free on deletion [ Upstream commit a53cb29b0af346af44e4abf13d7e59f807fba690 ] Adding a vxlan interface to a socket isn't symmetrical, while adding is done in vxlan_open() the deletion is done in vxlan_dellink(). This can cause a use-after-free error when we close the vxlan interface before deleting it. We add vxlan_vs_del_dev() to match vxlan_vs_add_dev() and call it from vxlan_stop() to match the call from vxlan_open(). Fixes: 56ef9c909b40 ("vxlan: Move socket initialization to within rtnl scope") Acked-by: Jiri Benc Tested-by: Roi Dayan Signed-off-by: Mark Bloch Acked-by: Roopa Prabhu Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/vxlan.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 3331b5e5b4ce..55c4408892be 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -59,6 +59,8 @@ static const u8 all_zeros_mac[ETH_ALEN + 2]; static int vxlan_sock_add(struct vxlan_dev *vxlan); +static void vxlan_vs_del_dev(struct vxlan_dev *vxlan); + /* per-network namespace private data for this module */ struct vxlan_net { struct list_head vxlan_list; @@ -1040,6 +1042,8 @@ static void vxlan_sock_release(struct vxlan_dev *vxlan) rcu_assign_pointer(vxlan->vn4_sock, NULL); synchronize_net(); + vxlan_vs_del_dev(vxlan); + if (__vxlan_sock_release_prep(sock4)) { udp_tunnel_sock_release(sock4->sock); kfree(sock4); @@ -2300,6 +2304,15 @@ static void vxlan_cleanup(unsigned long arg) mod_timer(&vxlan->age_timer, next_timer); } +static void vxlan_vs_del_dev(struct vxlan_dev *vxlan) +{ + struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); + + spin_lock(&vn->sock_lock); + hlist_del_init_rcu(&vxlan->hlist); + spin_unlock(&vn->sock_lock); +} + static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan) { struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); @@ -3070,12 +3083,6 @@ static int vxlan_newlink(struct net *src_net, struct net_device *dev, static void vxlan_dellink(struct net_device *dev, struct list_head *head) { struct vxlan_dev *vxlan = netdev_priv(dev); - struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); - - spin_lock(&vn->sock_lock); - if (!hlist_unhashed(&vxlan->hlist)) - hlist_del_rcu(&vxlan->hlist); - spin_unlock(&vn->sock_lock); gro_cells_destroy(&vxlan->gro_cells); list_del(&vxlan->next); From 599a4478d8cb7ad19bd946c878d59860c5df7875 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 4 Jun 2017 21:41:10 -0400 Subject: [PATCH 115/235] ipv6: Fix leak in ipv6_gso_segment(). [ Upstream commit e3e86b5119f81e5e2499bea7ea1ebe8ac6aab789 ] If ip6_find_1stfragopt() fails and we return an error we have to free up 'segs' because nobody else is going to. Fixes: 2423496af35d ("ipv6: Prevent overrun when parsing v6 header options") Reported-by: Ben Hutchings Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/ip6_offload.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 013086b248e2..424fbe1f8978 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -116,8 +116,10 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, if (udpfrag) { int err = ip6_find_1stfragopt(skb, &prevhdr); - if (err < 0) + if (err < 0) { + kfree_skb_list(segs); return ERR_PTR(err); + } fptr = (struct frag_hdr *)((u8 *)ipv6h + err); fptr->frag_off = htons(offset); if (skb->next) From 0aa89f1b07de1e8a54f9073b0a48266e864c647e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 3 Jun 2017 09:29:25 -0700 Subject: [PATCH 116/235] net: ping: do not abuse udp_poll() [ Upstream commit 77d4b1d36926a9b8387c6b53eeba42bcaaffcea3 ] Alexander reported various KASAN messages triggered in recent kernels The problem is that ping sockets should not use udp_poll() in the first place, and recent changes in UDP stack finally exposed this old bug. Fixes: c319b4d76b9e ("net: ipv4: add IPPROTO_ICMP socket kind") Fixes: 6d0bfe226116 ("net: ipv6: Add IPv6 support to the ping socket.") Signed-off-by: Eric Dumazet Reported-by: Sasha Levin Cc: Solar Designer Cc: Vasiliy Kulikov Cc: Lorenzo Colitti Acked-By: Lorenzo Colitti Tested-By: Lorenzo Colitti Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/ipv6.h | 1 + net/ipv4/af_inet.c | 2 +- net/ipv6/ping.c | 2 +- net/ipv6/raw.c | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 7f15f95625e7..91afb4aadaa6 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1001,6 +1001,7 @@ int inet6_hash_connect(struct inet_timewait_death_row *death_row, */ extern const struct proto_ops inet6_stream_ops; extern const struct proto_ops inet6_dgram_ops; +extern const struct proto_ops inet6_sockraw_ops; struct group_source_req; struct group_filter; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 971b9471d427..f60fe82c2c1e 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1015,7 +1015,7 @@ static struct inet_protosw inetsw_array[] = .type = SOCK_DGRAM, .protocol = IPPROTO_ICMP, .prot = &ping_prot, - .ops = &inet_dgram_ops, + .ops = &inet_sockraw_ops, .flags = INET_PROTOSW_REUSE, }, diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 66e2d9dfc43a..982868193dbb 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -198,7 +198,7 @@ static struct inet_protosw pingv6_protosw = { .type = SOCK_DGRAM, .protocol = IPPROTO_ICMPV6, .prot = &pingv6_prot, - .ops = &inet6_dgram_ops, + .ops = &inet6_sockraw_ops, .flags = INET_PROTOSW_REUSE, }; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 1a2fe5c3a366..71ffa526cb23 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1330,7 +1330,7 @@ void raw6_proc_exit(void) #endif /* CONFIG_PROC_FS */ /* Same as inet6_dgram_ops, sans udp_poll. */ -static const struct proto_ops inet6_sockraw_ops = { +const struct proto_ops inet6_sockraw_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, From a97f807363d4f3c8c849a0e12064305d5bb5a6dc Mon Sep 17 00:00:00 2001 From: Richard Haines Date: Mon, 5 Jun 2017 16:44:40 +0100 Subject: [PATCH 117/235] net/ipv6: Fix CALIPSO causing GPF with datagram support [ Upstream commit e3ebdb20fddacded2740a333ff66781e0d28b05c ] When using CALIPSO with IPPROTO_UDP it is possible to trigger a GPF as the IP header may have moved. Also update the payload length after adding the CALIPSO option. Signed-off-by: Richard Haines Acked-by: Paul Moore Signed-off-by: Huw Davies Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/calipso.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c index 37ac9de713c6..8d772fea1dde 100644 --- a/net/ipv6/calipso.c +++ b/net/ipv6/calipso.c @@ -1319,7 +1319,7 @@ static int calipso_skbuff_setattr(struct sk_buff *skb, struct ipv6hdr *ip6_hdr; struct ipv6_opt_hdr *hop; unsigned char buf[CALIPSO_MAX_BUFFER]; - int len_delta, new_end, pad; + int len_delta, new_end, pad, payload; unsigned int start, end; ip6_hdr = ipv6_hdr(skb); @@ -1346,6 +1346,8 @@ static int calipso_skbuff_setattr(struct sk_buff *skb, if (ret_val < 0) return ret_val; + ip6_hdr = ipv6_hdr(skb); /* Reset as skb_cow() may have moved it */ + if (len_delta) { if (len_delta > 0) skb_push(skb, len_delta); @@ -1355,6 +1357,8 @@ static int calipso_skbuff_setattr(struct sk_buff *skb, sizeof(*ip6_hdr) + start); skb_reset_network_header(skb); ip6_hdr = ipv6_hdr(skb); + payload = ntohs(ip6_hdr->payload_len); + ip6_hdr->payload_len = htons(payload + len_delta); } hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1); From a83564d128637c99d454cc1918b69ca84b9ce688 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Mon, 5 Jun 2017 18:31:16 -0700 Subject: [PATCH 118/235] net: ethoc: enable NAPI before poll may be scheduled [ Upstream commit d220b942a4b6a0640aee78841608f4aa5e8e185e ] ethoc_reset enables device interrupts, ethoc_interrupt may schedule a NAPI poll before NAPI is enabled in the ethoc_open, which results in device being unable to send or receive anything until it's closed and reopened. In case the device is flooded with ingress packets it may be unable to recover at all. Move napi_enable above ethoc_reset in the ethoc_open to fix that. Fixes: a1702857724f ("net: Add support for the OpenCores 10/100 Mbps Ethernet MAC.") Signed-off-by: Max Filippov Reviewed-by: Tobias Klauser Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/ethoc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ethoc.c b/drivers/net/ethernet/ethoc.c index c044667a0a25..e31199f3048c 100644 --- a/drivers/net/ethernet/ethoc.c +++ b/drivers/net/ethernet/ethoc.c @@ -710,6 +710,8 @@ static int ethoc_open(struct net_device *dev) if (ret) return ret; + napi_enable(&priv->napi); + ethoc_init_ring(priv, dev->mem_start); ethoc_reset(priv); @@ -722,7 +724,6 @@ static int ethoc_open(struct net_device *dev) } phy_start(dev->phydev); - napi_enable(&priv->napi); if (netif_msg_ifup(priv)) { dev_info(&dev->dev, "I/O: %08lx Memory: %08lx-%08lx\n", From 3dd4daf112fd0c29e35b5f08af25ad93920e61aa Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Tue, 6 Jun 2017 09:25:00 +0200 Subject: [PATCH 119/235] net: stmmac: fix completely hung TX when using TSO [ Upstream commit 426849e6611f2092553f8d53372ae310818a6292 ] stmmac_tso_allocator can fail to set the Last Descriptor bit on a descriptor that actually was the last descriptor. This happens when the buffer of the last descriptor ends up having a size of exactly TSO_MAX_BUFF_SIZE. When the IP eventually reaches the next last descriptor, which actually has the bit set, the DMA will hang. When the DMA hangs, we get a tx timeout, however, since stmmac does not do a complete reset of the IP in stmmac_tx_timeout, we end up in a state with completely hung TX. Signed-off-by: Niklas Cassel Acked-by: Giuseppe Cavallaro Acked-by: Alexandre TORGUE Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index b2893fbe25e5..ef6bff820cf6 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1953,7 +1953,7 @@ static void stmmac_tso_allocator(struct stmmac_priv *priv, unsigned int des, priv->hw->desc->prepare_tso_tx_desc(desc, 0, buff_size, 0, 1, - (last_segment) && (buff_size < TSO_MAX_BUFF_SIZE), + (last_segment) && (tmp_len <= TSO_MAX_BUFF_SIZE), 0, 0); tmp_len -= TSO_MAX_BUFF_SIZE; From 0255284edddcfbba86c8997da2836a9bafc0354a Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 1 Jun 2017 18:07:55 +0300 Subject: [PATCH 120/235] net: bridge: start hello timer only if device is up [ Upstream commit aeb073241fe7a2b932e04e20c60e47718332877f ] When the transition of NO_STP -> KERNEL_STP was fixed by always calling mod_timer in br_stp_start, it introduced a new regression which causes the timer to be armed even when the bridge is down, and since we stop the timers in its ndo_stop() function, they never get disabled if the device is destroyed before it's upped. To reproduce: $ while :; do ip l add br0 type bridge hello_time 100; brctl stp br0 on; ip l del br0; done; CC: Xin Long CC: Ivan Vecera CC: Sebastian Ott Reported-by: Sebastian Ott Fixes: 6d18c732b95c ("bridge: start hello_timer when enabling KERNEL_STP in br_stp_start") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/bridge/br_stp_if.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index 5a782f543aff..16b5aa9a91f1 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -185,7 +185,8 @@ static void br_stp_start(struct net_bridge *br) br_debug(br, "using kernel STP\n"); /* To start timers on any ports left in blocking */ - mod_timer(&br->hello_timer, jiffies + br->hello_time); + if (br->dev->flags & IFF_UP) + mod_timer(&br->hello_timer, jiffies + br->hello_time); br_port_state_selection(br); } From 4b684e6474d0137d00c555b51f9bb4299f3e29bb Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 5 Jun 2017 11:28:57 -0700 Subject: [PATCH 121/235] sparc64: Add __multi3 for gcc 7.x and later. [ Upstream commit 1b4af13ff2cc6897557bb0b8d9e2fad4fa4d67aa ] Reported-by: Waldemar Brodkorb Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- arch/sparc/lib/Makefile | 1 + arch/sparc/lib/multi3.S | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) create mode 100644 arch/sparc/lib/multi3.S diff --git a/arch/sparc/lib/Makefile b/arch/sparc/lib/Makefile index 69912d2f8b54..07c03e72d812 100644 --- a/arch/sparc/lib/Makefile +++ b/arch/sparc/lib/Makefile @@ -15,6 +15,7 @@ lib-$(CONFIG_SPARC32) += copy_user.o locks.o lib-$(CONFIG_SPARC64) += atomic_64.o lib-$(CONFIG_SPARC32) += lshrdi3.o ashldi3.o lib-$(CONFIG_SPARC32) += muldi3.o bitext.o cmpdi2.o +lib-$(CONFIG_SPARC64) += multi3.o lib-$(CONFIG_SPARC64) += copy_page.o clear_page.o bzero.o lib-$(CONFIG_SPARC64) += csum_copy.o csum_copy_from_user.o csum_copy_to_user.o diff --git a/arch/sparc/lib/multi3.S b/arch/sparc/lib/multi3.S new file mode 100644 index 000000000000..d6b6c97fe3c7 --- /dev/null +++ b/arch/sparc/lib/multi3.S @@ -0,0 +1,35 @@ +#include +#include + + .text + .align 4 +ENTRY(__multi3) /* %o0 = u, %o1 = v */ + mov %o1, %g1 + srl %o3, 0, %g4 + mulx %g4, %g1, %o1 + srlx %g1, 0x20, %g3 + mulx %g3, %g4, %g5 + sllx %g5, 0x20, %o5 + srl %g1, 0, %g4 + sub %o1, %o5, %o5 + srlx %o5, 0x20, %o5 + addcc %g5, %o5, %g5 + srlx %o3, 0x20, %o5 + mulx %g4, %o5, %g4 + mulx %g3, %o5, %o5 + sethi %hi(0x80000000), %g3 + addcc %g5, %g4, %g5 + srlx %g5, 0x20, %g5 + add %g3, %g3, %g3 + movcc %xcc, %g0, %g3 + addcc %o5, %g5, %o5 + sllx %g4, 0x20, %g4 + add %o1, %g4, %o1 + add %o5, %g3, %g2 + mulx %g1, %o2, %g1 + add %g1, %g2, %g1 + mulx %o0, %o3, %o0 + retl + add %g1, %o0, %o0 +ENDPROC(__multi3) +EXPORT_SYMBOL(__multi3) From 8d665e039e6621aa9449f22d74f81ca803a748c9 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 2 Jun 2017 14:51:12 -0700 Subject: [PATCH 122/235] sparc64: mm: fix copy_tsb to correctly copy huge page TSBs [ Upstream commit 654f4807624a657f364417c2a7454f0df9961734 ] When a TSB grows beyond its current capacity, a new TSB is allocated and copy_tsb is called to copy entries from the old TSB to the new. A hash shift based on page size is used to calculate the index of an entry in the TSB. copy_tsb has hard coded PAGE_SHIFT in these calculations. However, for huge page TSBs the value REAL_HPAGE_SHIFT should be used. As a result, when copy_tsb is called for a huge page TSB the entries are placed at the incorrect index in the newly allocated TSB. When doing hardware table walk, the MMU does not match these entries and we end up in the TSB miss handling code. This code will then create and write an entry to the correct index in the TSB. We take a performance hit for the table walk miss and recreation of these entries. Pass a new parameter to copy_tsb that is the page size shift to be used when copying the TSB. Suggested-by: Anthony Yznaga Signed-off-by: Mike Kravetz Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- arch/sparc/kernel/tsb.S | 11 +++++++---- arch/sparc/mm/tsb.c | 7 +++++-- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/arch/sparc/kernel/tsb.S b/arch/sparc/kernel/tsb.S index d568c8207af7..395ec1800530 100644 --- a/arch/sparc/kernel/tsb.S +++ b/arch/sparc/kernel/tsb.S @@ -470,13 +470,16 @@ __tsb_context_switch: .type copy_tsb,#function copy_tsb: /* %o0=old_tsb_base, %o1=old_tsb_size * %o2=new_tsb_base, %o3=new_tsb_size + * %o4=page_size_shift */ sethi %uhi(TSB_PASS_BITS), %g7 srlx %o3, 4, %o3 - add %o0, %o1, %g1 /* end of old tsb */ + add %o0, %o1, %o1 /* end of old tsb */ sllx %g7, 32, %g7 sub %o3, 1, %o3 /* %o3 == new tsb hash mask */ + mov %o4, %g1 /* page_size_shift */ + 661: prefetcha [%o0] ASI_N, #one_read .section .tsb_phys_patch, "ax" .word 661b @@ -501,9 +504,9 @@ copy_tsb: /* %o0=old_tsb_base, %o1=old_tsb_size /* This can definitely be computed faster... */ srlx %o0, 4, %o5 /* Build index */ and %o5, 511, %o5 /* Mask index */ - sllx %o5, PAGE_SHIFT, %o5 /* Put into vaddr position */ + sllx %o5, %g1, %o5 /* Put into vaddr position */ or %o4, %o5, %o4 /* Full VADDR. */ - srlx %o4, PAGE_SHIFT, %o4 /* Shift down to create index */ + srlx %o4, %g1, %o4 /* Shift down to create index */ and %o4, %o3, %o4 /* Mask with new_tsb_nents-1 */ sllx %o4, 4, %o4 /* Shift back up into tsb ent offset */ TSB_STORE(%o2 + %o4, %g2) /* Store TAG */ @@ -511,7 +514,7 @@ copy_tsb: /* %o0=old_tsb_base, %o1=old_tsb_size TSB_STORE(%o2 + %o4, %g3) /* Store TTE */ 80: add %o0, 16, %o0 - cmp %o0, %g1 + cmp %o0, %o1 bne,pt %xcc, 90b nop diff --git a/arch/sparc/mm/tsb.c b/arch/sparc/mm/tsb.c index e20fbbafb0b0..84cd593117a6 100644 --- a/arch/sparc/mm/tsb.c +++ b/arch/sparc/mm/tsb.c @@ -451,7 +451,8 @@ retry_tsb_alloc: extern void copy_tsb(unsigned long old_tsb_base, unsigned long old_tsb_size, unsigned long new_tsb_base, - unsigned long new_tsb_size); + unsigned long new_tsb_size, + unsigned long page_size_shift); unsigned long old_tsb_base = (unsigned long) old_tsb; unsigned long new_tsb_base = (unsigned long) new_tsb; @@ -459,7 +460,9 @@ retry_tsb_alloc: old_tsb_base = __pa(old_tsb_base); new_tsb_base = __pa(new_tsb_base); } - copy_tsb(old_tsb_base, old_size, new_tsb_base, new_size); + copy_tsb(old_tsb_base, old_size, new_tsb_base, new_size, + tsb_index == MM_TSB_BASE ? + PAGE_SHIFT : REAL_HPAGE_SHIFT); } mm->context.tsb_block[tsb_index].tsb = new_tsb; From b3ad7a3e57b3c1af5c15a16b3f6c05f943035f1d Mon Sep 17 00:00:00 2001 From: James Clarke Date: Mon, 29 May 2017 20:17:56 +0100 Subject: [PATCH 123/235] sparc: Machine description indices can vary [ Upstream commit c982aa9c304bf0b9a7522fd118fed4afa5a0263c ] VIO devices were being looked up by their index in the machine description node block, but this often varies over time as devices are added and removed. Instead, store the ID and look up using the type, config handle and ID. Signed-off-by: James Clarke Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=112541 Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- arch/sparc/include/asm/vio.h | 1 + arch/sparc/kernel/vio.c | 68 +++++++++++++++++++++++++++++++++--- 2 files changed, 65 insertions(+), 4 deletions(-) diff --git a/arch/sparc/include/asm/vio.h b/arch/sparc/include/asm/vio.h index 8174f6cdbbbb..9dca7a892978 100644 --- a/arch/sparc/include/asm/vio.h +++ b/arch/sparc/include/asm/vio.h @@ -327,6 +327,7 @@ struct vio_dev { int compat_len; u64 dev_no; + u64 id; unsigned long channel_id; diff --git a/arch/sparc/kernel/vio.c b/arch/sparc/kernel/vio.c index f6bb857254fc..075d38980dee 100644 --- a/arch/sparc/kernel/vio.c +++ b/arch/sparc/kernel/vio.c @@ -302,13 +302,16 @@ static struct vio_dev *vio_create_one(struct mdesc_handle *hp, u64 mp, if (!id) { dev_set_name(&vdev->dev, "%s", bus_id_name); vdev->dev_no = ~(u64)0; + vdev->id = ~(u64)0; } else if (!cfg_handle) { dev_set_name(&vdev->dev, "%s-%llu", bus_id_name, *id); vdev->dev_no = *id; + vdev->id = ~(u64)0; } else { dev_set_name(&vdev->dev, "%s-%llu-%llu", bus_id_name, *cfg_handle, *id); vdev->dev_no = *cfg_handle; + vdev->id = *id; } vdev->dev.parent = parent; @@ -351,27 +354,84 @@ static void vio_add(struct mdesc_handle *hp, u64 node) (void) vio_create_one(hp, node, &root_vdev->dev); } +struct vio_md_node_query { + const char *type; + u64 dev_no; + u64 id; +}; + static int vio_md_node_match(struct device *dev, void *arg) { + struct vio_md_node_query *query = (struct vio_md_node_query *) arg; struct vio_dev *vdev = to_vio_dev(dev); - if (vdev->mp == (u64) arg) - return 1; + if (vdev->dev_no != query->dev_no) + return 0; + if (vdev->id != query->id) + return 0; + if (strcmp(vdev->type, query->type)) + return 0; - return 0; + return 1; } static void vio_remove(struct mdesc_handle *hp, u64 node) { + const char *type; + const u64 *id, *cfg_handle; + u64 a; + struct vio_md_node_query query; struct device *dev; - dev = device_find_child(&root_vdev->dev, (void *) node, + type = mdesc_get_property(hp, node, "device-type", NULL); + if (!type) { + type = mdesc_get_property(hp, node, "name", NULL); + if (!type) + type = mdesc_node_name(hp, node); + } + + query.type = type; + + id = mdesc_get_property(hp, node, "id", NULL); + cfg_handle = NULL; + mdesc_for_each_arc(a, hp, node, MDESC_ARC_TYPE_BACK) { + u64 target; + + target = mdesc_arc_target(hp, a); + cfg_handle = mdesc_get_property(hp, target, + "cfg-handle", NULL); + if (cfg_handle) + break; + } + + if (!id) { + query.dev_no = ~(u64)0; + query.id = ~(u64)0; + } else if (!cfg_handle) { + query.dev_no = *id; + query.id = ~(u64)0; + } else { + query.dev_no = *cfg_handle; + query.id = *id; + } + + dev = device_find_child(&root_vdev->dev, &query, vio_md_node_match); if (dev) { printk(KERN_INFO "VIO: Removing device %s\n", dev_name(dev)); device_unregister(dev); put_device(dev); + } else { + if (!id) + printk(KERN_ERR "VIO: Removed unknown %s node.\n", + type); + else if (!cfg_handle) + printk(KERN_ERR "VIO: Removed unknown %s node %llu.\n", + type, *id); + else + printk(KERN_ERR "VIO: Removed unknown %s node %llu-%llu.\n", + type, *cfg_handle, *id); } } From e7590a1b15c816db318d941eee24111798d655fd Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Wed, 31 May 2017 11:25:20 -0400 Subject: [PATCH 124/235] sparc64: reset mm cpumask after wrap [ Upstream commit 588974857359861891f478a070b1dc7ae04a3880 ] After a wrap (getting a new context version) a process must get a new context id, which means that we would need to flush the context id from the TLB before running for the first time with this ID on every CPU. But, we use mm_cpumask to determine if this process has been running on this CPU before, and this mask is not reset after a wrap. So, there are two possible fixes for this issue: 1. Clear mm cpumask whenever mm gets a new context id 2. Unconditionally flush context every time process is running on a CPU This patch implements the first solution Signed-off-by: Pavel Tatashin Reviewed-by: Bob Picco Reviewed-by: Steven Sistare Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- arch/sparc/mm/init_64.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index bd7e2aa86c45..f9c0eee9577d 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -710,6 +710,8 @@ void get_new_mmu_context(struct mm_struct *mm) goto out; } } + if (mm->context.sparc64_ctx_val) + cpumask_clear(mm_cpumask(mm)); mmu_context_bmap[new_ctx>>6] |= (1UL << (new_ctx & 63)); new_ctx |= (tlb_context_cache & CTX_VERSION_MASK); out: From 65e3443b61a889b02e33537f4bd94efb23c31592 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Wed, 31 May 2017 11:25:21 -0400 Subject: [PATCH 125/235] sparc64: combine activate_mm and switch_mm [ Upstream commit 14d0334c6748ff2aedb3f2f7fdc51ee90a9b54e7 ] The only difference between these two functions is that in activate_mm we unconditionally flush context. However, there is no need to keep this difference after fixing a bug where cpumask was not reset on a wrap. So, in this patch we combine these. Signed-off-by: Pavel Tatashin Reviewed-by: Bob Picco Reviewed-by: Steven Sistare Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- arch/sparc/include/asm/mmu_context_64.h | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/arch/sparc/include/asm/mmu_context_64.h b/arch/sparc/include/asm/mmu_context_64.h index b84be675e507..266662a897ac 100644 --- a/arch/sparc/include/asm/mmu_context_64.h +++ b/arch/sparc/include/asm/mmu_context_64.h @@ -131,26 +131,7 @@ static inline void switch_mm(struct mm_struct *old_mm, struct mm_struct *mm, str } #define deactivate_mm(tsk,mm) do { } while (0) - -/* Activate a new MM instance for the current task. */ -static inline void activate_mm(struct mm_struct *active_mm, struct mm_struct *mm) -{ - unsigned long flags; - int cpu; - - spin_lock_irqsave(&mm->context.lock, flags); - if (!CTX_VALID(mm->context)) - get_new_mmu_context(mm); - cpu = smp_processor_id(); - if (!cpumask_test_cpu(cpu, mm_cpumask(mm))) - cpumask_set_cpu(cpu, mm_cpumask(mm)); - - load_secondary_context(mm); - __flush_tlb_mm(CTX_HWBITS(mm->context), SECONDARY_CONTEXT); - tsb_context_switch(mm); - spin_unlock_irqrestore(&mm->context.lock, flags); -} - +#define activate_mm(active_mm, mm) switch_mm(active_mm, mm, NULL) #endif /* !(__ASSEMBLY__) */ #endif /* !(__SPARC64_MMU_CONTEXT_H) */ From 7932bfad0e826c03dfef488cda89143697c81dfe Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Wed, 31 May 2017 11:25:22 -0400 Subject: [PATCH 126/235] sparc64: redefine first version [ Upstream commit c4415235b2be0cc791572e8e7f7466ab8f73a2bf ] CTX_FIRST_VERSION defines the first context version, but also it defines first context. This patch redefines it to only include the first context version. Signed-off-by: Pavel Tatashin Reviewed-by: Bob Picco Reviewed-by: Steven Sistare Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- arch/sparc/include/asm/mmu_64.h | 2 +- arch/sparc/mm/init_64.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/sparc/include/asm/mmu_64.h b/arch/sparc/include/asm/mmu_64.h index f7de0dbc38af..83b36a5371ff 100644 --- a/arch/sparc/include/asm/mmu_64.h +++ b/arch/sparc/include/asm/mmu_64.h @@ -52,7 +52,7 @@ #define CTX_NR_MASK TAG_CONTEXT_BITS #define CTX_HW_MASK (CTX_NR_MASK | CTX_PGSZ_MASK) -#define CTX_FIRST_VERSION ((_AC(1,UL) << CTX_VERSION_SHIFT) + _AC(1,UL)) +#define CTX_FIRST_VERSION BIT(CTX_VERSION_SHIFT) #define CTX_VALID(__ctx) \ (!(((__ctx.sparc64_ctx_val) ^ tlb_context_cache) & CTX_VERSION_MASK)) #define CTX_HWBITS(__ctx) ((__ctx.sparc64_ctx_val) & CTX_HW_MASK) diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index f9c0eee9577d..ccbaa34f11e1 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -658,7 +658,7 @@ EXPORT_SYMBOL(__flush_dcache_range); /* get_new_mmu_context() uses "cache + 1". */ DEFINE_SPINLOCK(ctx_alloc_lock); -unsigned long tlb_context_cache = CTX_FIRST_VERSION - 1; +unsigned long tlb_context_cache = CTX_FIRST_VERSION; #define MAX_CTX_NR (1UL << CTX_NR_BITS) #define CTX_BMAP_SLOTS BITS_TO_LONGS(MAX_CTX_NR) DECLARE_BITMAP(mmu_context_bmap, MAX_CTX_NR); @@ -689,9 +689,9 @@ void get_new_mmu_context(struct mm_struct *mm) if (new_ctx >= ctx) { int i; new_ctx = (tlb_context_cache & CTX_VERSION_MASK) + - CTX_FIRST_VERSION; + CTX_FIRST_VERSION + 1; if (new_ctx == 1) - new_ctx = CTX_FIRST_VERSION; + new_ctx = CTX_FIRST_VERSION + 1; /* Don't call memset, for 16 entries that's just * plain silly... From 975f3cdc39154ccf7f339185883ef55933b85a77 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Wed, 31 May 2017 11:25:23 -0400 Subject: [PATCH 127/235] sparc64: add per-cpu mm of secondary contexts [ Upstream commit 7a5b4bbf49fe86ce77488a70c5dccfe2d50d7a2d ] The new wrap is going to use information from this array to figure out mm's that currently have valid secondary contexts setup. Signed-off-by: Pavel Tatashin Reviewed-by: Bob Picco Reviewed-by: Steven Sistare Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- arch/sparc/include/asm/mmu_context_64.h | 5 +++-- arch/sparc/mm/init_64.c | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/sparc/include/asm/mmu_context_64.h b/arch/sparc/include/asm/mmu_context_64.h index 266662a897ac..eb410865c8de 100644 --- a/arch/sparc/include/asm/mmu_context_64.h +++ b/arch/sparc/include/asm/mmu_context_64.h @@ -17,6 +17,7 @@ extern spinlock_t ctx_alloc_lock; extern unsigned long tlb_context_cache; extern unsigned long mmu_context_bmap[]; +DECLARE_PER_CPU(struct mm_struct *, per_cpu_secondary_mm); void get_new_mmu_context(struct mm_struct *mm); #ifdef CONFIG_SMP void smp_new_mmu_context_version(void); @@ -74,8 +75,9 @@ void __flush_tlb_mm(unsigned long, unsigned long); static inline void switch_mm(struct mm_struct *old_mm, struct mm_struct *mm, struct task_struct *tsk) { unsigned long ctx_valid, flags; - int cpu; + int cpu = smp_processor_id(); + per_cpu(per_cpu_secondary_mm, cpu) = mm; if (unlikely(mm == &init_mm)) return; @@ -121,7 +123,6 @@ static inline void switch_mm(struct mm_struct *old_mm, struct mm_struct *mm, str * for the first time, we must flush that context out of the * local TLB. */ - cpu = smp_processor_id(); if (!ctx_valid || !cpumask_test_cpu(cpu, mm_cpumask(mm))) { cpumask_set_cpu(cpu, mm_cpumask(mm)); __flush_tlb_mm(CTX_HWBITS(mm->context), diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index ccbaa34f11e1..d5cebdb37a4b 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -662,6 +662,7 @@ unsigned long tlb_context_cache = CTX_FIRST_VERSION; #define MAX_CTX_NR (1UL << CTX_NR_BITS) #define CTX_BMAP_SLOTS BITS_TO_LONGS(MAX_CTX_NR) DECLARE_BITMAP(mmu_context_bmap, MAX_CTX_NR); +DEFINE_PER_CPU(struct mm_struct *, per_cpu_secondary_mm) = {0}; /* Caller does TLB context flushing on local CPU if necessary. * The caller also ensures that CTX_VALID(mm->context) is false. From b6bb22de0c9ee64926af8509d4d32a5c4b26b5bd Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Wed, 31 May 2017 11:25:24 -0400 Subject: [PATCH 128/235] sparc64: new context wrap [ Upstream commit a0582f26ec9dfd5360ea2f35dd9a1b026f8adda0 ] The current wrap implementation has a race issue: it is called outside of the ctx_alloc_lock, and also does not wait for all CPUs to complete the wrap. This means that a thread can get a new context with a new version and another thread might still be running with the same context. The problem is especially severe on CPUs with shared TLBs, like sun4v. I used the following test to very quickly reproduce the problem: - start over 8K processes (must be more than context IDs) - write and read values at a memory location in every process. Very quickly memory corruptions start happening, and what we read back does not equal what we wrote. Several approaches were explored before settling on this one: Approach 1: Move smp_new_mmu_context_version() inside ctx_alloc_lock, and wait for every process to complete the wrap. (Note: every CPU must WAIT before leaving smp_new_mmu_context_version_client() until every one arrives). This approach ends up with deadlocks, as some threads own locks which other threads are waiting for, and they never receive softint until these threads exit smp_new_mmu_context_version_client(). Since we do not allow the exit, deadlock happens. Approach 2: Handle wrap right during mondo interrupt. Use etrap/rtrap to enter into into C code, and issue new versions to every CPU. This approach adds some overhead to runtime: in switch_mm() we must add some checks to make sure that versions have not changed due to wrap while we were loading the new secondary context. (could be protected by PSTATE_IE but that degrades performance as on M7 and older CPUs as it takes 50 cycles for each access). Also, we still need a global per-cpu array of MMs to know where we need to load new contexts, otherwise we can change context to a thread that is going way (if we received mondo between switch_mm() and switch_to() time). Finally, there are some issues with window registers in rtrap() when context IDs are changed during CPU mondo time. The approach in this patch is the simplest and has almost no impact on runtime. We use the array with mm's where last secondary contexts were loaded onto CPUs and bump their versions to the new generation without changing context IDs. If a new process comes in to get a context ID, it will go through get_new_mmu_context() because of version mismatch. But the running processes do not need to be interrupted. And wrap is quicker as we do not need to xcall and wait for everyone to receive and complete wrap. Signed-off-by: Pavel Tatashin Reviewed-by: Bob Picco Reviewed-by: Steven Sistare Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- arch/sparc/mm/init_64.c | 81 +++++++++++++++++++++++++++-------------- 1 file changed, 54 insertions(+), 27 deletions(-) diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index d5cebdb37a4b..57154c638e71 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -664,6 +664,53 @@ unsigned long tlb_context_cache = CTX_FIRST_VERSION; DECLARE_BITMAP(mmu_context_bmap, MAX_CTX_NR); DEFINE_PER_CPU(struct mm_struct *, per_cpu_secondary_mm) = {0}; +static void mmu_context_wrap(void) +{ + unsigned long old_ver = tlb_context_cache & CTX_VERSION_MASK; + unsigned long new_ver, new_ctx, old_ctx; + struct mm_struct *mm; + int cpu; + + bitmap_zero(mmu_context_bmap, 1 << CTX_NR_BITS); + + /* Reserve kernel context */ + set_bit(0, mmu_context_bmap); + + new_ver = (tlb_context_cache & CTX_VERSION_MASK) + CTX_FIRST_VERSION; + if (unlikely(new_ver == 0)) + new_ver = CTX_FIRST_VERSION; + tlb_context_cache = new_ver; + + /* + * Make sure that any new mm that are added into per_cpu_secondary_mm, + * are going to go through get_new_mmu_context() path. + */ + mb(); + + /* + * Updated versions to current on those CPUs that had valid secondary + * contexts + */ + for_each_online_cpu(cpu) { + /* + * If a new mm is stored after we took this mm from the array, + * it will go into get_new_mmu_context() path, because we + * already bumped the version in tlb_context_cache. + */ + mm = per_cpu(per_cpu_secondary_mm, cpu); + + if (unlikely(!mm || mm == &init_mm)) + continue; + + old_ctx = mm->context.sparc64_ctx_val; + if (likely((old_ctx & CTX_VERSION_MASK) == old_ver)) { + new_ctx = (old_ctx & ~CTX_VERSION_MASK) | new_ver; + set_bit(new_ctx & CTX_NR_MASK, mmu_context_bmap); + mm->context.sparc64_ctx_val = new_ctx; + } + } +} + /* Caller does TLB context flushing on local CPU if necessary. * The caller also ensures that CTX_VALID(mm->context) is false. * @@ -678,50 +725,30 @@ void get_new_mmu_context(struct mm_struct *mm) { unsigned long ctx, new_ctx; unsigned long orig_pgsz_bits; - int new_version; spin_lock(&ctx_alloc_lock); +retry: + /* wrap might have happened, test again if our context became valid */ + if (unlikely(CTX_VALID(mm->context))) + goto out; orig_pgsz_bits = (mm->context.sparc64_ctx_val & CTX_PGSZ_MASK); ctx = (tlb_context_cache + 1) & CTX_NR_MASK; new_ctx = find_next_zero_bit(mmu_context_bmap, 1 << CTX_NR_BITS, ctx); - new_version = 0; if (new_ctx >= (1 << CTX_NR_BITS)) { new_ctx = find_next_zero_bit(mmu_context_bmap, ctx, 1); if (new_ctx >= ctx) { - int i; - new_ctx = (tlb_context_cache & CTX_VERSION_MASK) + - CTX_FIRST_VERSION + 1; - if (new_ctx == 1) - new_ctx = CTX_FIRST_VERSION + 1; - - /* Don't call memset, for 16 entries that's just - * plain silly... - */ - mmu_context_bmap[0] = 3; - mmu_context_bmap[1] = 0; - mmu_context_bmap[2] = 0; - mmu_context_bmap[3] = 0; - for (i = 4; i < CTX_BMAP_SLOTS; i += 4) { - mmu_context_bmap[i + 0] = 0; - mmu_context_bmap[i + 1] = 0; - mmu_context_bmap[i + 2] = 0; - mmu_context_bmap[i + 3] = 0; - } - new_version = 1; - goto out; + mmu_context_wrap(); + goto retry; } } if (mm->context.sparc64_ctx_val) cpumask_clear(mm_cpumask(mm)); mmu_context_bmap[new_ctx>>6] |= (1UL << (new_ctx & 63)); new_ctx |= (tlb_context_cache & CTX_VERSION_MASK); -out: tlb_context_cache = new_ctx; mm->context.sparc64_ctx_val = new_ctx | orig_pgsz_bits; +out: spin_unlock(&ctx_alloc_lock); - - if (unlikely(new_version)) - smp_new_mmu_context_version(); } static int numa_enabled = 1; From 433a50e681df28353d63bf91881cd2bf6a0ba172 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Wed, 31 May 2017 11:25:25 -0400 Subject: [PATCH 129/235] sparc64: delete old wrap code [ Upstream commit 0197e41ce70511dc3b71f7fefa1a676e2b5cd60b ] The old method that is using xcall and softint to get new context id is deleted, as it is replaced by a method of using per_cpu_secondary_mm without xcall to perform the context wrap. Signed-off-by: Pavel Tatashin Reviewed-by: Bob Picco Reviewed-by: Steven Sistare Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- arch/sparc/include/asm/mmu_context_64.h | 6 ----- arch/sparc/include/asm/pil.h | 1 - arch/sparc/kernel/kernel.h | 1 - arch/sparc/kernel/smp_64.c | 31 ------------------------- arch/sparc/kernel/ttable_64.S | 2 +- arch/sparc/mm/ultra.S | 5 ---- 6 files changed, 1 insertion(+), 45 deletions(-) diff --git a/arch/sparc/include/asm/mmu_context_64.h b/arch/sparc/include/asm/mmu_context_64.h index eb410865c8de..349dd23e2876 100644 --- a/arch/sparc/include/asm/mmu_context_64.h +++ b/arch/sparc/include/asm/mmu_context_64.h @@ -19,12 +19,6 @@ extern unsigned long mmu_context_bmap[]; DECLARE_PER_CPU(struct mm_struct *, per_cpu_secondary_mm); void get_new_mmu_context(struct mm_struct *mm); -#ifdef CONFIG_SMP -void smp_new_mmu_context_version(void); -#else -#define smp_new_mmu_context_version() do { } while (0) -#endif - int init_new_context(struct task_struct *tsk, struct mm_struct *mm); void destroy_context(struct mm_struct *mm); diff --git a/arch/sparc/include/asm/pil.h b/arch/sparc/include/asm/pil.h index 266937030546..522b43db2ed3 100644 --- a/arch/sparc/include/asm/pil.h +++ b/arch/sparc/include/asm/pil.h @@ -20,7 +20,6 @@ #define PIL_SMP_CALL_FUNC 1 #define PIL_SMP_RECEIVE_SIGNAL 2 #define PIL_SMP_CAPTURE 3 -#define PIL_SMP_CTX_NEW_VERSION 4 #define PIL_DEVICE_IRQ 5 #define PIL_SMP_CALL_FUNC_SNGL 6 #define PIL_DEFERRED_PCR_WORK 7 diff --git a/arch/sparc/kernel/kernel.h b/arch/sparc/kernel/kernel.h index c9804551262c..6ae1e77be0bf 100644 --- a/arch/sparc/kernel/kernel.h +++ b/arch/sparc/kernel/kernel.h @@ -37,7 +37,6 @@ void handle_stdfmna(struct pt_regs *regs, unsigned long sfar, unsigned long sfsr /* smp_64.c */ void __irq_entry smp_call_function_client(int irq, struct pt_regs *regs); void __irq_entry smp_call_function_single_client(int irq, struct pt_regs *regs); -void __irq_entry smp_new_mmu_context_version_client(int irq, struct pt_regs *regs); void __irq_entry smp_penguin_jailcell(int irq, struct pt_regs *regs); void __irq_entry smp_receive_signal_client(int irq, struct pt_regs *regs); diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index 8182f7caf5b1..d5807d24b98f 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -963,37 +963,6 @@ void flush_dcache_page_all(struct mm_struct *mm, struct page *page) preempt_enable(); } -void __irq_entry smp_new_mmu_context_version_client(int irq, struct pt_regs *regs) -{ - struct mm_struct *mm; - unsigned long flags; - - clear_softint(1 << irq); - - /* See if we need to allocate a new TLB context because - * the version of the one we are using is now out of date. - */ - mm = current->active_mm; - if (unlikely(!mm || (mm == &init_mm))) - return; - - spin_lock_irqsave(&mm->context.lock, flags); - - if (unlikely(!CTX_VALID(mm->context))) - get_new_mmu_context(mm); - - spin_unlock_irqrestore(&mm->context.lock, flags); - - load_secondary_context(mm); - __flush_tlb_mm(CTX_HWBITS(mm->context), - SECONDARY_CONTEXT); -} - -void smp_new_mmu_context_version(void) -{ - smp_cross_call(&xcall_new_mmu_context_version, 0, 0, 0); -} - #ifdef CONFIG_KGDB void kgdb_roundup_cpus(unsigned long flags) { diff --git a/arch/sparc/kernel/ttable_64.S b/arch/sparc/kernel/ttable_64.S index c6dfdaa29e20..170ead662f2a 100644 --- a/arch/sparc/kernel/ttable_64.S +++ b/arch/sparc/kernel/ttable_64.S @@ -50,7 +50,7 @@ tl0_resv03e: BTRAP(0x3e) BTRAP(0x3f) BTRAP(0x40) tl0_irq1: TRAP_IRQ(smp_call_function_client, 1) tl0_irq2: TRAP_IRQ(smp_receive_signal_client, 2) tl0_irq3: TRAP_IRQ(smp_penguin_jailcell, 3) -tl0_irq4: TRAP_IRQ(smp_new_mmu_context_version_client, 4) +tl0_irq4: BTRAP(0x44) #else tl0_irq1: BTRAP(0x41) tl0_irq2: BTRAP(0x42) diff --git a/arch/sparc/mm/ultra.S b/arch/sparc/mm/ultra.S index 5d2fd6cd3189..fcf4d27a38fb 100644 --- a/arch/sparc/mm/ultra.S +++ b/arch/sparc/mm/ultra.S @@ -971,11 +971,6 @@ xcall_capture: wr %g0, (1 << PIL_SMP_CAPTURE), %set_softint retry - .globl xcall_new_mmu_context_version -xcall_new_mmu_context_version: - wr %g0, (1 << PIL_SMP_CTX_NEW_VERSION), %set_softint - retry - #ifdef CONFIG_KGDB .globl xcall_kgdb_capture xcall_kgdb_capture: From 04ac452dadbf2eb8cc0fba82929e94c042f7ba38 Mon Sep 17 00:00:00 2001 From: Jane Chu Date: Tue, 6 Jun 2017 14:32:29 -0600 Subject: [PATCH 130/235] arch/sparc: support NR_CPUS = 4096 [ Upstream commit c79a13734d104b5b147d7cb0870276ccdd660dae ] Linux SPARC64 limits NR_CPUS to 4064 because init_cpu_send_mondo_info() only allocates a single page for NR_CPUS mondo entries. Thus we cannot use all 4096 CPUs on some SPARC platforms. To fix, allocate (2^order) pages where order is set according to the size of cpu_list for possible cpus. Since cpu_list_pa and cpu_mondo_block_pa are not used in asm code, there are no imm13 offsets from the base PA that will break because they can only reach one page. Orabug: 25505750 Signed-off-by: Jane Chu Reviewed-by: Bob Picco Reviewed-by: Atish Patra Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- arch/sparc/Kconfig | 4 ++-- arch/sparc/kernel/irq_64.c | 17 +++++++++++++---- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 165ecdd24d22..b27e48e25841 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -187,9 +187,9 @@ config NR_CPUS int "Maximum number of CPUs" depends on SMP range 2 32 if SPARC32 - range 2 1024 if SPARC64 + range 2 4096 if SPARC64 default 32 if SPARC32 - default 64 if SPARC64 + default 4096 if SPARC64 source kernel/Kconfig.hz diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c index 34a7930b76ef..e1b1ce63a328 100644 --- a/arch/sparc/kernel/irq_64.c +++ b/arch/sparc/kernel/irq_64.c @@ -1034,17 +1034,26 @@ static void __init init_cpu_send_mondo_info(struct trap_per_cpu *tb) { #ifdef CONFIG_SMP unsigned long page; + void *mondo, *p; - BUILD_BUG_ON((NR_CPUS * sizeof(u16)) > (PAGE_SIZE - 64)); + BUILD_BUG_ON((NR_CPUS * sizeof(u16)) > PAGE_SIZE); + + /* Make sure mondo block is 64byte aligned */ + p = kzalloc(127, GFP_KERNEL); + if (!p) { + prom_printf("SUN4V: Error, cannot allocate mondo block.\n"); + prom_halt(); + } + mondo = (void *)(((unsigned long)p + 63) & ~0x3f); + tb->cpu_mondo_block_pa = __pa(mondo); page = get_zeroed_page(GFP_KERNEL); if (!page) { - prom_printf("SUN4V: Error, cannot allocate cpu mondo page.\n"); + prom_printf("SUN4V: Error, cannot allocate cpu list page.\n"); prom_halt(); } - tb->cpu_mondo_block_pa = __pa(page); - tb->cpu_list_pa = __pa(page + 64); + tb->cpu_list_pa = __pa(page); #endif } From 3802abc6e0dfa892d7205fac199b0e2d68cec056 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 26 Apr 2017 12:24:21 +0200 Subject: [PATCH 131/235] serial: ifx6x60: fix use-after-free on module unload commit 1e948479b3d63e3ac0ecca13cbf4921c7d17c168 upstream. Make sure to deregister the SPI driver before releasing the tty driver to avoid use-after-free in the SPI remove callback where the tty devices are deregistered. Fixes: 72d4724ea54c ("serial: ifx6x60: Add modem power off function in the platform reboot process") Cc: Jun Chen Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/ifx6x60.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/ifx6x60.c b/drivers/tty/serial/ifx6x60.c index d386346248de..91d2ddd6ef88 100644 --- a/drivers/tty/serial/ifx6x60.c +++ b/drivers/tty/serial/ifx6x60.c @@ -1381,9 +1381,9 @@ static struct spi_driver ifx_spi_driver = { static void __exit ifx_spi_exit(void) { /* unregister */ + spi_unregister_driver(&ifx_spi_driver); tty_unregister_driver(tty_drv); put_tty_driver(tty_drv); - spi_unregister_driver(&ifx_spi_driver); unregister_reboot_notifier(&ifx_modem_reboot_notifier_block); } From 7c24a70c70b7a1ff71cbf410358c6c45daccdc74 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 22 May 2017 15:40:12 -0500 Subject: [PATCH 132/235] ptrace: Properly initialize ptracer_cred on fork commit c70d9d809fdeecedb96972457ee45c49a232d97f upstream. When I introduced ptracer_cred I failed to consider the weirdness of fork where the task_struct copies the old value by default. This winds up leaving ptracer_cred set even when a process forks and the child process does not wind up being ptraced. Because ptracer_cred is not set on non-ptraced processes whose parents were ptraced this has broken the ability of the enlightenment window manager to start setuid children. Fix this by properly initializing ptracer_cred in ptrace_init_task This must be done with a little bit of care to preserve the current value of ptracer_cred when ptrace carries through fork. Re-reading the ptracer_cred from the ptracing process at this point is inconsistent with how PT_PTRACE_CAP has been maintained all of these years. Tested-by: Takashi Iwai Fixes: 64b875f7ac8a ("ptrace: Capture the ptracer's creds not PT_PTRACE_CAP") Signed-off-by: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- include/linux/ptrace.h | 7 +++++-- kernel/ptrace.c | 20 +++++++++++++------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index e0e539321ab9..d53a23100401 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -53,7 +53,8 @@ extern int ptrace_request(struct task_struct *child, long request, unsigned long addr, unsigned long data); extern void ptrace_notify(int exit_code); extern void __ptrace_link(struct task_struct *child, - struct task_struct *new_parent); + struct task_struct *new_parent, + const struct cred *ptracer_cred); extern void __ptrace_unlink(struct task_struct *child); extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); #define PTRACE_MODE_READ 0x01 @@ -205,7 +206,7 @@ static inline void ptrace_init_task(struct task_struct *child, bool ptrace) if (unlikely(ptrace) && current->ptrace) { child->ptrace = current->ptrace; - __ptrace_link(child, current->parent); + __ptrace_link(child, current->parent, current->ptracer_cred); if (child->ptrace & PT_SEIZED) task_set_jobctl_pending(child, JOBCTL_TRAP_STOP); @@ -214,6 +215,8 @@ static inline void ptrace_init_task(struct task_struct *child, bool ptrace) set_tsk_thread_flag(child, TIF_SIGPENDING); } + else + child->ptracer_cred = NULL; } /** diff --git a/kernel/ptrace.c b/kernel/ptrace.c index a5caecef88be..f39a7be98fc1 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -57,19 +57,25 @@ int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, } +void __ptrace_link(struct task_struct *child, struct task_struct *new_parent, + const struct cred *ptracer_cred) +{ + BUG_ON(!list_empty(&child->ptrace_entry)); + list_add(&child->ptrace_entry, &new_parent->ptraced); + child->parent = new_parent; + child->ptracer_cred = get_cred(ptracer_cred); +} + /* * ptrace a task: make the debugger its new parent and * move it to the ptrace list. * * Must be called with the tasklist lock write-held. */ -void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) +static void ptrace_link(struct task_struct *child, struct task_struct *new_parent) { - BUG_ON(!list_empty(&child->ptrace_entry)); - list_add(&child->ptrace_entry, &new_parent->ptraced); - child->parent = new_parent; rcu_read_lock(); - child->ptracer_cred = get_cred(__task_cred(new_parent)); + __ptrace_link(child, new_parent, __task_cred(new_parent)); rcu_read_unlock(); } @@ -383,7 +389,7 @@ static int ptrace_attach(struct task_struct *task, long request, flags |= PT_SEIZED; task->ptrace = flags; - __ptrace_link(task, current); + ptrace_link(task, current); /* SEIZE doesn't trap tracee on attach */ if (!seize) @@ -456,7 +462,7 @@ static int ptrace_traceme(void) */ if (!ret && !(current->real_parent->flags & PF_EXITING)) { current->ptrace = PT_PTRACED; - __ptrace_link(current, current->real_parent); + ptrace_link(current, current->real_parent); } } write_unlock_irq(&tasklist_lock); From 0e479742e8d195b57d63a5acbb53df04929d603c Mon Sep 17 00:00:00 2001 From: Gilad Ben-Yossef Date: Thu, 18 May 2017 16:29:23 +0300 Subject: [PATCH 133/235] crypto: asymmetric_keys - handle EBUSY due to backlog correctly commit e68368aed56324e2e38d4f6b044bb8cf82077fc2 upstream. public_key_verify_signature() was passing the CRYPTO_TFM_REQ_MAY_BACKLOG flag to akcipher_request_set_callback() but was not handling correctly the case where a -EBUSY error could be returned from the call to crypto_akcipher_verify() if backlog was used, possibly casuing data corruption due to use-after-free of buffers. Resolve this by handling -EBUSY correctly. Signed-off-by: Gilad Ben-Yossef Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- crypto/asymmetric_keys/public_key.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/asymmetric_keys/public_key.c b/crypto/asymmetric_keys/public_key.c index fd76b5fc3b3a..4955eb66e361 100644 --- a/crypto/asymmetric_keys/public_key.c +++ b/crypto/asymmetric_keys/public_key.c @@ -140,7 +140,7 @@ int public_key_verify_signature(const struct public_key *pkey, * signature and returns that to us. */ ret = crypto_akcipher_verify(req); - if (ret == -EINPROGRESS) { + if ((ret == -EINPROGRESS) || (ret == -EBUSY)) { wait_for_completion(&compl.completion); ret = compl.err; } From 1b253e023f8f75b109564a61d2050d818f75b4f3 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 8 Jun 2017 14:48:40 +0100 Subject: [PATCH 134/235] KEYS: fix dereferencing NULL payload with nonzero length commit 5649645d725c73df4302428ee4e02c869248b4c5 upstream. sys_add_key() and the KEYCTL_UPDATE operation of sys_keyctl() allowed a NULL payload with nonzero length to be passed to the key type's ->preparse(), ->instantiate(), and/or ->update() methods. Various key types including asymmetric, cifs.idmap, cifs.spnego, and pkcs7_test did not handle this case, allowing an unprivileged user to trivially cause a NULL pointer dereference (kernel oops) if one of these key types was present. Fix it by doing the copy_from_user() when 'plen' is nonzero rather than when '_payload' is non-NULL, causing the syscall to fail with EFAULT as expected when an invalid buffer is specified. Signed-off-by: Eric Biggers Signed-off-by: David Howells Signed-off-by: James Morris Signed-off-by: Greg Kroah-Hartman --- security/keys/keyctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index dbbfd7735ce5..ada12c3e3ac4 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -97,7 +97,7 @@ SYSCALL_DEFINE5(add_key, const char __user *, _type, /* pull the payload in if one was supplied */ payload = NULL; - if (_payload) { + if (plen) { ret = -ENOMEM; payload = kmalloc(plen, GFP_KERNEL | __GFP_NOWARN); if (!payload) { @@ -327,7 +327,7 @@ long keyctl_update_key(key_serial_t id, /* pull the payload in if one was supplied */ payload = NULL; - if (_payload) { + if (plen) { ret = -ENOMEM; payload = kmalloc(plen, GFP_KERNEL); if (!payload) From 24369761029a791388cf197f4678f8ee37b2a72f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 8 Jun 2017 14:48:47 +0100 Subject: [PATCH 135/235] KEYS: fix freeing uninitialized memory in key_update() commit 63a0b0509e700717a59f049ec6e4e04e903c7fe2 upstream. key_update() freed the key_preparsed_payload even if it was not initialized first. This would cause a crash if userspace called keyctl_update() on a key with type like "asymmetric" that has a ->preparse() method but not an ->update() method. Possibly it could even be triggered for other key types by racing with keyctl_setperm() to make the KEY_NEED_WRITE check fail (the permission was already checked, so normally it wouldn't fail there). Reproducer with key type "asymmetric", given a valid cert.der: keyctl new_session keyid=$(keyctl padd asymmetric desc @s < cert.der) keyctl setperm $keyid 0x3f000000 keyctl update $keyid data [ 150.686666] BUG: unable to handle kernel NULL pointer dereference at 0000000000000001 [ 150.687601] IP: asymmetric_key_free_kids+0x12/0x30 [ 150.688139] PGD 38a3d067 [ 150.688141] PUD 3b3de067 [ 150.688447] PMD 0 [ 150.688745] [ 150.689160] Oops: 0000 [#1] SMP [ 150.689455] Modules linked in: [ 150.689769] CPU: 1 PID: 2478 Comm: keyctl Not tainted 4.11.0-rc4-xfstests-00187-ga9f6b6b8cd2f #742 [ 150.690916] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-20170228_101828-anatol 04/01/2014 [ 150.692199] task: ffff88003b30c480 task.stack: ffffc90000350000 [ 150.692952] RIP: 0010:asymmetric_key_free_kids+0x12/0x30 [ 150.693556] RSP: 0018:ffffc90000353e58 EFLAGS: 00010202 [ 150.694142] RAX: 0000000000000000 RBX: 0000000000000001 RCX: 0000000000000004 [ 150.694845] RDX: ffffffff81ee3920 RSI: ffff88003d4b0700 RDI: 0000000000000001 [ 150.697569] RBP: ffffc90000353e60 R08: ffff88003d5d2140 R09: 0000000000000000 [ 150.702483] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000001 [ 150.707393] R13: 0000000000000004 R14: ffff880038a4d2d8 R15: 000000000040411f [ 150.709720] FS: 00007fcbcee35700(0000) GS:ffff88003fd00000(0000) knlGS:0000000000000000 [ 150.711504] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 150.712733] CR2: 0000000000000001 CR3: 0000000039eab000 CR4: 00000000003406e0 [ 150.714487] Call Trace: [ 150.714975] asymmetric_key_free_preparse+0x2f/0x40 [ 150.715907] key_update+0xf7/0x140 [ 150.716560] ? key_default_cmp+0x20/0x20 [ 150.717319] keyctl_update_key+0xb0/0xe0 [ 150.718066] SyS_keyctl+0x109/0x130 [ 150.718663] entry_SYSCALL_64_fastpath+0x1f/0xc2 [ 150.719440] RIP: 0033:0x7fcbce75ff19 [ 150.719926] RSP: 002b:00007ffd5d167088 EFLAGS: 00000206 ORIG_RAX: 00000000000000fa [ 150.720918] RAX: ffffffffffffffda RBX: 0000000000404d80 RCX: 00007fcbce75ff19 [ 150.721874] RDX: 00007ffd5d16785e RSI: 000000002866cd36 RDI: 0000000000000002 [ 150.722827] RBP: 0000000000000006 R08: 000000002866cd36 R09: 00007ffd5d16785e [ 150.723781] R10: 0000000000000004 R11: 0000000000000206 R12: 0000000000404d80 [ 150.724650] R13: 00007ffd5d16784d R14: 00007ffd5d167238 R15: 000000000040411f [ 150.725447] Code: 83 c4 08 31 c0 5b 41 5c 41 5d 41 5e 41 5f 5d c3 66 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 85 ff 74 23 55 48 89 e5 53 48 89 fb <48> 8b 3f e8 06 21 c5 ff 48 8b 7b 08 e8 fd 20 c5 ff 48 89 df e8 [ 150.727489] RIP: asymmetric_key_free_kids+0x12/0x30 RSP: ffffc90000353e58 [ 150.728117] CR2: 0000000000000001 [ 150.728430] ---[ end trace f7f8fe1da2d5ae8d ]--- Fixes: 4d8c0250b841 ("KEYS: Call ->free_preparse() even after ->preparse() returns an error") Signed-off-by: Eric Biggers Signed-off-by: David Howells Signed-off-by: James Morris Signed-off-by: Greg Kroah-Hartman --- security/keys/key.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/security/keys/key.c b/security/keys/key.c index 346fbf201c22..2f4ce35ae2aa 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -962,12 +962,11 @@ int key_update(key_ref_t key_ref, const void *payload, size_t plen) /* the key must be writable */ ret = key_permission(key_ref, KEY_NEED_WRITE); if (ret < 0) - goto error; + return ret; /* attempt to update it if supported */ - ret = -EOPNOTSUPP; if (!key->type->update) - goto error; + return -EOPNOTSUPP; memset(&prep, 0, sizeof(prep)); prep.data = payload; From d24c1c1977d80dcedf4e469be62f71644bf3383e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 8 Jun 2017 14:48:10 +0100 Subject: [PATCH 136/235] KEYS: encrypted: avoid encrypting/decrypting stack buffers commit e9ff56ac352446f55141aaef1553cee662b2e310 upstream. Since v4.9, the crypto API cannot (normally) be used to encrypt/decrypt stack buffers because the stack may be virtually mapped. Fix this for the padding buffers in encrypted-keys by using ZERO_PAGE for the encryption padding and by allocating a temporary heap buffer for the decryption padding. Tested with CONFIG_DEBUG_SG=y: keyctl new_session keyctl add user master "abcdefghijklmnop" @s keyid=$(keyctl add encrypted desc "new user:master 25" @s) datablob="$(keyctl pipe $keyid)" keyctl unlink $keyid keyid=$(keyctl add encrypted desc "load $datablob" @s) datablob2="$(keyctl pipe $keyid)" [ "$datablob" = "$datablob2" ] && echo "Success!" Cc: Andy Lutomirski Cc: Herbert Xu Cc: Mimi Zohar Signed-off-by: Eric Biggers Signed-off-by: David Howells Signed-off-by: James Morris Signed-off-by: Greg Kroah-Hartman --- security/keys/encrypted-keys/encrypted.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/security/keys/encrypted-keys/encrypted.c b/security/keys/encrypted-keys/encrypted.c index 17a06105ccb6..56c458dd16a2 100644 --- a/security/keys/encrypted-keys/encrypted.c +++ b/security/keys/encrypted-keys/encrypted.c @@ -480,12 +480,9 @@ static int derived_key_encrypt(struct encrypted_key_payload *epayload, struct skcipher_request *req; unsigned int encrypted_datalen; u8 iv[AES_BLOCK_SIZE]; - unsigned int padlen; - char pad[16]; int ret; encrypted_datalen = roundup(epayload->decrypted_datalen, blksize); - padlen = encrypted_datalen - epayload->decrypted_datalen; req = init_skcipher_req(derived_key, derived_keylen); ret = PTR_ERR(req); @@ -493,11 +490,10 @@ static int derived_key_encrypt(struct encrypted_key_payload *epayload, goto out; dump_decrypted_data(epayload); - memset(pad, 0, sizeof pad); sg_init_table(sg_in, 2); sg_set_buf(&sg_in[0], epayload->decrypted_data, epayload->decrypted_datalen); - sg_set_buf(&sg_in[1], pad, padlen); + sg_set_page(&sg_in[1], ZERO_PAGE(0), AES_BLOCK_SIZE, 0); sg_init_table(sg_out, 1); sg_set_buf(sg_out, epayload->encrypted_data, encrypted_datalen); @@ -584,9 +580,14 @@ static int derived_key_decrypt(struct encrypted_key_payload *epayload, struct skcipher_request *req; unsigned int encrypted_datalen; u8 iv[AES_BLOCK_SIZE]; - char pad[16]; + u8 *pad; int ret; + /* Throwaway buffer to hold the unused zero padding at the end */ + pad = kmalloc(AES_BLOCK_SIZE, GFP_KERNEL); + if (!pad) + return -ENOMEM; + encrypted_datalen = roundup(epayload->decrypted_datalen, blksize); req = init_skcipher_req(derived_key, derived_keylen); ret = PTR_ERR(req); @@ -594,13 +595,12 @@ static int derived_key_decrypt(struct encrypted_key_payload *epayload, goto out; dump_encrypted_data(epayload, encrypted_datalen); - memset(pad, 0, sizeof pad); sg_init_table(sg_in, 1); sg_init_table(sg_out, 2); sg_set_buf(sg_in, epayload->encrypted_data, encrypted_datalen); sg_set_buf(&sg_out[0], epayload->decrypted_data, epayload->decrypted_datalen); - sg_set_buf(&sg_out[1], pad, sizeof pad); + sg_set_buf(&sg_out[1], pad, AES_BLOCK_SIZE); memcpy(iv, epayload->iv, sizeof(iv)); skcipher_request_set_crypt(req, sg_in, sg_out, encrypted_datalen, iv); @@ -612,6 +612,7 @@ static int derived_key_decrypt(struct encrypted_key_payload *epayload, goto out; dump_decrypted_data(epayload); out: + kfree(pad); return ret; } From 2d0280070e6c3f0028ca462b59db5c0a45d36299 Mon Sep 17 00:00:00 2001 From: Gilad Ben-Yossef Date: Thu, 18 May 2017 16:29:24 +0300 Subject: [PATCH 137/235] crypto: drbg - wait for crypto op not signal safe commit a5dfefb1c3f3db81662556393fd9283511e08430 upstream. drbg_kcapi_sym_ctr() was using wait_for_completion_interruptible() to wait for completion of async crypto op but if a signal occurs it may return before DMA ops of HW crypto provider finish, thus corrupting the output buffer. Resolve this by using wait_for_completion() instead. Reported-by: Eric Biggers Signed-off-by: Gilad Ben-Yossef Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- crypto/drbg.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/crypto/drbg.c b/crypto/drbg.c index 053035b5c8f8..123d211efa12 100644 --- a/crypto/drbg.c +++ b/crypto/drbg.c @@ -1768,9 +1768,8 @@ static int drbg_kcapi_sym_ctr(struct drbg_state *drbg, break; case -EINPROGRESS: case -EBUSY: - ret = wait_for_completion_interruptible( - &drbg->ctr_completion); - if (!ret && !drbg->ctr_async_err) { + wait_for_completion(&drbg->ctr_completion); + if (!drbg->ctr_async_err) { reinit_completion(&drbg->ctr_completion); break; } From d4783eb9f0828cfc439e7deeaaa19b08b7665932 Mon Sep 17 00:00:00 2001 From: Gilad Ben-Yossef Date: Thu, 18 May 2017 16:29:25 +0300 Subject: [PATCH 138/235] crypto: gcm - wait for crypto op not signal safe commit f3ad587070d6bd961ab942b3fd7a85d00dfc934b upstream. crypto_gcm_setkey() was using wait_for_completion_interruptible() to wait for completion of async crypto op but if a signal occurs it may return before DMA ops of HW crypto provider finish, thus corrupting the data buffer that is kfree'ed in this case. Resolve this by using wait_for_completion() instead. Reported-by: Eric Biggers Signed-off-by: Gilad Ben-Yossef Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- crypto/gcm.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/crypto/gcm.c b/crypto/gcm.c index f624ac98c94e..dd33fbd2d868 100644 --- a/crypto/gcm.c +++ b/crypto/gcm.c @@ -152,10 +152,8 @@ static int crypto_gcm_setkey(struct crypto_aead *aead, const u8 *key, err = crypto_skcipher_encrypt(&data->req); if (err == -EINPROGRESS || err == -EBUSY) { - err = wait_for_completion_interruptible( - &data->result.completion); - if (!err) - err = data->result.err; + wait_for_completion(&data->result.completion); + err = data->result.err; } if (err) From 34bae9b3ba98fd4c4a2658d418936452e82f3a24 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 11 May 2017 13:10:02 -0400 Subject: [PATCH 139/235] drm/amdgpu/ci: disable mclk switching for high refresh rates (v2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 0a646f331db0eb9efc8d3a95a44872036d441d58 upstream. Even if the vblank period would allow it, it still seems to be problematic on some cards. v2: fix logic inversion (Nils) bug: https://bugs.freedesktop.org/show_bug.cgi?id=96868 Acked-by: Christian König Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/amdgpu/ci_dpm.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/ci_dpm.c b/drivers/gpu/drm/amd/amdgpu/ci_dpm.c index 5be788b269e2..1679727c22ef 100644 --- a/drivers/gpu/drm/amd/amdgpu/ci_dpm.c +++ b/drivers/gpu/drm/amd/amdgpu/ci_dpm.c @@ -900,6 +900,12 @@ static bool ci_dpm_vblank_too_short(struct amdgpu_device *adev) u32 vblank_time = amdgpu_dpm_get_vblank_time(adev); u32 switch_limit = adev->mc.vram_type == AMDGPU_VRAM_TYPE_GDDR5 ? 450 : 300; + /* disable mclk switching if the refresh is >120Hz, even if the + * blanking period would allow it + */ + if (amdgpu_dpm_get_vrefresh(adev) > 120) + return true; + if (vblank_time < switch_limit) return true; else From bfeac838043ff964dda57c1522b502860513ca01 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 23 May 2017 12:24:40 -0400 Subject: [PATCH 140/235] nfsd4: fix null dereference on replay commit 9a307403d374b993061f5992a6e260c944920d0b upstream. if we receive a compound such that: - the sessionid, slot, and sequence number in the SEQUENCE op match a cached succesful reply with N ops, and - the Nth operation of the compound is a PUTFH, PUTPUBFH, PUTROOTFH, or RESTOREFH, then nfsd4_sequence will return 0 and set cstate->status to nfserr_replay_cache. The current filehandle will not be set. This will cause us to call check_nfsd_access with first argument NULL. To nfsd4_compound it looks like we just succesfully executed an operation that set a filehandle, but the current filehandle is not set. Fix this by moving the nfserr_replay_cache earlier. There was never any reason to have it after the encode_op label, since the only case where he hit that is when opdesc->op_func sets it. Note that there are two ways we could hit this case: - a client is resending a previously sent compound that ended with one of the four PUTFH-like operations, or - a client is sending a *new* compound that (incorrectly) shares sessionid, slot, and sequence number with a previously sent compound, and the length of the previously sent compound happens to match the position of a PUTFH-like operation in the new compound. The second is obviously incorrect client behavior. The first is also very strange--the only purpose of a PUTFH-like operation is to set the current filehandle to be used by the following operation, so there's no point in having it as the last in a compound. So it's likely this requires a buggy or malicious client to reproduce. Reported-by: Scott Mayhew Signed-off-by: J. Bruce Fields Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4proc.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 650226f33298..022d95886d66 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1783,6 +1783,12 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, opdesc->op_get_currentstateid(cstate, &op->u); op->status = opdesc->op_func(rqstp, cstate, &op->u); + /* Only from SEQUENCE */ + if (cstate->status == nfserr_replay_cache) { + dprintk("%s NFS4.1 replay from cache\n", __func__); + status = op->status; + goto out; + } if (!op->status) { if (opdesc->op_set_currentstateid) opdesc->op_set_currentstateid(cstate, &op->u); @@ -1793,14 +1799,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, if (need_wrongsec_check(rqstp)) op->status = check_nfsd_access(current_fh->fh_export, rqstp); } - encode_op: - /* Only from SEQUENCE */ - if (cstate->status == nfserr_replay_cache) { - dprintk("%s NFS4.1 replay from cache\n", __func__); - status = op->status; - goto out; - } if (op->status == nfserr_replay_me) { op->replay = &cstate->replay_owner->so_replay; nfsd4_encode_replay(&resp->xdr, op); From e273ed246617a178851f1e761dbef1716f1b34da Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 9 May 2017 16:24:59 -0400 Subject: [PATCH 141/235] nfsd: Fix up the "supattr_exclcreat" attributes commit b26b78cb726007533d81fdf90a62e915002ef5c8 upstream. If an NFSv4 client asks us for the supattr_exclcreat, then we must not return attributes that are unsupported by this minor version. Signed-off-by: Trond Myklebust Fixes: 75976de6556f ("NFSD: Return word2 bitmask if setting security..,") Signed-off-by: J. Bruce Fields Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4xdr.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 2ee80e1f5230..4e7a56a0a9b6 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2793,9 +2793,16 @@ out_acl: } #endif /* CONFIG_NFSD_PNFS */ if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { - status = nfsd4_encode_bitmap(xdr, NFSD_SUPPATTR_EXCLCREAT_WORD0, - NFSD_SUPPATTR_EXCLCREAT_WORD1, - NFSD_SUPPATTR_EXCLCREAT_WORD2); + u32 supp[3]; + + supp[0] = nfsd_suppattrs0(minorversion); + supp[1] = nfsd_suppattrs1(minorversion); + supp[2] = nfsd_suppattrs2(minorversion); + supp[0] &= NFSD_SUPPATTR_EXCLCREAT_WORD0; + supp[1] &= NFSD_SUPPATTR_EXCLCREAT_WORD1; + supp[2] &= NFSD_SUPPATTR_EXCLCREAT_WORD2; + + status = nfsd4_encode_bitmap(xdr, supp[0], supp[1], supp[2]); if (status) goto out; } From 4809f0e56d98a64dce68b98a403b51d384fce617 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 26 May 2017 12:36:47 +0100 Subject: [PATCH 142/235] efi: Don't issue error message when booted under Xen commit 1ea34adb87c969b89dfd83f1905a79161e9ada26 upstream. When booted as Xen dom0 there won't be an EFI memmap allocated. Avoid issuing an error message in this case: [ 0.144079] efi: Failed to allocate new EFI memmap Signed-off-by: Juergen Gross Signed-off-by: Matt Fleming Cc: Ard Biesheuvel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/20170526113652.21339-2-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/platform/efi/quirks.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index cdfe8c628959..393a0c0288d1 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -358,6 +358,9 @@ void __init efi_free_boot_services(void) free_bootmem_late(start, size); } + if (!num_entries) + return; + new_size = efi.memmap.desc_size * num_entries; new_phys = efi_memmap_alloc(num_entries); if (!new_phys) { From 78f87ce2a17f0aa625451835ffaceb6b607b4a9a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 26 Apr 2017 16:56:26 +0200 Subject: [PATCH 143/235] kvm: async_pf: fix rcu_irq_enter() with irqs enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit bbaf0e2b1c1b4f88abd6ef49576f0efb1734eae5 upstream. native_safe_halt enables interrupts, and you just shouldn't call rcu_irq_enter() with interrupts enabled. Reorder the call with the following local_irq_disable() to respect the invariant. Reported-by: Ross Zwisler Signed-off-by: Paolo Bonzini Acked-by: Paul E. McKenney Tested-by: Wanpeng Li Signed-off-by: Radim Krčmář Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/kvm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index edbbfc854e39..9cf697ceedbf 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -162,8 +162,8 @@ void kvm_async_pf_task_wait(u32 token) */ rcu_irq_exit(); native_safe_halt(); - rcu_irq_enter(); local_irq_disable(); + rcu_irq_enter(); } } if (!n.halted) From 19c9a115085edc6e2f1ee8e7c30600ba2fa932f4 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 8 Jun 2017 01:22:07 -0700 Subject: [PATCH 144/235] KVM: cpuid: Fix read/write out-of-bounds vulnerability in cpuid emulation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit a3641631d14571242eec0d30c9faa786cbf52d44 upstream. If "i" is the last element in the vcpu->arch.cpuid_entries[] array, it potentially can be exploited the vulnerability. this will out-of-bounds read and write. Luckily, the effect is small: /* when no next entry is found, the current entry[i] is reselected */ for (j = i + 1; ; j = (j + 1) % nent) { struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; if (ej->function == e->function) { It reads ej->maxphyaddr, which is user controlled. However... ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; After cpuid_entries there is int maxphyaddr; struct x86_emulate_ctxt emulate_ctxt; /* 16-byte aligned */ So we have: - cpuid_entries at offset 1B50 (6992) - maxphyaddr at offset 27D0 (6992 + 3200 = 10192) - padding at 27D4...27DF - emulate_ctxt at 27E0 And it writes in the padding. Pfew, writing the ops field of emulate_ctxt would have been much worse. This patch fixes it by modding the index to avoid the out-of-bounds access. Worst case, i == j and ej->function == e->function, the loop can bail out. Reported-by: Moguofang Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Guofang Mo Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/cpuid.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 967e459ff1e6..649d8f2c1e40 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -765,18 +765,20 @@ out: static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) { struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; - int j, nent = vcpu->arch.cpuid_nent; + struct kvm_cpuid_entry2 *ej; + int j = i; + int nent = vcpu->arch.cpuid_nent; e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; /* when no next entry is found, the current entry[i] is reselected */ - for (j = i + 1; ; j = (j + 1) % nent) { - struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; - if (ej->function == e->function) { - ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; - return j; - } - } - return 0; /* silence gcc, even though control never reaches here */ + do { + j = (j + 1) % nent; + ej = &vcpu->arch.cpuid_entries[j]; + } while (ej->function != e->function); + + ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; + + return j; } /* find an entry with matching function, matching index (if needed), and that From b9824dd75fcf6d9f10e8df08b237948d1802d5fe Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 6 Jun 2017 19:08:33 +0100 Subject: [PATCH 145/235] arm64: KVM: Preserve RES1 bits in SCTLR_EL2 commit d68c1f7fd1b7148dab5fe658321d511998969f2d upstream. __do_hyp_init has the rather bad habit of ignoring RES1 bits and writing them back as zero. On a v8.0-8.2 CPU, this doesn't do anything bad, but may end-up being pretty nasty on future revisions of the architecture. Let's preserve those bits so that we don't have to fix this later on. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/sysreg.h | 4 ++++ arch/arm64/kvm/hyp-init.S | 10 ++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 6c80b3699cb8..7393cc767edb 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -94,6 +94,10 @@ #define SCTLR_ELx_A (1 << 1) #define SCTLR_ELx_M 1 +#define SCTLR_EL2_RES1 ((1 << 4) | (1 << 5) | (1 << 11) | (1 << 16) | \ + (1 << 16) | (1 << 18) | (1 << 22) | (1 << 23) | \ + (1 << 28) | (1 << 29)) + #define SCTLR_ELx_FLAGS (SCTLR_ELx_M | SCTLR_ELx_A | SCTLR_ELx_C | \ SCTLR_ELx_SA | SCTLR_ELx_I) diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S index 6b29d3d9e1f2..4e460d973560 100644 --- a/arch/arm64/kvm/hyp-init.S +++ b/arch/arm64/kvm/hyp-init.S @@ -102,10 +102,12 @@ __do_hyp_init: tlbi alle2 dsb sy - mrs x4, sctlr_el2 - and x4, x4, #SCTLR_ELx_EE // preserve endianness of EL2 - ldr x5, =SCTLR_ELx_FLAGS - orr x4, x4, x5 + /* + * Preserve all the RES1 bits while setting the default flags, + * as well as the EE bit on BE. + */ + ldr x4, =(SCTLR_EL2_RES1 | SCTLR_ELx_FLAGS) +CPU_BE( orr x4, x4, #SCTLR_ELx_EE) msr sctlr_el2, x4 isb From 8abce1e49c82599e6c8ced2d3276247b06148526 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 6 Jun 2017 19:08:34 +0100 Subject: [PATCH 146/235] arm64: KVM: Allow unaligned accesses at EL2 commit 78fd6dcf11468a5a131b8365580d0c613bcc02cb upstream. We currently have the SCTLR_EL2.A bit set, trapping unaligned accesses at EL2, but we're not really prepared to deal with it. So far, this has been unnoticed, until GCC 7 started emitting those (in particular 64bit writes on a 32bit boundary). Since the rest of the kernel is pretty happy about that, let's follow its example and set SCTLR_EL2.A to zero. Modern CPUs don't really care. Reported-by: Alexander Graf Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall Signed-off-by: Greg Kroah-Hartman --- arch/arm64/kvm/hyp-init.S | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S index 4e460d973560..4bbff904169d 100644 --- a/arch/arm64/kvm/hyp-init.S +++ b/arch/arm64/kvm/hyp-init.S @@ -104,9 +104,10 @@ __do_hyp_init: /* * Preserve all the RES1 bits while setting the default flags, - * as well as the EE bit on BE. + * as well as the EE bit on BE. Drop the A flag since the compiler + * is allowed to generate unaligned accesses. */ - ldr x4, =(SCTLR_EL2_RES1 | SCTLR_ELx_FLAGS) + ldr x4, =(SCTLR_EL2_RES1 | (SCTLR_ELx_FLAGS & ~SCTLR_ELx_A)) CPU_BE( orr x4, x4, #SCTLR_ELx_EE) msr sctlr_el2, x4 isb From 85c19308cb374be14fcd24a8c317d5d48f325f24 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 6 Jun 2017 19:08:35 +0100 Subject: [PATCH 147/235] arm: KVM: Allow unaligned accesses at HYP commit 33b5c38852b29736f3b472dd095c9a18ec22746f upstream. We currently have the HSCTLR.A bit set, trapping unaligned accesses at HYP, but we're not really prepared to deal with it. Since the rest of the kernel is pretty happy about that, let's follow its example and set HSCTLR.A to zero. Modern CPUs don't really care. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall Signed-off-by: Greg Kroah-Hartman --- arch/arm/kvm/init.S | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/arm/kvm/init.S b/arch/arm/kvm/init.S index bf89c919efc1..bd0ee7fc304c 100644 --- a/arch/arm/kvm/init.S +++ b/arch/arm/kvm/init.S @@ -95,7 +95,6 @@ __do_hyp_init: @ - Write permission implies XN: disabled @ - Instruction cache: enabled @ - Data/Unified cache: enabled - @ - Memory alignment checks: enabled @ - MMU: enabled (this code must be run from an identity mapping) mrc p15, 4, r0, c1, c0, 0 @ HSCR ldr r2, =HSCTLR_MASK @@ -103,8 +102,8 @@ __do_hyp_init: mrc p15, 0, r1, c1, c0, 0 @ SCTLR ldr r2, =(HSCTLR_EE | HSCTLR_FI | HSCTLR_I | HSCTLR_C) and r1, r1, r2 - ARM( ldr r2, =(HSCTLR_M | HSCTLR_A) ) - THUMB( ldr r2, =(HSCTLR_M | HSCTLR_A | HSCTLR_TE) ) + ARM( ldr r2, =(HSCTLR_M) ) + THUMB( ldr r2, =(HSCTLR_M | HSCTLR_TE) ) orr r1, r1, r2 orr r0, r0, r1 mcr p15, 4, r0, c1, c0, 0 @ HSCR From 81555e45852a69389a71167e913722714c91fd3f Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 8 Jun 2017 20:13:40 -0700 Subject: [PATCH 148/235] KVM: async_pf: avoid async pf injection when in guest mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 9bc1f09f6fa76fdf31eb7d6a4a4df43574725f93 upstream. INFO: task gnome-terminal-:1734 blocked for more than 120 seconds. Not tainted 4.12.0-rc4+ #8 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. gnome-terminal- D 0 1734 1015 0x00000000 Call Trace: __schedule+0x3cd/0xb30 schedule+0x40/0x90 kvm_async_pf_task_wait+0x1cc/0x270 ? __vfs_read+0x37/0x150 ? prepare_to_swait+0x22/0x70 do_async_page_fault+0x77/0xb0 ? do_async_page_fault+0x77/0xb0 async_page_fault+0x28/0x30 This is triggered by running both win7 and win2016 on L1 KVM simultaneously, and then gives stress to memory on L1, I can observed this hang on L1 when at least ~70% swap area is occupied on L0. This is due to async pf was injected to L2 which should be injected to L1, L2 guest starts receiving pagefault w/ bogus %cr2(apf token from the host actually), and L1 guest starts accumulating tasks stuck in D state in kvm_async_pf_task_wait() since missing PAGE_READY async_pfs. This patch fixes the hang by doing async pf when executing L1 guest. Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/mmu.c | 7 +++++-- arch/x86/kvm/mmu.h | 1 + arch/x86/kvm/x86.c | 3 +-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d9c7e986b4e4..5f2412704b81 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3489,12 +3489,15 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); } -static bool can_do_async_pf(struct kvm_vcpu *vcpu) +bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu) { if (unlikely(!lapic_in_kernel(vcpu) || kvm_event_needs_reinjection(vcpu))) return false; + if (is_guest_mode(vcpu)) + return false; + return kvm_x86_ops->interrupt_allowed(vcpu); } @@ -3510,7 +3513,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, if (!async) return false; /* *pfn has correct page already */ - if (!prefault && can_do_async_pf(vcpu)) { + if (!prefault && kvm_can_do_async_pf(vcpu)) { trace_kvm_try_async_get_page(gva, gfn); if (kvm_find_async_pf_gfn(vcpu, gfn)) { trace_kvm_async_pf_doublefault(gva, gfn); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index ddc56e91f2e4..c92834c55c59 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -75,6 +75,7 @@ enum { int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct); void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly); +bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 81bba3c2137d..62cde4f67c72 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8444,8 +8444,7 @@ bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED)) return true; else - return !kvm_event_needs_reinjection(vcpu) && - kvm_x86_ops->interrupt_allowed(vcpu); + return kvm_can_do_async_pf(vcpu); } void kvm_arch_start_assignment(struct kvm *kvm) From 2a5c08a4d3fef6bc645c2ae0607f00672e367f75 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 2 May 2017 14:30:40 +0100 Subject: [PATCH 149/235] KVM: arm/arm64: vgic-v3: Do not use Active+Pending state for a HW interrupt commit 3d6e77ad1489650afa20da92bb589c8778baa8da upstream. When an interrupt is injected with the HW bit set (indicating that deactivation should be propagated to the physical distributor), special care must be taken so that we never mark the corresponding LR with the Active+Pending state (as the pending state is kept in the physycal distributor). Fixes: 59529f69f504 ("KVM: arm/arm64: vgic-new: Add GICv3 world switch backend") Signed-off-by: Marc Zyngier Reviewed-by: Christoffer Dall Signed-off-by: Christoffer Dall Signed-off-by: Greg Kroah-Hartman --- virt/kvm/arm/vgic/vgic-v3.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c index e6b03fd8c374..f1320063db28 100644 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ b/virt/kvm/arm/vgic/vgic-v3.c @@ -151,6 +151,13 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) if (irq->hw) { val |= ICH_LR_HW; val |= ((u64)irq->hwintid) << ICH_LR_PHYS_ID_SHIFT; + /* + * Never set pending+active on a HW interrupt, as the + * pending state is kept at the physical distributor + * level. + */ + if (irq->active && irq->pending) + val &= ~ICH_LR_PENDING_BIT; } else { if (irq->config == VGIC_CONFIG_LEVEL) val |= ICH_LR_EOI; From 3e7a76b290f12668e5a57d2f7796f789d4317bc3 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 2 May 2017 14:30:39 +0100 Subject: [PATCH 150/235] KVM: arm/arm64: vgic-v2: Do not use Active+Pending state for a HW interrupt commit ddf42d068f8802de122bb7efdfcb3179336053f1 upstream. When an interrupt is injected with the HW bit set (indicating that deactivation should be propagated to the physical distributor), special care must be taken so that we never mark the corresponding LR with the Active+Pending state (as the pending state is kept in the physycal distributor). Cc: stable@vger.kernel.org Fixes: 140b086dd197 ("KVM: arm/arm64: vgic-new: Add GICv2 world switch backend") Signed-off-by: Marc Zyngier Reviewed-by: Christoffer Dall Signed-off-by: Christoffer Dall Signed-off-by: Greg Kroah-Hartman --- virt/kvm/arm/vgic/vgic-v2.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c index 834137e7b83f..1ab58f7b5d74 100644 --- a/virt/kvm/arm/vgic/vgic-v2.c +++ b/virt/kvm/arm/vgic/vgic-v2.c @@ -168,6 +168,13 @@ void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) if (irq->hw) { val |= GICH_LR_HW; val |= irq->hwintid << GICH_LR_PHYSID_CPUID_SHIFT; + /* + * Never set pending+active on a HW interrupt, as the + * pending state is kept at the physical distributor + * level. + */ + if (irq->active && irq->pending) + val &= ~GICH_LR_PENDING_BIT; } else { if (irq->config == VGIC_CONFIG_LEVEL) val |= GICH_LR_EOI; From cd0ef520aa705475184b4639d4b7c14e36b520bb Mon Sep 17 00:00:00 2001 From: Hiroyuki Yokoyama Date: Mon, 15 May 2017 17:49:52 +0900 Subject: [PATCH 151/235] dmaengine: usb-dmac: Fix DMAOR AE bit definition commit 9a445bbb1607d9f14556a532453dd86d1b7e381e upstream. This patch fixes the register definition of AE (Address Error flag) bit. Fixes: 0c1c8ff32fa2 ("dmaengine: usb-dmac: Add Renesas USB DMA Controller (USB-DMAC) driver") Signed-off-by: Hiroyuki Yokoyama [Shimoda: add Fixes and Cc tags in the commit log] Signed-off-by: Yoshihiro Shimoda Reviewed-by: Geert Uytterhoeven Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/sh/usb-dmac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/sh/usb-dmac.c b/drivers/dma/sh/usb-dmac.c index 06ecdc38cee0..6682b3eec2b6 100644 --- a/drivers/dma/sh/usb-dmac.c +++ b/drivers/dma/sh/usb-dmac.c @@ -117,7 +117,7 @@ struct usb_dmac { #define USB_DMASWR 0x0008 #define USB_DMASWR_SWR (1 << 0) #define USB_DMAOR 0x0060 -#define USB_DMAOR_AE (1 << 2) +#define USB_DMAOR_AE (1 << 1) #define USB_DMAOR_DME (1 << 0) #define USB_DMASAR 0x0000 From b7e7a4d52a95976093edea6e13bd1c09ef86665d Mon Sep 17 00:00:00 2001 From: Alexander Sverdlin Date: Mon, 22 May 2017 16:05:22 +0200 Subject: [PATCH 152/235] dmaengine: ep93xx: Always start from BASE0 commit 0037ae47812b1f431cc602100d1d51f37d77b61e upstream. The current buffer is being reset to zero on device_free_chan_resources() but not on device_terminate_all(). It could happen that HW is restarted and expects BASE0 to be used, but the driver is not synchronized and will start from BASE1. One solution is to reset the buffer explicitly in m2p_hw_setup(). Signed-off-by: Alexander Sverdlin Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/ep93xx_dma.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/dma/ep93xx_dma.c b/drivers/dma/ep93xx_dma.c index d37e8dda8079..deb009c3121f 100644 --- a/drivers/dma/ep93xx_dma.c +++ b/drivers/dma/ep93xx_dma.c @@ -323,6 +323,8 @@ static int m2p_hw_setup(struct ep93xx_dma_chan *edmac) | M2P_CONTROL_ENABLE; m2p_set_control(edmac, control); + edmac->buffer = 0; + return 0; } From f2e9d10bf1a21a982ee179bab798e3801eb410af Mon Sep 17 00:00:00 2001 From: Alexander Sverdlin Date: Mon, 22 May 2017 16:05:23 +0200 Subject: [PATCH 153/235] dmaengine: ep93xx: Don't drain the transfers in terminate_all() commit 98f9de366fccee7572c646af226b2d4b4841e3b5 upstream. Draining the transfers in terminate_all callback happens with IRQs disabled, therefore induces huge latency: irqsoff latency trace v1.1.5 on 4.11.0 -------------------------------------------------------------------- latency: 39770 us, #57/57, CPU#0 | (M:preempt VP:0, KP:0, SP:0 HP:0) ----------------- | task: process-129 (uid:0 nice:0 policy:2 rt_prio:50) ----------------- => started at: _snd_pcm_stream_lock_irqsave => ended at: snd_pcm_stream_unlock_irqrestore _------=> CPU# / _-----=> irqs-off | / _----=> need-resched || / _---=> hardirq/softirq ||| / _--=> preempt-depth |||| / delay cmd pid ||||| time | caller \ / ||||| \ | / process-129 0d.s. 3us : _snd_pcm_stream_lock_irqsave process-129 0d.s1 9us : snd_pcm_stream_lock <-_snd_pcm_stream_lock_irqsave process-129 0d.s1 15us : preempt_count_add <-snd_pcm_stream_lock process-129 0d.s2 22us : preempt_count_add <-snd_pcm_stream_lock process-129 0d.s3 32us : snd_pcm_update_hw_ptr0 <-snd_pcm_period_elapsed process-129 0d.s3 41us : soc_pcm_pointer <-snd_pcm_update_hw_ptr0 process-129 0d.s3 50us : dmaengine_pcm_pointer <-soc_pcm_pointer process-129 0d.s3 58us+: snd_dmaengine_pcm_pointer_no_residue <-dmaengine_pcm_pointer process-129 0d.s3 96us : update_audio_tstamp <-snd_pcm_update_hw_ptr0 process-129 0d.s3 103us : snd_pcm_update_state <-snd_pcm_update_hw_ptr0 process-129 0d.s3 112us : xrun <-snd_pcm_update_state process-129 0d.s3 119us : snd_pcm_stop <-xrun process-129 0d.s3 126us : snd_pcm_action <-snd_pcm_stop process-129 0d.s3 134us : snd_pcm_action_single <-snd_pcm_action process-129 0d.s3 141us : snd_pcm_pre_stop <-snd_pcm_action_single process-129 0d.s3 150us : snd_pcm_do_stop <-snd_pcm_action_single process-129 0d.s3 157us : soc_pcm_trigger <-snd_pcm_do_stop process-129 0d.s3 166us : snd_dmaengine_pcm_trigger <-soc_pcm_trigger process-129 0d.s3 175us : ep93xx_dma_terminate_all <-snd_dmaengine_pcm_trigger process-129 0d.s3 182us : preempt_count_add <-ep93xx_dma_terminate_all process-129 0d.s4 189us*: m2p_hw_shutdown <-ep93xx_dma_terminate_all process-129 0d.s4 39472us : m2p_hw_setup <-ep93xx_dma_terminate_all ... rest skipped... process-129 0d.s. 40080us : => ep93xx_dma_tasklet => tasklet_action => __do_softirq => irq_exit => __handle_domain_irq => vic_handle_irq => __irq_usr => 0xb66c6668 Just abort the transfers and warn if the HW state is not what we expect. Move draining into device_synchronize callback. Signed-off-by: Alexander Sverdlin Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/ep93xx_dma.c | 37 +++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/drivers/dma/ep93xx_dma.c b/drivers/dma/ep93xx_dma.c index deb009c3121f..ec240592f5c8 100644 --- a/drivers/dma/ep93xx_dma.c +++ b/drivers/dma/ep93xx_dma.c @@ -201,6 +201,7 @@ struct ep93xx_dma_engine { struct dma_device dma_dev; bool m2m; int (*hw_setup)(struct ep93xx_dma_chan *); + void (*hw_synchronize)(struct ep93xx_dma_chan *); void (*hw_shutdown)(struct ep93xx_dma_chan *); void (*hw_submit)(struct ep93xx_dma_chan *); int (*hw_interrupt)(struct ep93xx_dma_chan *); @@ -333,21 +334,27 @@ static inline u32 m2p_channel_state(struct ep93xx_dma_chan *edmac) return (readl(edmac->regs + M2P_STATUS) >> 4) & 0x3; } -static void m2p_hw_shutdown(struct ep93xx_dma_chan *edmac) +static void m2p_hw_synchronize(struct ep93xx_dma_chan *edmac) { + unsigned long flags; u32 control; + spin_lock_irqsave(&edmac->lock, flags); control = readl(edmac->regs + M2P_CONTROL); control &= ~(M2P_CONTROL_STALLINT | M2P_CONTROL_NFBINT); m2p_set_control(edmac, control); + spin_unlock_irqrestore(&edmac->lock, flags); while (m2p_channel_state(edmac) >= M2P_STATE_ON) - cpu_relax(); + schedule(); +} +static void m2p_hw_shutdown(struct ep93xx_dma_chan *edmac) +{ m2p_set_control(edmac, 0); - while (m2p_channel_state(edmac) == M2P_STATE_STALL) - cpu_relax(); + while (m2p_channel_state(edmac) != M2P_STATE_IDLE) + dev_warn(chan2dev(edmac), "M2P: Not yet IDLE\n"); } static void m2p_fill_desc(struct ep93xx_dma_chan *edmac) @@ -1162,6 +1169,26 @@ fail: return NULL; } +/** + * ep93xx_dma_synchronize - Synchronizes the termination of transfers to the + * current context. + * @chan: channel + * + * Synchronizes the DMA channel termination to the current context. When this + * function returns it is guaranteed that all transfers for previously issued + * descriptors have stopped and and it is safe to free the memory associated + * with them. Furthermore it is guaranteed that all complete callback functions + * for a previously submitted descriptor have finished running and it is safe to + * free resources accessed from within the complete callbacks. + */ +static void ep93xx_dma_synchronize(struct dma_chan *chan) +{ + struct ep93xx_dma_chan *edmac = to_ep93xx_dma_chan(chan); + + if (edmac->edma->hw_synchronize) + edmac->edma->hw_synchronize(edmac); +} + /** * ep93xx_dma_terminate_all - terminate all transactions * @chan: channel @@ -1325,6 +1352,7 @@ static int __init ep93xx_dma_probe(struct platform_device *pdev) dma_dev->device_prep_slave_sg = ep93xx_dma_prep_slave_sg; dma_dev->device_prep_dma_cyclic = ep93xx_dma_prep_dma_cyclic; dma_dev->device_config = ep93xx_dma_slave_config; + dma_dev->device_synchronize = ep93xx_dma_synchronize; dma_dev->device_terminate_all = ep93xx_dma_terminate_all; dma_dev->device_issue_pending = ep93xx_dma_issue_pending; dma_dev->device_tx_status = ep93xx_dma_tx_status; @@ -1342,6 +1370,7 @@ static int __init ep93xx_dma_probe(struct platform_device *pdev) } else { dma_cap_set(DMA_PRIVATE, dma_dev->cap_mask); + edma->hw_synchronize = m2p_hw_synchronize; edma->hw_setup = m2p_hw_setup; edma->hw_shutdown = m2p_hw_shutdown; edma->hw_submit = m2p_hw_submit; From f08c84d4c7450135ccd90e7d4ce10335d998ea6f Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Fri, 5 May 2017 11:57:44 +0200 Subject: [PATCH 154/235] dmaengine: mv_xor_v2: handle mv_xor_v2_prep_sw_desc() error properly commit eb8df543e444492328f506adffc7dfe94111f1bd upstream. The mv_xor_v2_prep_sw_desc() is called from a few different places in the driver, but we never take into account the fact that it might return NULL. This commit fixes that, ensuring that we don't panic if there are no more descriptors available. Fixes: 19a340b1a820 ("dmaengine: mv_xor_v2: new driver") Signed-off-by: Thomas Petazzoni Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/mv_xor_v2.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/dma/mv_xor_v2.c b/drivers/dma/mv_xor_v2.c index a28a01fcba67..e9280207ac19 100644 --- a/drivers/dma/mv_xor_v2.c +++ b/drivers/dma/mv_xor_v2.c @@ -389,6 +389,8 @@ mv_xor_v2_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, __func__, len, &src, &dest, flags); sw_desc = mv_xor_v2_prep_sw_desc(xor_dev); + if (!sw_desc) + return NULL; sw_desc->async_tx.flags = flags; @@ -443,6 +445,8 @@ mv_xor_v2_prep_dma_xor(struct dma_chan *chan, dma_addr_t dest, dma_addr_t *src, __func__, src_cnt, len, &dest, flags); sw_desc = mv_xor_v2_prep_sw_desc(xor_dev); + if (!sw_desc) + return NULL; sw_desc->async_tx.flags = flags; @@ -491,6 +495,8 @@ mv_xor_v2_prep_dma_interrupt(struct dma_chan *chan, unsigned long flags) container_of(chan, struct mv_xor_v2_device, dmachan); sw_desc = mv_xor_v2_prep_sw_desc(xor_dev); + if (!sw_desc) + return NULL; /* set the HW descriptor */ hw_descriptor = &sw_desc->hw_desc; From 67b1684c4a5e092263b2b9fc3670450eedb620cf Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Fri, 5 May 2017 11:57:45 +0200 Subject: [PATCH 155/235] dmaengine: mv_xor_v2: properly handle wrapping in the array of HW descriptors commit 2aab4e18152cd30cb5d2f4c27629fc8a04aed979 upstream. mv_xor_v2_tasklet() is looping over completed HW descriptors. Before the loop, it initializes 'next_pending_hw_desc' to the first HW descriptor to handle, and then the loop simply increments this point, without taking care of wrapping when we reach the last HW descriptor. The 'pending_ptr' index was being wrapped back to 0 at the end, but it wasn't used in each iteration of the loop to calculate next_pending_hw_desc. This commit fixes that, and makes next_pending_hw_desc a variable local to the loop itself. Fixes: 19a340b1a820 ("dmaengine: mv_xor_v2: new driver") Signed-off-by: Thomas Petazzoni Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/mv_xor_v2.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/drivers/dma/mv_xor_v2.c b/drivers/dma/mv_xor_v2.c index e9280207ac19..bdfb36ecff81 100644 --- a/drivers/dma/mv_xor_v2.c +++ b/drivers/dma/mv_xor_v2.c @@ -560,7 +560,6 @@ static void mv_xor_v2_tasklet(unsigned long data) { struct mv_xor_v2_device *xor_dev = (struct mv_xor_v2_device *) data; int pending_ptr, num_of_pending, i; - struct mv_xor_v2_descriptor *next_pending_hw_desc = NULL; struct mv_xor_v2_sw_desc *next_pending_sw_desc = NULL; dev_dbg(xor_dev->dmadev.dev, "%s %d\n", __func__, __LINE__); @@ -568,17 +567,10 @@ static void mv_xor_v2_tasklet(unsigned long data) /* get the pending descriptors parameters */ num_of_pending = mv_xor_v2_get_pending_params(xor_dev, &pending_ptr); - /* next HW descriptor */ - next_pending_hw_desc = xor_dev->hw_desq_virt + pending_ptr; - /* loop over free descriptors */ for (i = 0; i < num_of_pending; i++) { - - if (pending_ptr > MV_XOR_V2_DESC_NUM) - pending_ptr = 0; - - if (next_pending_sw_desc != NULL) - next_pending_hw_desc++; + struct mv_xor_v2_descriptor *next_pending_hw_desc = + xor_dev->hw_desq_virt + pending_ptr; /* get the SW descriptor related to the HW descriptor */ next_pending_sw_desc = @@ -614,6 +606,8 @@ static void mv_xor_v2_tasklet(unsigned long data) /* increment the next descriptor */ pending_ptr++; + if (pending_ptr >= MV_XOR_V2_DESC_NUM) + pending_ptr = 0; } if (num_of_pending != 0) { From e2a092eab8a5b0390e48c04b5d35c85d1c8900be Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Fri, 5 May 2017 11:57:46 +0200 Subject: [PATCH 156/235] dmaengine: mv_xor_v2: do not use descriptors not acked by async_tx commit bc473da1ed726c975ad47f8d7d27631de11356d8 upstream. Descriptors that have not been acknowledged by the async_tx layer should not be re-used, so this commit adjusts the implementation of mv_xor_v2_prep_sw_desc() to skip descriptors for which async_tx_test_ack() is false. Fixes: 19a340b1a820 ("dmaengine: mv_xor_v2: new driver") Signed-off-by: Thomas Petazzoni Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/mv_xor_v2.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/drivers/dma/mv_xor_v2.c b/drivers/dma/mv_xor_v2.c index bdfb36ecff81..cb60e7c4aa16 100644 --- a/drivers/dma/mv_xor_v2.c +++ b/drivers/dma/mv_xor_v2.c @@ -344,6 +344,7 @@ static struct mv_xor_v2_sw_desc * mv_xor_v2_prep_sw_desc(struct mv_xor_v2_device *xor_dev) { struct mv_xor_v2_sw_desc *sw_desc; + bool found = false; /* Lock the channel */ spin_lock_bh(&xor_dev->lock); @@ -355,19 +356,23 @@ mv_xor_v2_prep_sw_desc(struct mv_xor_v2_device *xor_dev) return NULL; } - /* get a free SW descriptor from the SW DESQ */ - sw_desc = list_first_entry(&xor_dev->free_sw_desc, - struct mv_xor_v2_sw_desc, free_list); + list_for_each_entry(sw_desc, &xor_dev->free_sw_desc, free_list) { + if (async_tx_test_ack(&sw_desc->async_tx)) { + found = true; + break; + } + } + + if (!found) { + spin_unlock_bh(&xor_dev->lock); + return NULL; + } + list_del(&sw_desc->free_list); /* Release the channel */ spin_unlock_bh(&xor_dev->lock); - /* set the async tx descriptor */ - dma_async_tx_descriptor_init(&sw_desc->async_tx, &xor_dev->dmachan); - sw_desc->async_tx.tx_submit = mv_xor_v2_tx_submit; - async_tx_ack(&sw_desc->async_tx); - return sw_desc; } @@ -785,8 +790,15 @@ static int mv_xor_v2_probe(struct platform_device *pdev) /* add all SW descriptors to the free list */ for (i = 0; i < MV_XOR_V2_DESC_NUM; i++) { - xor_dev->sw_desq[i].idx = i; - list_add(&xor_dev->sw_desq[i].free_list, + struct mv_xor_v2_sw_desc *sw_desc = + xor_dev->sw_desq + i; + sw_desc->idx = i; + dma_async_tx_descriptor_init(&sw_desc->async_tx, + &xor_dev->dmachan); + sw_desc->async_tx.tx_submit = mv_xor_v2_tx_submit; + async_tx_ack(&sw_desc->async_tx); + + list_add(&sw_desc->free_list, &xor_dev->free_sw_desc); } From 0d0918504a96d406d1a8beedab47bfb93c01ee67 Mon Sep 17 00:00:00 2001 From: Hanna Hawa Date: Fri, 5 May 2017 11:57:47 +0200 Subject: [PATCH 157/235] dmaengine: mv_xor_v2: enable XOR engine after its configuration commit ab2c5f0a77fe49bdb6e307b397496373cb47d2c2 upstream. The engine was enabled prior to its configuration, which isn't correct. This patch relocates the activation of the XOR engine, to be after the configuration of the XOR engine. Fixes: 19a340b1a820 ("dmaengine: mv_xor_v2: new driver") Signed-off-by: Hanna Hawa Signed-off-by: Thomas Petazzoni Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/mv_xor_v2.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/dma/mv_xor_v2.c b/drivers/dma/mv_xor_v2.c index cb60e7c4aa16..211b8c0e3cfb 100644 --- a/drivers/dma/mv_xor_v2.c +++ b/drivers/dma/mv_xor_v2.c @@ -653,9 +653,6 @@ static int mv_xor_v2_descq_init(struct mv_xor_v2_device *xor_dev) writel((xor_dev->hw_desq & 0xFFFF00000000) >> 32, xor_dev->dma_base + MV_XOR_V2_DMA_DESQ_BAHR_OFF); - /* enable the DMA engine */ - writel(0, xor_dev->dma_base + MV_XOR_V2_DMA_DESQ_STOP_OFF); - /* * This is a temporary solution, until we activate the * SMMU. Set the attributes for reading & writing data buffers @@ -699,6 +696,9 @@ static int mv_xor_v2_descq_init(struct mv_xor_v2_device *xor_dev) reg |= MV_XOR_V2_GLOB_PAUSE_AXI_TIME_DIS_VAL; writel(reg, xor_dev->glob_base + MV_XOR_V2_GLOB_PAUSE); + /* enable the DMA engine */ + writel(0, xor_dev->dma_base + MV_XOR_V2_DMA_DESQ_STOP_OFF); + return 0; } From b2c8bb06bc8322951e426cbafe45deae36a124da Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Fri, 5 May 2017 11:57:48 +0200 Subject: [PATCH 158/235] dmaengine: mv_xor_v2: fix tx_submit() implementation commit 44d5887a8bf1e86915c8ff647337cb138149da82 upstream. The mv_xor_v2_tx_submit() gets the next available HW descriptor by calling mv_xor_v2_get_desq_write_ptr(), which reads a HW register telling the next available HW descriptor. This was working fine when HW descriptors were issued for processing directly in tx_submit(). However, as part of the review process of the driver, a change was requested to move the actual kick-off of HW descriptors processing to ->issue_pending(). Due to this, reading the HW register to know the next available HW descriptor no longer works. So instead of using this HW register, we implemented a software index pointing to the next available HW descriptor. Fixes: 19a340b1a820 ("dmaengine: mv_xor_v2: new driver") Signed-off-by: Thomas Petazzoni Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/mv_xor_v2.c | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/drivers/dma/mv_xor_v2.c b/drivers/dma/mv_xor_v2.c index 211b8c0e3cfb..4684eceea759 100644 --- a/drivers/dma/mv_xor_v2.c +++ b/drivers/dma/mv_xor_v2.c @@ -161,6 +161,7 @@ struct mv_xor_v2_device { struct mv_xor_v2_sw_desc *sw_desq; int desc_size; unsigned int npendings; + unsigned int hw_queue_idx; }; /** @@ -213,18 +214,6 @@ static void mv_xor_v2_set_data_buffers(struct mv_xor_v2_device *xor_dev, } } -/* - * Return the next available index in the DESQ. - */ -static int mv_xor_v2_get_desq_write_ptr(struct mv_xor_v2_device *xor_dev) -{ - /* read the index for the next available descriptor in the DESQ */ - u32 reg = readl(xor_dev->dma_base + MV_XOR_V2_DMA_DESQ_ALLOC_OFF); - - return ((reg >> MV_XOR_V2_DMA_DESQ_ALLOC_WRPTR_SHIFT) - & MV_XOR_V2_DMA_DESQ_ALLOC_WRPTR_MASK); -} - /* * notify the engine of new descriptors, and update the available index. */ @@ -306,7 +295,6 @@ static irqreturn_t mv_xor_v2_interrupt_handler(int irq, void *data) static dma_cookie_t mv_xor_v2_tx_submit(struct dma_async_tx_descriptor *tx) { - int desq_ptr; void *dest_hw_desc; dma_cookie_t cookie; struct mv_xor_v2_sw_desc *sw_desc = @@ -322,15 +310,15 @@ mv_xor_v2_tx_submit(struct dma_async_tx_descriptor *tx) spin_lock_bh(&xor_dev->lock); cookie = dma_cookie_assign(tx); - /* get the next available slot in the DESQ */ - desq_ptr = mv_xor_v2_get_desq_write_ptr(xor_dev); - /* copy the HW descriptor from the SW descriptor to the DESQ */ - dest_hw_desc = xor_dev->hw_desq_virt + desq_ptr; + dest_hw_desc = xor_dev->hw_desq_virt + xor_dev->hw_queue_idx; memcpy(dest_hw_desc, &sw_desc->hw_desc, xor_dev->desc_size); xor_dev->npendings++; + xor_dev->hw_queue_idx++; + if (xor_dev->hw_queue_idx >= MV_XOR_V2_DESC_NUM) + xor_dev->hw_queue_idx = 0; spin_unlock_bh(&xor_dev->lock); From eb5afaba617739df3fea2d6312cf493822d14934 Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Fri, 5 May 2017 11:57:49 +0200 Subject: [PATCH 159/235] dmaengine: mv_xor_v2: remove interrupt coalescing commit 9dd4f319bac25334a869d9276b19eac9e478fd33 upstream. The current implementation of interrupt coalescing doesn't work, because it doesn't configure the coalescing timer, which is needed to make sure we get an interrupt at some point. As a fix for stable, we simply remove the interrupt coalescing functionality. It will be re-introduced properly in a future commit. Fixes: 19a340b1a820 ("dmaengine: mv_xor_v2: new driver") Signed-off-by: Thomas Petazzoni Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/mv_xor_v2.c | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/drivers/dma/mv_xor_v2.c b/drivers/dma/mv_xor_v2.c index 4684eceea759..b133fe29d788 100644 --- a/drivers/dma/mv_xor_v2.c +++ b/drivers/dma/mv_xor_v2.c @@ -246,22 +246,6 @@ static int mv_xor_v2_set_desc_size(struct mv_xor_v2_device *xor_dev) return MV_XOR_V2_EXT_DESC_SIZE; } -/* - * Set the IMSG threshold - */ -static inline -void mv_xor_v2_set_imsg_thrd(struct mv_xor_v2_device *xor_dev, int thrd_val) -{ - u32 reg; - - reg = readl(xor_dev->dma_base + MV_XOR_V2_DMA_IMSG_THRD_OFF); - - reg &= (~MV_XOR_V2_DMA_IMSG_THRD_MASK << MV_XOR_V2_DMA_IMSG_THRD_SHIFT); - reg |= (thrd_val << MV_XOR_V2_DMA_IMSG_THRD_SHIFT); - - writel(reg, xor_dev->dma_base + MV_XOR_V2_DMA_IMSG_THRD_OFF); -} - static irqreturn_t mv_xor_v2_interrupt_handler(int irq, void *data) { struct mv_xor_v2_device *xor_dev = data; @@ -277,12 +261,6 @@ static irqreturn_t mv_xor_v2_interrupt_handler(int irq, void *data) if (!ndescs) return IRQ_NONE; - /* - * Update IMSG threshold, to disable new IMSG interrupts until - * end of the tasklet - */ - mv_xor_v2_set_imsg_thrd(xor_dev, MV_XOR_V2_DESC_NUM); - /* schedule a tasklet to handle descriptors callbacks */ tasklet_schedule(&xor_dev->irq_tasklet); @@ -607,9 +585,6 @@ static void mv_xor_v2_tasklet(unsigned long data) /* free the descriptores */ mv_xor_v2_free_desc_from_desq(xor_dev, num_of_pending); } - - /* Update IMSG threshold, to enable new IMSG interrupts */ - mv_xor_v2_set_imsg_thrd(xor_dev, 0); } /* From 1f67d28d2707dc1da24c7f94b0a80721ea89abac Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Fri, 5 May 2017 11:57:50 +0200 Subject: [PATCH 160/235] dmaengine: mv_xor_v2: set DMA mask to 40 bits commit b2d3c270f9f2fb82518ac500a9849c3aaf503852 upstream. The XORv2 engine on Armada 7K/8K can only access the first 40 bits of the physical address space, so the DMA mask must be set accordingly. Fixes: 19a340b1a820 ("dmaengine: mv_xor_v2: new driver") Signed-off-by: Thomas Petazzoni Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/mv_xor_v2.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/dma/mv_xor_v2.c b/drivers/dma/mv_xor_v2.c index b133fe29d788..f3e211f8f6c5 100644 --- a/drivers/dma/mv_xor_v2.c +++ b/drivers/dma/mv_xor_v2.c @@ -693,6 +693,10 @@ static int mv_xor_v2_probe(struct platform_device *pdev) platform_set_drvdata(pdev, xor_dev); + ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(40)); + if (ret) + return ret; + xor_dev->clk = devm_clk_get(&pdev->dev, NULL); if (IS_ERR(xor_dev->clk) && PTR_ERR(xor_dev->clk) == -EPROBE_DEFER) return -EPROBE_DEFER; From 08229c119c426fc8666f1dc2822a8c38fc77fa6a Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 1 Mar 2017 09:02:33 +0800 Subject: [PATCH 161/235] cfq-iosched: fix the delay of cfq_group's vdisktime under iops mode commit 5be6b75610cefd1e21b98a218211922c2feb6e08 upstream. When adding a cfq_group into the cfq service tree, we use CFQ_IDLE_DELAY as the delay of cfq_group's vdisktime if there have been other cfq_groups already. When cfq is under iops mode, commit 9a7f38c42c2b ("cfq-iosched: Convert from jiffies to nanoseconds") could result in a large iops delay and lead to an abnormal io schedule delay for the added cfq_group. To fix it, we just need to revert to the old CFQ_IDLE_DELAY value: HZ / 5 when iops mode is enabled. Despite having the same value, the delay of a cfq_queue in idle class and the delay of cfq_group are different things, so I define two new macros for the delay of a cfq_group under time-slice mode and iops mode. Fixes: 9a7f38c42c2b ("cfq-iosched: Convert from jiffies to nanoseconds") Signed-off-by: Hou Tao Acked-by: Jan Kara Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- block/cfq-iosched.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 3ab6807773ee..c7c3d4e6bc27 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -36,9 +36,13 @@ static const u64 cfq_target_latency = (u64)NSEC_PER_SEC * 3/10; /* 300 ms */ static const int cfq_hist_divisor = 4; /* - * offset from end of service tree + * offset from end of queue service tree for idle class */ #define CFQ_IDLE_DELAY (NSEC_PER_SEC / 5) +/* offset from end of group service tree under time slice mode */ +#define CFQ_SLICE_MODE_GROUP_DELAY (NSEC_PER_SEC / 5) +/* offset from end of group service under IOPS mode */ +#define CFQ_IOPS_MODE_GROUP_DELAY (HZ / 5) /* * below this threshold, we consider thinktime immediate @@ -1370,6 +1374,14 @@ cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) cfqg->vfraction = max_t(unsigned, vfr, 1); } +static inline u64 cfq_get_cfqg_vdisktime_delay(struct cfq_data *cfqd) +{ + if (!iops_mode(cfqd)) + return CFQ_SLICE_MODE_GROUP_DELAY; + else + return CFQ_IOPS_MODE_GROUP_DELAY; +} + static void cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg) { @@ -1389,7 +1401,8 @@ cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg) n = rb_last(&st->rb); if (n) { __cfqg = rb_entry_cfqg(n); - cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY; + cfqg->vdisktime = __cfqg->vdisktime + + cfq_get_cfqg_vdisktime_delay(cfqd); } else cfqg->vdisktime = st->min_vdisktime; cfq_group_service_tree_add(st, cfqg); From 9636c086532baf9b042b7b7af809cb79e7d46e92 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Wed, 31 May 2017 14:03:57 +0100 Subject: [PATCH 162/235] xen/privcmd: Support correctly 64KB page granularity when mapping memory commit 753c09b5652bb4fe53e2db648002ec64b32b8827 upstream. Commit 5995a68 "xen/privcmd: Add support for Linux 64KB page granularity" did not go far enough to support 64KB in mmap_batch_fn. The variable 'nr' is the number of 4KB chunk to map. However, when Linux is using 64KB page granularity the array of pages (vma->vm_private_data) contain one page per 64KB. Fix it by incrementing st->index correctly. Furthermore, st->va is not correctly incremented as PAGE_SIZE != XEN_PAGE_SIZE. Fixes: 5995a68 ("xen/privcmd: Add support for Linux 64KB page granularity") Reported-by: Feng Kan Signed-off-by: Julien Grall Reviewed-by: Boris Ostrovsky Signed-off-by: Juergen Gross Signed-off-by: Greg Kroah-Hartman --- drivers/xen/privcmd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 702040fe2001..0e6061496972 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -335,8 +335,8 @@ static int mmap_batch_fn(void *data, int nr, void *state) st->global_error = 1; } } - st->va += PAGE_SIZE * nr; - st->index += nr; + st->va += XEN_PAGE_SIZE * nr; + st->index += nr / XEN_PFN_PER_PAGE; return 0; } From 9850844e0a0e6a7e419097aa545cc1f4e849d5ea Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 21 May 2017 22:33:23 -0400 Subject: [PATCH 163/235] ext4: fix SEEK_HOLE commit 7d95eddf313c88b24f99d4ca9c2411a4b82fef33 upstream. Currently, SEEK_HOLE implementation in ext4 may both return that there's a hole at some offset although that offset already has data and skip some holes during a search for the next hole. The first problem is demostrated by: xfs_io -c "falloc 0 256k" -c "pwrite 0 56k" -c "seek -h 0" file wrote 57344/57344 bytes at offset 0 56 KiB, 14 ops; 0.0000 sec (2.054 GiB/sec and 538461.5385 ops/sec) Whence Result HOLE 0 Where we can see that SEEK_HOLE wrongly returned offset 0 as containing a hole although we have written data there. The second problem can be demonstrated by: xfs_io -c "falloc 0 256k" -c "pwrite 0 56k" -c "pwrite 128k 8k" -c "seek -h 0" file wrote 57344/57344 bytes at offset 0 56 KiB, 14 ops; 0.0000 sec (1.978 GiB/sec and 518518.5185 ops/sec) wrote 8192/8192 bytes at offset 131072 8 KiB, 2 ops; 0.0000 sec (2 GiB/sec and 500000.0000 ops/sec) Whence Result HOLE 139264 Where we can see that hole at offsets 56k..128k has been ignored by the SEEK_HOLE call. The underlying problem is in the ext4_find_unwritten_pgoff() which is just buggy. In some cases it fails to update returned offset when it finds a hole (when no pages are found or when the first found page has higher index than expected), in some cases conditions for detecting hole are just missing (we fail to detect a situation where indices of returned pages are not contiguous). Fix ext4_find_unwritten_pgoff() to properly detect non-contiguous page indices and also handle all cases where we got less pages then expected in one place and handle it properly there. Fixes: c8c0df241cc2719b1262e627f999638411934f60 CC: Zheng Liu Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/file.c | 50 ++++++++++++++------------------------------------ 1 file changed, 14 insertions(+), 36 deletions(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 2a822d30e73f..9e77c089e8cb 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -432,47 +432,27 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, num = min_t(pgoff_t, end - index, PAGEVEC_SIZE); nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, (pgoff_t)num); - if (nr_pages == 0) { - if (whence == SEEK_DATA) - break; - - BUG_ON(whence != SEEK_HOLE); - /* - * If this is the first time to go into the loop and - * offset is not beyond the end offset, it will be a - * hole at this offset - */ - if (lastoff == startoff || lastoff < endoff) - found = 1; + if (nr_pages == 0) break; - } - - /* - * If this is the first time to go into the loop and - * offset is smaller than the first page offset, it will be a - * hole at this offset. - */ - if (lastoff == startoff && whence == SEEK_HOLE && - lastoff < page_offset(pvec.pages[0])) { - found = 1; - break; - } for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; struct buffer_head *bh, *head; /* - * If the current offset is not beyond the end of given - * range, it will be a hole. + * If current offset is smaller than the page offset, + * there is a hole at this offset. */ - if (lastoff < endoff && whence == SEEK_HOLE && - page->index > end) { + if (whence == SEEK_HOLE && lastoff < endoff && + lastoff < page_offset(pvec.pages[i])) { found = 1; *offset = lastoff; goto out; } + if (page->index > end) + goto out; + lock_page(page); if (unlikely(page->mapping != inode->i_mapping)) { @@ -512,20 +492,18 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, unlock_page(page); } - /* - * The no. of pages is less than our desired, that would be a - * hole in there. - */ - if (nr_pages < num && whence == SEEK_HOLE) { - found = 1; - *offset = lastoff; + /* The no. of pages is less than our desired, we are done. */ + if (nr_pages < num) break; - } index = pvec.pages[i - 1]->index + 1; pagevec_release(&pvec); } while (index <= end); + if (whence == SEEK_HOLE && lastoff < endoff) { + found = 1; + *offset = lastoff; + } out: pagevec_release(&pvec); return found; From 9890b9cb75c9f661546c81098ec49fbfc24abc9c Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Sun, 21 May 2017 22:36:23 -0400 Subject: [PATCH 164/235] ext4: keep existing extra fields when inode expands commit 887a9730614727c4fff7cb756711b190593fc1df upstream. ext4_expand_extra_isize() should clear only space between old and new size. Fixes: 6dd4ee7cab7e # v2.6.23 Signed-off-by: Konstantin Khlebnikov Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inode.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 01329688fb9e..6a6fa1fca8cf 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5441,8 +5441,9 @@ static int ext4_expand_extra_isize(struct inode *inode, /* No extended attributes present */ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { - memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, - new_extra_isize); + memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + + EXT4_I(inode)->i_extra_isize, 0, + new_extra_isize - EXT4_I(inode)->i_extra_isize); EXT4_I(inode)->i_extra_isize = new_extra_isize; return 0; } From 2e16921d1743db8661d206cdc09af23fdaa1a1a7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 26 May 2017 17:40:52 -0400 Subject: [PATCH 165/235] ext4: fix data corruption with EXT4_GET_BLOCKS_ZERO commit 4f8caa60a5a13a78f26198618f21774bd6aa6498 upstream. When ext4_map_blocks() is called with EXT4_GET_BLOCKS_ZERO to zero-out allocated blocks and these blocks are actually converted from unwritten extent the following race can happen: CPU0 CPU1 page fault page fault ... ... ext4_map_blocks() ext4_ext_map_blocks() ext4_ext_handle_unwritten_extents() ext4_ext_convert_to_initialized() - zero out converted extent ext4_zeroout_es() - inserts extent as initialized in status tree ext4_map_blocks() ext4_es_lookup_extent() - finds initialized extent write data ext4_issue_zeroout() - zeroes out new extent overwriting data This problem can be reproduced by generic/340 for the fallocated case for the last block in the file. Fix the problem by avoiding zeroing out the area we are mapping with ext4_map_blocks() in ext4_ext_convert_to_initialized(). It is pointless to zero out this area in the first place as the caller asked us to convert the area to initialized because he is just going to write data there before the transaction finishes. To achieve this we delete the special case of zeroing out full extent as that will be handled by the cases below zeroing only the part of the extent that needs it. We also instruct ext4_split_extent() that the middle of extent being split contains data so that ext4_split_extent_at() cannot zero out full extent in case of ENOSPC. Fixes: 12735f881952c32b31bc4e433768f18489f79ec9 Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/extents.c | 80 ++++++++++++++++++++++------------------------- 1 file changed, 37 insertions(+), 43 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 9fbf92ca358c..01ab957f5e09 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3413,13 +3413,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, struct ext4_sb_info *sbi; struct ext4_extent_header *eh; struct ext4_map_blocks split_map; - struct ext4_extent zero_ex; + struct ext4_extent zero_ex1, zero_ex2; struct ext4_extent *ex, *abut_ex; ext4_lblk_t ee_block, eof_block; unsigned int ee_len, depth, map_len = map->m_len; int allocated = 0, max_zeroout = 0; int err = 0; - int split_flag = 0; + int split_flag = EXT4_EXT_DATA_VALID2; ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" "block %llu, max_blocks %u\n", inode->i_ino, @@ -3436,7 +3436,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); - zero_ex.ee_len = 0; + zero_ex1.ee_len = 0; + zero_ex2.ee_len = 0; trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); @@ -3576,62 +3577,52 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, if (ext4_encrypted_inode(inode)) max_zeroout = 0; - /* If extent is less than s_max_zeroout_kb, zeroout directly */ - if (max_zeroout && (ee_len <= max_zeroout)) { - err = ext4_ext_zeroout(inode, ex); - if (err) - goto out; - zero_ex.ee_block = ex->ee_block; - zero_ex.ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)); - ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex)); - - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - goto out; - ext4_ext_mark_initialized(ex); - ext4_ext_try_to_merge(handle, inode, path, ex); - err = ext4_ext_dirty(handle, inode, path + path->p_depth); - goto out; - } - /* - * four cases: + * five cases: * 1. split the extent into three extents. - * 2. split the extent into two extents, zeroout the first half. - * 3. split the extent into two extents, zeroout the second half. + * 2. split the extent into two extents, zeroout the head of the first + * extent. + * 3. split the extent into two extents, zeroout the tail of the second + * extent. * 4. split the extent into two extents with out zeroout. + * 5. no splitting needed, just possibly zeroout the head and / or the + * tail of the extent. */ split_map.m_lblk = map->m_lblk; split_map.m_len = map->m_len; - if (max_zeroout && (allocated > map->m_len)) { + if (max_zeroout && (allocated > split_map.m_len)) { if (allocated <= max_zeroout) { - /* case 3 */ - zero_ex.ee_block = - cpu_to_le32(map->m_lblk); - zero_ex.ee_len = cpu_to_le16(allocated); - ext4_ext_store_pblock(&zero_ex, - ext4_ext_pblock(ex) + map->m_lblk - ee_block); - err = ext4_ext_zeroout(inode, &zero_ex); + /* case 3 or 5 */ + zero_ex1.ee_block = + cpu_to_le32(split_map.m_lblk + + split_map.m_len); + zero_ex1.ee_len = + cpu_to_le16(allocated - split_map.m_len); + ext4_ext_store_pblock(&zero_ex1, + ext4_ext_pblock(ex) + split_map.m_lblk + + split_map.m_len - ee_block); + err = ext4_ext_zeroout(inode, &zero_ex1); if (err) goto out; - split_map.m_lblk = map->m_lblk; split_map.m_len = allocated; - } else if (map->m_lblk - ee_block + map->m_len < max_zeroout) { - /* case 2 */ - if (map->m_lblk != ee_block) { - zero_ex.ee_block = ex->ee_block; - zero_ex.ee_len = cpu_to_le16(map->m_lblk - + } + if (split_map.m_lblk - ee_block + split_map.m_len < + max_zeroout) { + /* case 2 or 5 */ + if (split_map.m_lblk != ee_block) { + zero_ex2.ee_block = ex->ee_block; + zero_ex2.ee_len = cpu_to_le16(split_map.m_lblk - ee_block); - ext4_ext_store_pblock(&zero_ex, + ext4_ext_store_pblock(&zero_ex2, ext4_ext_pblock(ex)); - err = ext4_ext_zeroout(inode, &zero_ex); + err = ext4_ext_zeroout(inode, &zero_ex2); if (err) goto out; } + split_map.m_len += split_map.m_lblk - ee_block; split_map.m_lblk = ee_block; - split_map.m_len = map->m_lblk - ee_block + map->m_len; allocated = map->m_len; } } @@ -3642,8 +3633,11 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, err = 0; out: /* If we have gotten a failure, don't zero out status tree */ - if (!err) - err = ext4_zeroout_es(inode, &zero_ex); + if (!err) { + err = ext4_zeroout_es(inode, &zero_ex1); + if (!err) + err = ext4_zeroout_es(inode, &zero_ex2); + } return err ? err : allocated; } From c404f0dee7a82c08eda68aa14350072ae189a5ed Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 29 May 2017 13:24:55 -0400 Subject: [PATCH 166/235] ext4: fix fdatasync(2) after extent manipulation operations commit 67a7d5f561f469ad2fa5154d2888258ab8e6df7c upstream. Currently, extent manipulation operations such as hole punch, range zeroing, or extent shifting do not record the fact that file data has changed and thus fdatasync(2) has a work to do. As a result if we crash e.g. after a punch hole and fdatasync, user can still possibly see the punched out data after journal replay. Test generic/392 fails due to these problems. Fix the problem by properly marking that file data has changed in these operations. Fixes: a4bb6b64e39abc0e41ca077725f2a72c868e7622 Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/extents.c | 5 +++++ fs/ext4/inode.c | 2 ++ 2 files changed, 7 insertions(+) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 01ab957f5e09..a3e0b3b7441d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4887,6 +4887,8 @@ static long ext4_zero_range(struct file *file, loff_t offset, /* Zero out partial block at the edges of the range */ ret = ext4_zero_partial_blocks(handle, inode, offset, len); + if (ret >= 0) + ext4_update_inode_fsync_trans(handle, inode, 1); if (file->f_flags & O_SYNC) ext4_handle_sync(handle); @@ -5573,6 +5575,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) ext4_handle_sync(handle); inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); + ext4_update_inode_fsync_trans(handle, inode, 1); out_stop: ext4_journal_stop(handle); @@ -5746,6 +5749,8 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); + if (ret >= 0) + ext4_update_inode_fsync_trans(handle, inode, 1); out_stop: ext4_journal_stop(handle); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6a6fa1fca8cf..8a34d8c0c05f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4044,6 +4044,8 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); + if (ret >= 0) + ext4_update_inode_fsync_trans(handle, inode, 1); out_stop: ext4_journal_stop(handle); out_dio: From 1308eeec2fc5608370a469d96795982a6712b22b Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 1 Jun 2017 13:54:30 +0200 Subject: [PATCH 167/235] drm: Fix oops + Xserver hang when unplugging USB drm devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 75fb636324a839c2c31be9f81644034c6142e469 upstream. commit a39be606f99d ("drm: Do a full device unregister when unplugging") causes backtraces like this one when unplugging an usb drm device while it is in use: usb 2-3: USB disconnect, device number 25 ------------[ cut here ]------------ WARNING: CPU: 0 PID: 242 at drivers/gpu/drm/drm_mode_config.c:424 drm_mode_config_cleanup+0x220/0x280 [drm] ... RIP: 0010:drm_mode_config_cleanup+0x220/0x280 [drm] ... Call Trace: gm12u320_modeset_cleanup+0xe/0x10 [gm12u320] gm12u320_driver_unload+0x35/0x70 [gm12u320] drm_dev_unregister+0x3c/0xe0 [drm] drm_unplug_dev+0x12/0x60 [drm] gm12u320_usb_disconnect+0x36/0x40 [gm12u320] usb_unbind_interface+0x72/0x280 device_release_driver_internal+0x158/0x210 device_release_driver+0x12/0x20 bus_remove_device+0x104/0x180 device_del+0x1d2/0x350 usb_disable_device+0x9f/0x270 usb_disconnect+0xc6/0x260 ... [drm:drm_mode_config_cleanup [drm]] *ERROR* connector Unknown-1 leaked! ------------[ cut here ]------------ WARNING: CPU: 0 PID: 242 at drivers/gpu/drm/drm_mode_config.c:458 drm_mode_config_cleanup+0x268/0x280 [drm] ... ---[ end trace 80df975dae439ed6 ]--- general protection fault: 0000 [#1] SMP ... Call Trace: ? __switch_to+0x225/0x450 drm_mode_rmfb_work_fn+0x55/0x70 [drm] process_one_work+0x193/0x3c0 worker_thread+0x4a/0x3a0 ... RIP: drm_framebuffer_remove+0x62/0x3f0 [drm] RSP: ffffb776c39dfd98 ---[ end trace 80df975dae439ed7 ]--- After which the system is unusable this is caused by drm_dev_unregister getting called immediately on unplug, which calls the drivers unload function which calls drm_mode_config_cleanup which removes the framebuffer object while userspace is still holding a reference to it. Reverting commit a39be606f99d ("drm: Do a full device unregister when unplugging") leads to the following oops on unplug instead, when userspace closes the last fd referencing the drm_dev: sysfs group 'power' not found for kobject 'card1-Unknown-1' ------------[ cut here ]------------ WARNING: CPU: 0 PID: 2459 at fs/sysfs/group.c:237 sysfs_remove_group+0x80/0x90 ... RIP: 0010:sysfs_remove_group+0x80/0x90 ... Call Trace: dpm_sysfs_remove+0x57/0x60 device_del+0xfd/0x350 device_unregister+0x1a/0x60 drm_sysfs_connector_remove+0x39/0x50 [drm] drm_connector_unregister+0x5a/0x70 [drm] drm_connector_unregister_all+0x45/0xa0 [drm] drm_modeset_unregister_all+0x12/0x30 [drm] drm_dev_unregister+0xca/0xe0 [drm] drm_put_dev+0x32/0x60 [drm] drm_release+0x2f3/0x380 [drm] __fput+0xdf/0x1e0 ... ---[ end trace ecfb91ac85688bbe ]--- BUG: unable to handle kernel NULL pointer dereference at 00000000000000a8 IP: down_write+0x1f/0x40 ... Call Trace: debugfs_remove_recursive+0x55/0x1b0 drm_debugfs_connector_remove+0x21/0x40 [drm] drm_connector_unregister+0x62/0x70 [drm] drm_connector_unregister_all+0x45/0xa0 [drm] drm_modeset_unregister_all+0x12/0x30 [drm] drm_dev_unregister+0xca/0xe0 [drm] drm_put_dev+0x32/0x60 [drm] drm_release+0x2f3/0x380 [drm] __fput+0xdf/0x1e0 ... ---[ end trace ecfb91ac85688bbf ]--- This is caused by the revert moving back to drm_unplug_dev calling drm_minor_unregister which does: device_del(minor->kdev); dev_set_drvdata(minor->kdev, NULL); /* safety belt */ drm_debugfs_cleanup(minor); Causing the sysfs entries to already be removed even though we still have references to them in e.g. drm_connector. Note we must call drm_minor_unregister to notify userspace of the unplug of the device, so calling drm_dev_unregister is not completely wrong the problem is that drm_dev_unregister does too much. This commit fixes drm_unplug_dev by not only reverting commit a39be606f99d ("drm: Do a full device unregister when unplugging") but by also adding a call to drm_modeset_unregister_all before the drm_minor_unregister calls to make sure all sysfs entries are removed before calling device_del(minor->kdev) thereby also fixing the second set of oopses caused by just reverting the commit. Fixes: a39be606f99d ("drm: Do a full device unregister when unplugging") Cc: Chris Wilson Cc: Jeffy Cc: Marco Diego Aurélio Mesquita Reported-by: Marco Diego Aurélio Mesquita Signed-off-by: Hans de Goede Reviewed-by: Chris Wilson Signed-off-by: Sean Paul Link: http://patchwork.freedesktop.org/patch/msgid/20170601115430.4113-1-hdegoede@redhat.com Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/drm_drv.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c index 6efdba4993fc..0f2fa9044668 100644 --- a/drivers/gpu/drm/drm_drv.c +++ b/drivers/gpu/drm/drm_drv.c @@ -379,7 +379,12 @@ EXPORT_SYMBOL(drm_put_dev); void drm_unplug_dev(struct drm_device *dev) { /* for a USB device */ - drm_dev_unregister(dev); + if (drm_core_check_feature(dev, DRIVER_MODESET)) + drm_modeset_unregister_all(dev); + + drm_minor_unregister(dev, DRM_MINOR_PRIMARY); + drm_minor_unregister(dev, DRM_MINOR_RENDER); + drm_minor_unregister(dev, DRM_MINOR_CONTROL); mutex_lock(&drm_global_mutex); From 405ac24a0aecc6c60134900dcb5fa46322b9a9a4 Mon Sep 17 00:00:00 2001 From: Thinh Nguyen Date: Thu, 11 May 2017 17:26:48 -0700 Subject: [PATCH 168/235] usb: gadget: f_mass_storage: Serialize wake and sleep execution commit dc9217b69dd6089dcfeb86ed4b3c671504326087 upstream. f_mass_storage has a memorry barrier issue with the sleep and wake functions that can cause a deadlock. This results in intermittent hangs during MSC file transfer. The host will reset the device after receiving no response to resume the transfer. This issue is seen when dwc3 is processing 2 transfer-in-progress events at the same time, invoking completion handlers for CSW and CBW. Also this issue occurs depending on the system timing and latency. To increase the chance to hit this issue, you can force dwc3 driver to wait and process those 2 events at once by adding a small delay (~100us) in dwc3_check_event_buf() whenever the request is for CSW and read the event count again. Avoid debugging with printk and ftrace as extra delays and memory barrier will mask this issue. Scenario which can lead to failure: ----------------------------------- 1) The main thread sleeps and waits for the next command in get_next_command(). 2) bulk_in_complete() wakes up main thread for CSW. 3) bulk_out_complete() tries to wake up the running main thread for CBW. 4) thread_wakeup_needed is not loaded with correct value in sleep_thread(). 5) Main thread goes to sleep again. The pattern is shown below. Note the 2 critical variables. * common->thread_wakeup_needed * bh->state CPU 0 (sleep_thread) CPU 1 (wakeup_thread) ============================== =============================== bh->state = BH_STATE_FULL; smp_wmb(); thread_wakeup_needed = 0; thread_wakeup_needed = 1; smp_rmb(); if (bh->state != BH_STATE_FULL) sleep again ... As pointed out by Alan Stern, this is an R-pattern issue. The issue can be seen when there are two wakeups in quick succession. The thread_wakeup_needed can be overwritten in sleep_thread, and the read of the bh->state maybe reordered before the write to thread_wakeup_needed. This patch applies full memory barrier smp_mb() in both sleep_thread() and wakeup_thread() to ensure the order which the thread_wakeup_needed and bh->state are written and loaded. However, a better solution in the future would be to use wait_queue method that takes care of managing memory barrier between waker and waiter. Acked-by: Alan Stern Signed-off-by: Thinh Nguyen Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_mass_storage.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/function/f_mass_storage.c b/drivers/usb/gadget/function/f_mass_storage.c index 8f3659b65f53..ccd93c9e26ab 100644 --- a/drivers/usb/gadget/function/f_mass_storage.c +++ b/drivers/usb/gadget/function/f_mass_storage.c @@ -395,7 +395,11 @@ static int fsg_set_halt(struct fsg_dev *fsg, struct usb_ep *ep) /* Caller must hold fsg->lock */ static void wakeup_thread(struct fsg_common *common) { - smp_wmb(); /* ensure the write of bh->state is complete */ + /* + * Ensure the reading of thread_wakeup_needed + * and the writing of bh->state are completed + */ + smp_mb(); /* Tell the main thread that something has happened */ common->thread_wakeup_needed = 1; if (common->thread_task) @@ -626,7 +630,12 @@ static int sleep_thread(struct fsg_common *common, bool can_freeze) } __set_current_state(TASK_RUNNING); common->thread_wakeup_needed = 0; - smp_rmb(); /* ensure the latest bh->state is visible */ + + /* + * Ensure the writing of thread_wakeup_needed + * and the reading of bh->state are completed + */ + smp_mb(); return rc; } From 59db536f5812707e8e035b2c68590b449c7a0cec Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Mon, 24 Apr 2017 12:35:51 +0000 Subject: [PATCH 169/235] usb: chipidea: udc: fix NULL pointer dereference if udc_start failed commit aa1f058d7d9244423b8c5a75b9484b1115df7f02 upstream. Fix below NULL pointer dereference. we set ci->roles[CI_ROLE_GADGET] too early in ci_hdrc_gadget_init(), if udc_start() fails due to some reason, the ci->roles[CI_ROLE_GADGET] check in ci_hdrc_gadget_destroy can't protect us. We fix this issue by only setting ci->roles[CI_ROLE_GADGET] if udc_start() succeed. [ 1.398550] Unable to handle kernel NULL pointer dereference at virtual address 00000000 ... [ 1.448600] PC is at dma_pool_free+0xb8/0xf0 [ 1.453012] LR is at dma_pool_free+0x28/0xf0 [ 2.113369] [] dma_pool_free+0xb8/0xf0 [ 2.118857] [] destroy_eps+0x4c/0x68 [ 2.124165] [] ci_hdrc_gadget_destroy+0x28/0x50 [ 2.130461] [] ci_hdrc_probe+0x588/0x7e8 [ 2.136129] [] platform_drv_probe+0x50/0xb8 [ 2.142066] [] driver_probe_device+0x1fc/0x2a8 [ 2.148270] [] __device_attach_driver+0x9c/0xf8 [ 2.154563] [] bus_for_each_drv+0x58/0x98 [ 2.160317] [] __device_attach+0xc4/0x138 [ 2.166072] [] device_initial_probe+0x10/0x18 [ 2.172185] [] bus_probe_device+0x94/0xa0 [ 2.177940] [] device_add+0x3f0/0x560 [ 2.183337] [] platform_device_add+0x180/0x240 [ 2.189541] [] ci_hdrc_add_device+0x440/0x4f8 [ 2.195654] [] ci_hdrc_usb2_probe+0x13c/0x2d8 [ 2.201769] [] platform_drv_probe+0x50/0xb8 [ 2.207705] [] driver_probe_device+0x1fc/0x2a8 [ 2.213910] [] __driver_attach+0xac/0xb0 [ 2.219575] [] bus_for_each_dev+0x60/0xa0 [ 2.225329] [] driver_attach+0x20/0x28 [ 2.230816] [] bus_add_driver+0x1d0/0x238 [ 2.236571] [] driver_register+0x60/0xf8 [ 2.242237] [] __platform_driver_register+0x44/0x50 [ 2.248891] [] ci_hdrc_usb2_driver_init+0x18/0x20 [ 2.255365] [] do_one_initcall+0x38/0x128 [ 2.261121] [] kernel_init_freeable+0x1ac/0x250 [ 2.267414] [] kernel_init+0x10/0x100 [ 2.272810] [] ret_from_fork+0x10/0x50 Fixes: 3f124d233e97 ("usb: chipidea: add role init and destroy APIs") Signed-off-by: Jisheng Zhang Signed-off-by: Peter Chen Signed-off-by: Greg Kroah-Hartman --- drivers/usb/chipidea/udc.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/usb/chipidea/udc.c b/drivers/usb/chipidea/udc.c index c9e80ad48fdc..6a15b7250e9c 100644 --- a/drivers/usb/chipidea/udc.c +++ b/drivers/usb/chipidea/udc.c @@ -1987,6 +1987,7 @@ static void udc_id_switch_for_host(struct ci_hdrc *ci) int ci_hdrc_gadget_init(struct ci_hdrc *ci) { struct ci_role_driver *rdrv; + int ret; if (!hw_read(ci, CAP_DCCPARAMS, DCCPARAMS_DC)) return -ENXIO; @@ -1999,7 +2000,10 @@ int ci_hdrc_gadget_init(struct ci_hdrc *ci) rdrv->stop = udc_id_switch_for_host; rdrv->irq = udc_irq; rdrv->name = "gadget"; - ci->roles[CI_ROLE_GADGET] = rdrv; - return udc_start(ci); + ret = udc_start(ci); + if (!ret) + ci->roles[CI_ROLE_GADGET] = rdrv; + + return ret; } From 5404b0c0ea852c8982a3f7ff2c4a86a80e25e840 Mon Sep 17 00:00:00 2001 From: Michael Thalmeier Date: Thu, 18 May 2017 16:14:14 +0200 Subject: [PATCH 170/235] usb: chipidea: debug: check before accessing ci_role commit 0340ff83cd4475261e7474033a381bc125b45244 upstream. ci_role BUGs when the role is >= CI_ROLE_END. Signed-off-by: Michael Thalmeier Signed-off-by: Peter Chen Signed-off-by: Greg Kroah-Hartman --- drivers/usb/chipidea/debug.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/chipidea/debug.c b/drivers/usb/chipidea/debug.c index 6d23eede4d8c..1c31e8a08810 100644 --- a/drivers/usb/chipidea/debug.c +++ b/drivers/usb/chipidea/debug.c @@ -294,7 +294,8 @@ static int ci_role_show(struct seq_file *s, void *data) { struct ci_hdrc *ci = s->private; - seq_printf(s, "%s\n", ci_role(ci)->name); + if (ci->role != CI_ROLE_END) + seq_printf(s, "%s\n", ci_role(ci)->name); return 0; } From c5a8004434f0fba8010c38129e82964dd06d06b2 Mon Sep 17 00:00:00 2001 From: Oleg Drokin Date: Fri, 26 May 2017 23:40:33 -0400 Subject: [PATCH 171/235] staging/lustre/lov: remove set_fs() call from lov_getstripe() commit 0a33252e060e97ed3fbdcec9517672f1e91aaef3 upstream. lov_getstripe() calls set_fs(KERNEL_DS) so that it can handle a struct lov_user_md pointer from user- or kernel-space. This changes the behavior of copy_from_user() on SPARC and may result in a misaligned access exception which in turn oopses the kernel. In fact the relevant argument to lov_getstripe() is never called with a kernel-space pointer and so changing the address limits is unnecessary and so we remove the calls to save, set, and restore the address limits. Signed-off-by: John L. Hammond Reviewed-on: http://review.whamcloud.com/6150 Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-3221 Reviewed-by: Andreas Dilger Reviewed-by: Li Wei Signed-off-by: Oleg Drokin Signed-off-by: Greg Kroah-Hartman --- drivers/staging/lustre/lustre/lov/lov_pack.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/drivers/staging/lustre/lustre/lov/lov_pack.c b/drivers/staging/lustre/lustre/lov/lov_pack.c index be6e9857ce2a..a6d1cc804647 100644 --- a/drivers/staging/lustre/lustre/lov/lov_pack.c +++ b/drivers/staging/lustre/lustre/lov/lov_pack.c @@ -387,18 +387,10 @@ int lov_getstripe(struct lov_object *obj, struct lov_stripe_md *lsm, struct lov_mds_md *lmmk = NULL; int rc, lmmk_size, lmm_size; int lum_size; - mm_segment_t seg; if (!lsm) return -ENODATA; - /* - * "Switch to kernel segment" to allow copying from kernel space by - * copy_{to,from}_user(). - */ - seg = get_fs(); - set_fs(KERNEL_DS); - /* we only need the header part from user space to get lmm_magic and * lmm_stripe_count, (the header part is common to v1 and v3) */ @@ -478,6 +470,5 @@ int lov_getstripe(struct lov_object *obj, struct lov_stripe_md *lsm, out_free: kfree(lmmk); out: - set_fs(seg); return rc; } From e33679f994db7bf17535fd0179db3dca26ca8bda Mon Sep 17 00:00:00 2001 From: Raveendra Padasalagi Date: Tue, 16 May 2017 12:22:42 +0530 Subject: [PATCH 172/235] iio: adc: bcm_iproc_adc: swap primary and secondary isr handler's commit f7d86ecf83cb66d3c4c6ac4edb1dd50c0919aa2b upstream. The third argument of devm_request_threaded_irq() is the primary handler. It is called in hardirq context and checks whether the interrupt is relevant to the device. If the primary handler returns IRQ_WAKE_THREAD, the secondary handler (a.k.a. handler thread) is scheduled to run in process context. bcm_iproc_adc.c uses the secondary handler as the primary one and the other way around. So this patch fixes the same, along with re-naming the secondary handler and primary handler names properly. Tested on the BCM9583XX iProc SoC based boards. Fixes: 4324c97ecedc ("iio: Add driver for Broadcom iproc-static-adc") Reported-by: Pavel Roskin Signed-off-by: Raveendra Padasalagi Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/adc/bcm_iproc_adc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/iio/adc/bcm_iproc_adc.c b/drivers/iio/adc/bcm_iproc_adc.c index 21d38c8af21e..7f4f9c4150e3 100644 --- a/drivers/iio/adc/bcm_iproc_adc.c +++ b/drivers/iio/adc/bcm_iproc_adc.c @@ -143,7 +143,7 @@ static void iproc_adc_reg_dump(struct iio_dev *indio_dev) iproc_adc_dbg_reg(dev, adc_priv, IPROC_SOFT_BYPASS_DATA); } -static irqreturn_t iproc_adc_interrupt_handler(int irq, void *data) +static irqreturn_t iproc_adc_interrupt_thread(int irq, void *data) { u32 channel_intr_status; u32 intr_status; @@ -167,7 +167,7 @@ static irqreturn_t iproc_adc_interrupt_handler(int irq, void *data) return IRQ_NONE; } -static irqreturn_t iproc_adc_interrupt_thread(int irq, void *data) +static irqreturn_t iproc_adc_interrupt_handler(int irq, void *data) { irqreturn_t retval = IRQ_NONE; struct iproc_adc_priv *adc_priv; @@ -181,7 +181,7 @@ static irqreturn_t iproc_adc_interrupt_thread(int irq, void *data) adc_priv = iio_priv(indio_dev); regmap_read(adc_priv->regmap, IPROC_INTERRUPT_STATUS, &intr_status); - dev_dbg(&indio_dev->dev, "iproc_adc_interrupt_thread(),INTRPT_STS:%x\n", + dev_dbg(&indio_dev->dev, "iproc_adc_interrupt_handler(),INTRPT_STS:%x\n", intr_status); intr_channels = (intr_status & IPROC_ADC_INTR_MASK) >> IPROC_ADC_INTR; @@ -566,8 +566,8 @@ static int iproc_adc_probe(struct platform_device *pdev) } ret = devm_request_threaded_irq(&pdev->dev, adc_priv->irqno, - iproc_adc_interrupt_thread, iproc_adc_interrupt_handler, + iproc_adc_interrupt_thread, IRQF_SHARED, "iproc-adc", indio_dev); if (ret) { dev_err(&pdev->dev, "request_irq error %d\n", ret); From 8067c911c5e9025a3beaed225d0498edfdbbf8d3 Mon Sep 17 00:00:00 2001 From: Franziska Naepelt Date: Wed, 17 May 2017 12:41:19 +0200 Subject: [PATCH 173/235] iio: light: ltr501 Fix interchanged als/ps register field commit 7cc3bff4efe6164a0c8163331c8aa55454799f42 upstream. The register mapping for the IIO driver for the Liteon Light and Proximity sensor LTR501 interrupt mode is interchanged (ALS/PS). There is a register called INTERRUPT register (address 0x8F) Bit 0 represents PS measurement trigger. Bit 1 represents ALS measurement trigger. This two bit fields are interchanged within the driver. see datasheet page 24: http://optoelectronics.liteon.com/upload/download/DS86-2012-0006/S_110_LTR-501ALS-01_PrelimDS_ver1%5B1%5D.pdf Signed-off-by: Franziska Naepelt Fixes: 7ac702b3144b6 ("iio: ltr501: Add interrupt support") Acked-by: Peter Meerwald-Stadler Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/light/ltr501.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iio/light/ltr501.c b/drivers/iio/light/ltr501.c index 3afc53a3d0b6..c298fd86ed86 100644 --- a/drivers/iio/light/ltr501.c +++ b/drivers/iio/light/ltr501.c @@ -74,9 +74,9 @@ static const int int_time_mapping[] = {100000, 50000, 200000, 400000}; static const struct reg_field reg_field_it = REG_FIELD(LTR501_ALS_MEAS_RATE, 3, 4); static const struct reg_field reg_field_als_intr = - REG_FIELD(LTR501_INTR, 0, 0); -static const struct reg_field reg_field_ps_intr = REG_FIELD(LTR501_INTR, 1, 1); +static const struct reg_field reg_field_ps_intr = + REG_FIELD(LTR501_INTR, 0, 0); static const struct reg_field reg_field_als_rate = REG_FIELD(LTR501_ALS_MEAS_RATE, 0, 2); static const struct reg_field reg_field_ps_rate = From bad3b49b01aaafc000f2ffe1302d86ca397491ed Mon Sep 17 00:00:00 2001 From: Matt Ranostay Date: Thu, 27 Apr 2017 00:52:32 -0700 Subject: [PATCH 174/235] iio: proximity: as3935: fix AS3935_INT mask commit 275292d3a3d62670b1b13484707b74e5239b4bb0 upstream. AS3935 interrupt mask has been incorrect so valid lightning events would never trigger an buffer event. Also noise interrupt should be BIT(0). Fixes: 24ddb0e4bba4 ("iio: Add AS3935 lightning sensor support") Signed-off-by: Matt Ranostay Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/proximity/as3935.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iio/proximity/as3935.c b/drivers/iio/proximity/as3935.c index 020459513384..75ba7c7baa6f 100644 --- a/drivers/iio/proximity/as3935.c +++ b/drivers/iio/proximity/as3935.c @@ -40,9 +40,9 @@ #define AS3935_AFE_PWR_BIT BIT(0) #define AS3935_INT 0x03 -#define AS3935_INT_MASK 0x07 +#define AS3935_INT_MASK 0x0f #define AS3935_EVENT_INT BIT(3) -#define AS3935_NOISE_INT BIT(1) +#define AS3935_NOISE_INT BIT(0) #define AS3935_DATA 0x07 #define AS3935_DATA_MASK 0x3F From 716dd37398adf082a153e2362d7237d8fcb3616f Mon Sep 17 00:00:00 2001 From: Matt Ranostay Date: Thu, 4 May 2017 17:32:19 -0700 Subject: [PATCH 175/235] iio: proximity: as3935: fix iio_trigger_poll issue commit 9122b54f266ddee09654fe3fbc503c1a60f4a01c upstream. Using iio_trigger_poll() can oops when multiple interrupts happen before the first is handled. Use iio_trigger_poll_chained() instead and use the timestamp when processed, since it will be in theory be 2 ms max latency. Fixes: 24ddb0e4bba4 ("iio: Add AS3935 lightning sensor support") Signed-off-by: Matt Ranostay Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/proximity/as3935.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iio/proximity/as3935.c b/drivers/iio/proximity/as3935.c index 75ba7c7baa6f..268210ea4990 100644 --- a/drivers/iio/proximity/as3935.c +++ b/drivers/iio/proximity/as3935.c @@ -215,7 +215,7 @@ static irqreturn_t as3935_trigger_handler(int irq, void *private) st->buffer[0] = val & AS3935_DATA_MASK; iio_push_to_buffers_with_timestamp(indio_dev, &st->buffer, - pf->timestamp); + iio_get_time_ns(indio_dev)); err_read: iio_trigger_notify_done(indio_dev->trig); @@ -244,7 +244,7 @@ static void as3935_event_work(struct work_struct *work) switch (val) { case AS3935_EVENT_INT: - iio_trigger_poll(st->trig); + iio_trigger_poll_chained(st->trig); break; case AS3935_NOISE_INT: dev_warn(&st->spi->dev, "noise level is too high\n"); From acd8f917396071e9c13d2136b511b0ac4969cda8 Mon Sep 17 00:00:00 2001 From: Pratyush Anand Date: Mon, 29 May 2017 22:08:24 +0300 Subject: [PATCH 176/235] mei: make sysfs modalias format similar as uevent modalias commit 6f9193ec044a8f72d8b6ae94a5c4ab6e8b0f00ca upstream. modprobe is not able to resolve sysfs modalias for mei devices. # cat /sys/class/watchdog/watchdog0/device/watchdog/watchdog0/device/modalias mei::05b79a6f-4628-4d7f-899d-a91514cb32ab: # modprobe --set-version 4.9.6-200.fc25.x86_64 -R mei::05b79a6f-4628-4d7f-899d-a91514cb32ab: modprobe: FATAL: Module mei::05b79a6f-4628-4d7f-899d-a91514cb32ab: not found in directory /lib/modules/4.9.6-200.fc25.x86_64 # cat /lib/modules/4.9.6-200.fc25.x86_64/modules.alias | grep 05b79a6f-4628-4d7f-899d-a91514cb32ab alias mei:*:05b79a6f-4628-4d7f-899d-a91514cb32ab:*:* mei_wdt commit b26864cad1c9 ("mei: bus: add client protocol version to the device alias"), however sysfs modalias is still in formmat mei:S:uuid:*. This patch equates format of uevent and sysfs modalias so that modprobe is able to resolve the aliases. Fixes: commit b26864cad1c9 ("mei: bus: add client protocol version to the device alias") Signed-off-by: Pratyush Anand Signed-off-by: Tomas Winkler Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mei/bus.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/misc/mei/bus.c b/drivers/misc/mei/bus.c index dbe676de7a19..0c98ed44df05 100644 --- a/drivers/misc/mei/bus.c +++ b/drivers/misc/mei/bus.c @@ -678,8 +678,10 @@ static ssize_t modalias_show(struct device *dev, struct device_attribute *a, { struct mei_cl_device *cldev = to_mei_cl_device(dev); const uuid_le *uuid = mei_me_cl_uuid(cldev->me_cl); + u8 version = mei_me_cl_ver(cldev->me_cl); - return scnprintf(buf, PAGE_SIZE, "mei:%s:%pUl:", cldev->name, uuid); + return scnprintf(buf, PAGE_SIZE, "mei:%s:%pUl:%02X:", + cldev->name, uuid, version); } static DEVICE_ATTR_RO(modalias); From 96d7b43b42ba55c55da2e45c9926f105565ab2d8 Mon Sep 17 00:00:00 2001 From: David Arcari Date: Fri, 26 May 2017 11:37:31 -0400 Subject: [PATCH 177/235] cpufreq: cpufreq_register_driver() should return -ENODEV if init fails commit 6c77003677d5f1ce15f26d24360cb66c0bc07bb3 upstream. For a driver that does not set the CPUFREQ_STICKY flag, if all of the ->init() calls fail, cpufreq_register_driver() should return an error. This will prevent the driver from loading. Fixes: ce1bcfe94db8 (cpufreq: check cpufreq_policy_list instead of scanning policies for all CPUs) Signed-off-by: David Arcari Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki Signed-off-by: Greg Kroah-Hartman --- drivers/cpufreq/cpufreq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 6153b66139d5..286d4d61bd0b 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2474,6 +2474,7 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) if (!(cpufreq_driver->flags & CPUFREQ_STICKY) && list_empty(&cpufreq_policy_list)) { /* if all ->init() calls failed, unregister */ + ret = -ENODEV; pr_debug("%s: No CPU initialized for driver %s\n", __func__, driver_data->name); goto err_if_unreg; From 974a4eb16e8442cd2d8c856c799a9043d86aba05 Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Thu, 11 May 2017 01:07:24 -0700 Subject: [PATCH 178/235] target: Re-add check to reject control WRITEs with overflow data commit 4ff83daa0200affe1894bd33d17bac404e3d78d4 upstream. During v4.3 when the overflow/underflow check was relaxed by commit c72c525022: commit c72c5250224d475614a00c1d7e54a67f77cd3410 Author: Roland Dreier Date: Wed Jul 22 15:08:18 2015 -0700 target: allow underflow/overflow for PR OUT etc. commands to allow underflow/overflow for Windows compliance + FCP, a consequence was to allow control CDBs to process overflow data for iscsi-target with immediate data as well. As per Roland's original change, continue to allow underflow cases for control CDBs to make Windows compliance + FCP happy, but until overflow for control CDBs is supported tree-wide, explicitly reject all control WRITEs with overflow following pre v4.3.y logic. Reported-by: Bart Van Assche Cc: Roland Dreier Signed-off-by: Nicholas Bellinger Signed-off-by: Greg Kroah-Hartman --- drivers/target/target_core_transport.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index cae4dea6464e..077344cc819f 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -1182,15 +1182,28 @@ target_cmd_size_check(struct se_cmd *cmd, unsigned int size) if (cmd->unknown_data_length) { cmd->data_length = size; } else if (size != cmd->data_length) { - pr_warn("TARGET_CORE[%s]: Expected Transfer Length:" + pr_warn_ratelimited("TARGET_CORE[%s]: Expected Transfer Length:" " %u does not match SCSI CDB Length: %u for SAM Opcode:" " 0x%02x\n", cmd->se_tfo->get_fabric_name(), cmd->data_length, size, cmd->t_task_cdb[0]); - if (cmd->data_direction == DMA_TO_DEVICE && - cmd->se_cmd_flags & SCF_SCSI_DATA_CDB) { - pr_err("Rejecting underflow/overflow WRITE data\n"); - return TCM_INVALID_CDB_FIELD; + if (cmd->data_direction == DMA_TO_DEVICE) { + if (cmd->se_cmd_flags & SCF_SCSI_DATA_CDB) { + pr_err_ratelimited("Rejecting underflow/overflow" + " for WRITE data CDB\n"); + return TCM_INVALID_CDB_FIELD; + } + /* + * Some fabric drivers like iscsi-target still expect to + * always reject overflow writes. Reject this case until + * full fabric driver level support for overflow writes + * is introduced tree-wide. + */ + if (size > cmd->data_length) { + pr_err_ratelimited("Rejecting overflow for" + " WRITE control CDB\n"); + return TCM_INVALID_CDB_FIELD; + } } /* * Reject READ_* or WRITE_* with overflow/underflow for From b3a42bb630bd52c583f4cd0c11f2948eb766b16e Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Wed, 12 Apr 2017 12:11:58 -0700 Subject: [PATCH 179/235] drm/msm: Expose our reservation object when exporting a dmabuf. commit 43523eba79bda8f5b4c27f8ffe20ea078d20113a upstream. Without this, polling on the dma-buf (and presumably other devices synchronizing against our rendering) would return immediately, even while the BO was busy. Signed-off-by: Eric Anholt Reviewed-by: Daniel Vetter Cc: Rob Clark Cc: linux-arm-msm@vger.kernel.org Cc: freedreno@lists.freedesktop.org Reviewed-by: Rob Clark Signed-off-by: Rob Clark Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/msm/msm_drv.c | 1 + drivers/gpu/drm/msm/msm_drv.h | 1 + drivers/gpu/drm/msm/msm_gem_prime.c | 7 +++++++ 3 files changed, 9 insertions(+) diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index 46568fc80848..6abf315fd6da 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -801,6 +801,7 @@ static struct drm_driver msm_driver = { .prime_fd_to_handle = drm_gem_prime_fd_to_handle, .gem_prime_export = drm_gem_prime_export, .gem_prime_import = drm_gem_prime_import, + .gem_prime_res_obj = msm_gem_prime_res_obj, .gem_prime_pin = msm_gem_prime_pin, .gem_prime_unpin = msm_gem_prime_unpin, .gem_prime_get_sg_table = msm_gem_prime_get_sg_table, diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h index d0da52f2a806..bc98d48c47f8 100644 --- a/drivers/gpu/drm/msm/msm_drv.h +++ b/drivers/gpu/drm/msm/msm_drv.h @@ -203,6 +203,7 @@ struct sg_table *msm_gem_prime_get_sg_table(struct drm_gem_object *obj); void *msm_gem_prime_vmap(struct drm_gem_object *obj); void msm_gem_prime_vunmap(struct drm_gem_object *obj, void *vaddr); int msm_gem_prime_mmap(struct drm_gem_object *obj, struct vm_area_struct *vma); +struct reservation_object *msm_gem_prime_res_obj(struct drm_gem_object *obj); struct drm_gem_object *msm_gem_prime_import_sg_table(struct drm_device *dev, struct dma_buf_attachment *attach, struct sg_table *sg); int msm_gem_prime_pin(struct drm_gem_object *obj); diff --git a/drivers/gpu/drm/msm/msm_gem_prime.c b/drivers/gpu/drm/msm/msm_gem_prime.c index 60bb290700ce..13403c6da6c7 100644 --- a/drivers/gpu/drm/msm/msm_gem_prime.c +++ b/drivers/gpu/drm/msm/msm_gem_prime.c @@ -70,3 +70,10 @@ void msm_gem_prime_unpin(struct drm_gem_object *obj) if (!obj->import_attach) msm_gem_put_pages(obj); } + +struct reservation_object *msm_gem_prime_res_obj(struct drm_gem_object *obj) +{ + struct msm_gem_object *msm_obj = to_msm_bo(obj); + + return msm_obj->resv; +} From b59ec7072c84c09461d98376a3358879d0034098 Mon Sep 17 00:00:00 2001 From: Sui Chen Date: Tue, 9 May 2017 07:47:22 -0500 Subject: [PATCH 180/235] ahci: Acer SA5-271 SSD Not Detected Fix commit 8bfd174312629866efa535193d9e563768ff4307 upstream. (Correction in this resend: fixed function name acer_sa5_271_workaround; fixed the always-true condition in the function; fixed description.) On the Acer Switch Alpha 12 (model number: SA5-271), the internal SSD may not get detected because the port_map and CAP.nr_ports combination causes the driver to skip the port that is actually connected to the SSD. More specifically, either all SATA ports are identified as DUMMY, or all ports get ``link down'' and never get up again. This problem occurs occasionally. When this problem occurs, CAP may hold a value of 0xC734FF00 or 0xC734FF01 and port_map may hold a value of 0x00 or 0x01. When this problem does not occur, CAP holds a value of 0xC734FF02 and port_map may hold a value of 0x07. Overriding the CAP value to 0xC734FF02 and port_map to 0x7 significantly reduces the occurrence of this problem. Link: https://bugzilla.kernel.org/attachment.cgi?id=253091 Signed-off-by: Sui Chen Tested-by: Damian Ivanov Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- drivers/ata/ahci.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index 74f4c662f776..c94038206c3a 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -1362,6 +1362,40 @@ static inline void ahci_gtf_filter_workaround(struct ata_host *host) {} #endif +/* + * On the Acer Aspire Switch Alpha 12, sometimes all SATA ports are detected + * as DUMMY, or detected but eventually get a "link down" and never get up + * again. When this happens, CAP.NP may hold a value of 0x00 or 0x01, and the + * port_map may hold a value of 0x00. + * + * Overriding CAP.NP to 0x02 and the port_map to 0x7 will reveal all 3 ports + * and can significantly reduce the occurrence of the problem. + * + * https://bugzilla.kernel.org/show_bug.cgi?id=189471 + */ +static void acer_sa5_271_workaround(struct ahci_host_priv *hpriv, + struct pci_dev *pdev) +{ + static const struct dmi_system_id sysids[] = { + { + .ident = "Acer Switch Alpha 12", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Acer"), + DMI_MATCH(DMI_PRODUCT_NAME, "Switch SA5-271") + }, + }, + { } + }; + + if (dmi_check_system(sysids)) { + dev_info(&pdev->dev, "enabling Acer Switch Alpha 12 workaround\n"); + if ((hpriv->saved_cap & 0xC734FF00) == 0xC734FF00) { + hpriv->port_map = 0x7; + hpriv->cap = 0xC734FF02; + } + } +} + #ifdef CONFIG_ARM64 /* * Due to ERRATA#22536, ThunderX needs to handle HOST_IRQ_STAT differently. @@ -1597,6 +1631,10 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) "online status unreliable, applying workaround\n"); } + + /* Acer SA5-271 workaround modifies private_data */ + acer_sa5_271_workaround(hpriv, pdev); + /* CAP.NP sometimes indicate the index of the last enabled * port, at other times, that of the last possible port, so * determining the maximum port number requires looking at From dff4c8bb1397337bc7663447d7f6ccbb3a52f8d9 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 15 May 2017 09:34:06 -0400 Subject: [PATCH 181/235] cgroup: Prevent kill_css() from being called more than once commit 33c35aa4817864e056fd772230b0c6b552e36ea2 upstream. The kill_css() function may be called more than once under the condition that the css was killed but not physically removed yet followed by the removal of the cgroup that is hosting the css. This patch prevents any harmm from being done when that happens. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- include/linux/cgroup-defs.h | 1 + kernel/cgroup.c | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 5b17de62c962..6fb1c34cf805 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -46,6 +46,7 @@ enum { CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */ CSS_RELEASED = (1 << 2), /* refcnt reached zero, released */ CSS_VISIBLE = (1 << 3), /* css is visible to userland */ + CSS_DYING = (1 << 4), /* css is dying */ }; /* bits in struct cgroup flags field */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a3d2aad2443f..1fde8eec9529 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5407,6 +5407,11 @@ static void kill_css(struct cgroup_subsys_state *css) { lockdep_assert_held(&cgroup_mutex); + if (css->flags & CSS_DYING) + return; + + css->flags |= CSS_DYING; + /* * This must happen before css is disassociated with its cgroup. * See seq_css() for details. From 5aa8f833ca7880d1f8a03516e6e5ccb45eca7019 Mon Sep 17 00:00:00 2001 From: Ulrik De Bie Date: Wed, 7 Jun 2017 10:30:57 -0700 Subject: [PATCH 182/235] Input: elantech - add Fujitsu Lifebook E546/E557 to force crc_enabled commit 47eb0c8b4d9eb6368941c6a9bb443f00847a46d7 upstream. The Lifebook E546 and E557 touchpad were also not functioning and worked after running: echo "1" > /sys/devices/platform/i8042/serio2/crc_enabled Add them to the list of machines that need this workaround. Signed-off-by: Ulrik De Bie Reviewed-by: Arjan Opmeer Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/mouse/elantech.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/input/mouse/elantech.c b/drivers/input/mouse/elantech.c index 7826994c45bf..cd834da5934a 100644 --- a/drivers/input/mouse/elantech.c +++ b/drivers/input/mouse/elantech.c @@ -1118,8 +1118,10 @@ static int elantech_get_resolution_v4(struct psmouse *psmouse, * Asus UX32VD 0x361f02 00, 15, 0e clickpad * Avatar AVIU-145A2 0x361f00 ? clickpad * Fujitsu LIFEBOOK E544 0x470f00 d0, 12, 09 2 hw buttons + * Fujitsu LIFEBOOK E546 0x470f00 50, 12, 09 2 hw buttons * Fujitsu LIFEBOOK E547 0x470f00 50, 12, 09 2 hw buttons * Fujitsu LIFEBOOK E554 0x570f01 40, 14, 0c 2 hw buttons + * Fujitsu LIFEBOOK E557 0x570f01 40, 14, 0c 2 hw buttons * Fujitsu T725 0x470f01 05, 12, 09 2 hw buttons * Fujitsu H730 0x570f00 c0, 14, 0c 3 hw buttons (**) * Gigabyte U2442 0x450f01 58, 17, 0c 2 hw buttons @@ -1524,6 +1526,13 @@ static const struct dmi_system_id elantech_dmi_force_crc_enabled[] = { DMI_MATCH(DMI_PRODUCT_NAME, "LIFEBOOK E544"), }, }, + { + /* Fujitsu LIFEBOOK E546 does not work with crc_enabled == 0 */ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU"), + DMI_MATCH(DMI_PRODUCT_NAME, "LIFEBOOK E546"), + }, + }, { /* Fujitsu LIFEBOOK E547 does not work with crc_enabled == 0 */ .matches = { @@ -1545,6 +1554,13 @@ static const struct dmi_system_id elantech_dmi_force_crc_enabled[] = { DMI_MATCH(DMI_PRODUCT_NAME, "LIFEBOOK E556"), }, }, + { + /* Fujitsu LIFEBOOK E557 does not work with crc_enabled == 0 */ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU"), + DMI_MATCH(DMI_PRODUCT_NAME, "LIFEBOOK E557"), + }, + }, { /* Fujitsu LIFEBOOK U745 does not work with crc_enabled == 0 */ .matches = { From 829a1cab22c4731489673e4a538a0c6d999af5ac Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 May 2017 12:03:48 -0400 Subject: [PATCH 183/235] cpuset: consider dying css as offline commit 41c25707d21716826e3c1f60967f5550610ec1c9 upstream. In most cases, a cgroup controller don't care about the liftimes of cgroups. For the controller, a css becomes online when ->css_online() is called on it and offline when ->css_offline() is called. However, cpuset is special in that the user interface it exposes cares whether certain cgroups exist or not. Combined with the RCU delay between cgroup removal and css offlining, this can lead to user visible behavior oddities where operations which should succeed after cgroup removals fail for some time period. The effects of cgroup removals are delayed when seen from userland. This patch adds css_is_dying() which tests whether offline is pending and updates is_cpuset_online() so that the function returns false also while offline is pending. This gets rid of the userland visible delays. Signed-off-by: Tejun Heo Reported-by: Daniel Jordan Link: http://lkml.kernel.org/r/327ca1f5-7957-fbb9-9e5f-9ba149d40ba2@oracle.com Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- include/linux/cgroup.h | 20 ++++++++++++++++++++ kernel/cpuset.c | 4 ++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 307ae63ef262..7620a8bc0493 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -343,6 +343,26 @@ static inline bool css_tryget_online(struct cgroup_subsys_state *css) return true; } +/** + * css_is_dying - test whether the specified css is dying + * @css: target css + * + * Test whether @css is in the process of offlining or already offline. In + * most cases, ->css_online() and ->css_offline() callbacks should be + * enough; however, the actual offline operations are RCU delayed and this + * test returns %true also when @css is scheduled to be offlined. + * + * This is useful, for example, when the use case requires synchronous + * behavior with respect to cgroup removal. cgroup removal schedules css + * offlining but the css can seem alive while the operation is being + * delayed. If the delay affects user visible semantics, this test can be + * used to resolve the situation. + */ +static inline bool css_is_dying(struct cgroup_subsys_state *css) +{ + return !(css->flags & CSS_NO_REF) && percpu_ref_is_dying(&css->refcnt); +} + /** * css_put - put a css reference * @css: target css diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 29f815d2ef7e..24d175d2b62d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -174,9 +174,9 @@ typedef enum { } cpuset_flagbits_t; /* convenient tests for these bits */ -static inline bool is_cpuset_online(const struct cpuset *cs) +static inline bool is_cpuset_online(struct cpuset *cs) { - return test_bit(CS_ONLINE, &cs->flags); + return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css); } static inline int is_cpu_exclusive(const struct cpuset *cs) From 61604a2626a3eb1c4f5617fda81d2ff90e409342 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 27 Feb 2017 14:28:32 -0800 Subject: [PATCH 184/235] fs: add i_blocksize() commit 93407472a21b82f39c955ea7787e5bc7da100642 upstream. Replace all 1 << inode->i_blkbits and (1 << inode->i_blkbits) in fs branch. This patch also fixes multiple checkpatch warnings: WARNING: Prefer 'unsigned int' to bare use of 'unsigned' Thanks to Andrew Morton for suggesting more appropriate function instead of macro. [geliangtang@gmail.com: truncate: use i_blocksize()] Link: http://lkml.kernel.org/r/9c8b2cd83c8f5653805d43debde9fa8817e02fc4.1484895804.git.geliangtang@gmail.com Link: http://lkml.kernel.org/r/1481319905-10126-1-git-send-email-fabf@skynet.be Signed-off-by: Fabian Frederick Signed-off-by: Geliang Tang Cc: Alexander Viro Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/block_dev.c | 2 +- fs/btrfs/file.c | 2 +- fs/buffer.c | 12 ++++++------ fs/ceph/addr.c | 2 +- fs/direct-io.c | 2 +- fs/ext4/inode.c | 8 ++++---- fs/ext4/mballoc.c | 2 +- fs/ext4/move_extent.c | 2 +- fs/iomap.c | 4 ++-- fs/jfs/super.c | 4 ++-- fs/mpage.c | 2 +- fs/nfsd/blocklayout.c | 6 +++--- fs/nilfs2/btnode.c | 2 +- fs/nilfs2/inode.c | 4 ++-- fs/nilfs2/mdt.c | 4 ++-- fs/nilfs2/segment.c | 2 +- fs/ocfs2/aops.c | 2 +- fs/ocfs2/file.c | 2 +- fs/orangefs/orangefs-utils.c | 4 ++-- fs/reiserfs/file.c | 2 +- fs/reiserfs/inode.c | 2 +- fs/stat.c | 2 +- fs/udf/inode.c | 2 +- fs/xfs/xfs_aops.c | 16 ++++++++-------- fs/xfs/xfs_file.c | 4 ++-- include/linux/fs.h | 5 +++++ mm/truncate.c | 2 +- 27 files changed, 54 insertions(+), 49 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 2924bddb4a94..07e46b786500 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -713,7 +713,7 @@ struct block_device *bdget(dev_t dev) bdev->bd_contains = NULL; bdev->bd_super = NULL; bdev->bd_inode = inode; - bdev->bd_block_size = (1 << inode->i_blkbits); + bdev->bd_block_size = i_blocksize(inode); bdev->bd_part_count = 0; bdev->bd_invalidated = 0; inode->i_mode = S_IFBLK; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 3a14c87d9c92..3286a6e47ff0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2842,7 +2842,7 @@ static long btrfs_fallocate(struct file *file, int mode, if (!ret) ret = btrfs_prealloc_file_range(inode, mode, range->start, - range->len, 1 << inode->i_blkbits, + range->len, i_blocksize(inode), offset + len, &alloc_hint); else btrfs_free_reserved_data_space(inode, range->start, diff --git a/fs/buffer.c b/fs/buffer.c index b205a629001d..5d8f496d624e 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2353,7 +2353,7 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, loff_t pos, loff_t *bytes) { struct inode *inode = mapping->host; - unsigned blocksize = 1 << inode->i_blkbits; + unsigned int blocksize = i_blocksize(inode); struct page *page; void *fsdata; pgoff_t index, curidx; @@ -2433,8 +2433,8 @@ int cont_write_begin(struct file *file, struct address_space *mapping, get_block_t *get_block, loff_t *bytes) { struct inode *inode = mapping->host; - unsigned blocksize = 1 << inode->i_blkbits; - unsigned zerofrom; + unsigned int blocksize = i_blocksize(inode); + unsigned int zerofrom; int err; err = cont_expand_zero(file, mapping, pos, bytes); @@ -2796,7 +2796,7 @@ int nobh_truncate_page(struct address_space *mapping, struct buffer_head map_bh; int err; - blocksize = 1 << inode->i_blkbits; + blocksize = i_blocksize(inode); length = offset & (blocksize - 1); /* Block boundary? Nothing to do */ @@ -2874,7 +2874,7 @@ int block_truncate_page(struct address_space *mapping, struct buffer_head *bh; int err; - blocksize = 1 << inode->i_blkbits; + blocksize = i_blocksize(inode); length = offset & (blocksize - 1); /* Block boundary? Nothing to do */ @@ -2986,7 +2986,7 @@ sector_t generic_block_bmap(struct address_space *mapping, sector_t block, struct inode *inode = mapping->host; tmp.b_state = 0; tmp.b_blocknr = 0; - tmp.b_size = 1 << inode->i_blkbits; + tmp.b_size = i_blocksize(inode); get_block(inode, block, &tmp, 0); return tmp.b_blocknr; } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 18dc18f8af2c..900ffafb85ca 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -745,7 +745,7 @@ static int ceph_writepages_start(struct address_space *mapping, struct pagevec pvec; int done = 0; int rc = 0; - unsigned wsize = 1 << inode->i_blkbits; + unsigned int wsize = i_blocksize(inode); struct ceph_osd_request *req = NULL; int do_sync = 0; loff_t snap_size, i_size; diff --git a/fs/direct-io.c b/fs/direct-io.c index fb9aa16a7727..c60756e89833 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -587,7 +587,7 @@ static int dio_set_defer_completion(struct dio *dio) /* * Call into the fs to map some more disk blocks. We record the current number * of available blocks at sdio->blocks_available. These are in units of the - * fs blocksize, (1 << inode->i_blkbits). + * fs blocksize, i_blocksize(inode). * * The fs is allowed to map lots of blocks at once. If it wants to do that, * it uses the passed inode-relative block number as the file offset, as usual. diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8a34d8c0c05f..1b29efcab3dc 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2205,7 +2205,7 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, { struct inode *inode = mpd->inode; int err; - ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1) + ext4_lblk_t blocks = (i_size_read(inode) + i_blocksize(inode) - 1) >> inode->i_blkbits; do { @@ -3454,14 +3454,14 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter) * writes need zeroing either because they can race with page * faults or because they use partial blocks. */ - if (round_down(offset, 1<i_blkbits) >= inode->i_size && + if (round_down(offset, i_blocksize(inode)) >= inode->i_size && ext4_aligned_io(inode, offset, count)) get_block_func = ext4_dio_get_block; else get_block_func = ext4_dax_get_block; dio_flags = DIO_LOCKING; } else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || - round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) { + round_down(offset, i_blocksize(inode)) >= inode->i_size) { get_block_func = ext4_dio_get_block; dio_flags = DIO_LOCKING | DIO_SKIP_HOLES; } else if (is_sync_kiocb(iocb)) { @@ -5048,7 +5048,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) * do. We do the check mainly to optimize the common PAGE_SIZE == * blocksize case */ - if (offset > PAGE_SIZE - (1 << inode->i_blkbits)) + if (offset > PAGE_SIZE - i_blocksize(inode)) return; while (1) { page = find_lock_page(inode->i_mapping, diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 2e9fc7a61048..846b57ff58de 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -838,7 +838,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) inode = page->mapping->host; sb = inode->i_sb; ngroups = ext4_get_groups_count(sb); - blocksize = 1 << inode->i_blkbits; + blocksize = i_blocksize(inode); blocks_per_page = PAGE_SIZE / blocksize; groups_per_page = blocks_per_page >> 1; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 6fc14def0c70..578f8c33fb44 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -187,7 +187,7 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to) if (PageUptodate(page)) return 0; - blocksize = 1 << inode->i_blkbits; + blocksize = i_blocksize(inode); if (!page_has_buffers(page)) create_empty_buffers(page, blocksize, 0); diff --git a/fs/iomap.c b/fs/iomap.c index 814ae8f9587d..798c291cbc75 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -419,8 +419,8 @@ int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, struct iomap_ops *ops) { - unsigned blocksize = (1 << inode->i_blkbits); - unsigned off = pos & (blocksize - 1); + unsigned int blocksize = i_blocksize(inode); + unsigned int off = pos & (blocksize - 1); /* Block boundary? Nothing to do */ if (!off) diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 85671f7f8518..14be95bc0bcd 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -758,7 +758,7 @@ static ssize_t jfs_quota_read(struct super_block *sb, int type, char *data, sb->s_blocksize - offset : toread; tmp_bh.b_state = 0; - tmp_bh.b_size = 1 << inode->i_blkbits; + tmp_bh.b_size = i_blocksize(inode); err = jfs_get_block(inode, blk, &tmp_bh, 0); if (err) return err; @@ -798,7 +798,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type, sb->s_blocksize - offset : towrite; tmp_bh.b_state = 0; - tmp_bh.b_size = 1 << inode->i_blkbits; + tmp_bh.b_size = i_blocksize(inode); err = jfs_get_block(inode, blk, &tmp_bh, 1); if (err) goto out; diff --git a/fs/mpage.c b/fs/mpage.c index d2413af0823a..d2fcb149720d 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -115,7 +115,7 @@ map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block) SetPageUptodate(page); return; } - create_empty_buffers(page, 1 << inode->i_blkbits, 0); + create_empty_buffers(page, i_blocksize(inode), 0); } head = page_buffers(page); page_bh = head; diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 0780ff864539..3e396dbb1eb9 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -23,7 +23,7 @@ nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp, { struct nfsd4_layout_seg *seg = &args->lg_seg; struct super_block *sb = inode->i_sb; - u32 block_size = (1 << inode->i_blkbits); + u32 block_size = i_blocksize(inode); struct pnfs_block_extent *bex; struct iomap iomap; u32 device_generation = 0; @@ -180,7 +180,7 @@ nfsd4_block_proc_layoutcommit(struct inode *inode, int nr_iomaps; nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout, - lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits); + lcp->lc_up_len, &iomaps, i_blocksize(inode)); if (nr_iomaps < 0) return nfserrno(nr_iomaps); @@ -372,7 +372,7 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, int nr_iomaps; nr_iomaps = nfsd4_scsi_decode_layoutupdate(lcp->lc_up_layout, - lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits); + lcp->lc_up_len, &iomaps, i_blocksize(inode)); if (nr_iomaps < 0) return nfserrno(nr_iomaps); diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index d5c23da43513..c21e0b4454a6 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -50,7 +50,7 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr) brelse(bh); BUG(); } - memset(bh->b_data, 0, 1 << inode->i_blkbits); + memset(bh->b_data, 0, i_blocksize(inode)); bh->b_bdev = inode->i_sb->s_bdev; bh->b_blocknr = blocknr; set_buffer_mapped(bh); diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index c7f4fef9ebf5..7ffe71a8dfb9 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -51,7 +51,7 @@ void nilfs_inode_add_blocks(struct inode *inode, int n) { struct nilfs_root *root = NILFS_I(inode)->i_root; - inode_add_bytes(inode, (1 << inode->i_blkbits) * n); + inode_add_bytes(inode, i_blocksize(inode) * n); if (root) atomic64_add(n, &root->blocks_count); } @@ -60,7 +60,7 @@ void nilfs_inode_sub_blocks(struct inode *inode, int n) { struct nilfs_root *root = NILFS_I(inode)->i_root; - inode_sub_bytes(inode, (1 << inode->i_blkbits) * n); + inode_sub_bytes(inode, i_blocksize(inode) * n); if (root) atomic64_sub(n, &root->blocks_count); } diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index d56d3a5bea88..98835ed6bef4 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -57,7 +57,7 @@ nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block, set_buffer_mapped(bh); kaddr = kmap_atomic(bh->b_page); - memset(kaddr + bh_offset(bh), 0, 1 << inode->i_blkbits); + memset(kaddr + bh_offset(bh), 0, i_blocksize(inode)); if (init_block) init_block(inode, bh, kaddr); flush_dcache_page(bh->b_page); @@ -501,7 +501,7 @@ void nilfs_mdt_set_entry_size(struct inode *inode, unsigned int entry_size, struct nilfs_mdt_info *mi = NILFS_MDT(inode); mi->mi_entry_size = entry_size; - mi->mi_entries_per_block = (1 << inode->i_blkbits) / entry_size; + mi->mi_entries_per_block = i_blocksize(inode) / entry_size; mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size); } diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index bedcae2c28e6..7d18d62e8e07 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -723,7 +723,7 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, lock_page(page); if (!page_has_buffers(page)) - create_empty_buffers(page, 1 << inode->i_blkbits, 0); + create_empty_buffers(page, i_blocksize(inode), 0); unlock_page(page); bh = head = page_buffers(page); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index c5c5b9748ea3..f2961b13e8c5 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -599,7 +599,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, int ret = 0; struct buffer_head *head, *bh, *wait[2], **wait_bh = wait; unsigned int block_end, block_start; - unsigned int bsize = 1 << inode->i_blkbits; + unsigned int bsize = i_blocksize(inode); if (!page_has_buffers(page)) create_empty_buffers(page, bsize, 0); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 000c234d7bbd..0db6f83fdea1 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -808,7 +808,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, /* We know that zero_from is block aligned */ for (block_start = zero_from; block_start < zero_to; block_start = block_end) { - block_end = block_start + (1 << inode->i_blkbits); + block_end = block_start + i_blocksize(inode); /* * block_start is block-aligned. Bump it by one to force diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index 06af81f71e10..9b96b99539d6 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -306,7 +306,7 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass) break; case S_IFDIR: inode->i_size = PAGE_SIZE; - orangefs_inode->blksize = (1 << inode->i_blkbits); + orangefs_inode->blksize = i_blocksize(inode); spin_lock(&inode->i_lock); inode_set_bytes(inode, inode->i_size); spin_unlock(&inode->i_lock); @@ -316,7 +316,7 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass) if (new) { inode->i_size = (loff_t)strlen(new_op-> downcall.resp.getattr.link_target); - orangefs_inode->blksize = (1 << inode->i_blkbits); + orangefs_inode->blksize = i_blocksize(inode); ret = strscpy(orangefs_inode->link_target, new_op->downcall.resp.getattr.link_target, ORANGEFS_NAME_MAX); diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 2f8c5c9bdaf6..b396eb09f288 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -189,7 +189,7 @@ int reiserfs_commit_page(struct inode *inode, struct page *page, int ret = 0; th.t_trans_id = 0; - blocksize = 1 << inode->i_blkbits; + blocksize = i_blocksize(inode); if (logit) { reiserfs_write_lock(s); diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 58b2dedb2a3a..bd4c727f4610 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -524,7 +524,7 @@ static int reiserfs_get_blocks_direct_io(struct inode *inode, * referenced in convert_tail_for_hole() that may be called from * reiserfs_get_block() */ - bh_result->b_size = (1 << inode->i_blkbits); + bh_result->b_size = i_blocksize(inode); ret = reiserfs_get_block(inode, iblock, bh_result, create | GET_BLOCK_NO_DANGLE); diff --git a/fs/stat.c b/fs/stat.c index bc045c7994e1..293503202cac 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -31,7 +31,7 @@ void generic_fillattr(struct inode *inode, struct kstat *stat) stat->atime = inode->i_atime; stat->mtime = inode->i_mtime; stat->ctime = inode->i_ctime; - stat->blksize = (1 << inode->i_blkbits); + stat->blksize = i_blocksize(inode); stat->blocks = inode->i_blocks; } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index aad46401ede5..129b18a29c8f 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1214,7 +1214,7 @@ int udf_setsize(struct inode *inode, loff_t newsize) { int err; struct udf_inode_info *iinfo; - int bsize = 1 << inode->i_blkbits; + int bsize = i_blocksize(inode); if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 6df0a7ce3e8a..578981412615 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -108,9 +108,9 @@ xfs_finish_page_writeback( unsigned int bsize; ASSERT(bvec->bv_offset < PAGE_SIZE); - ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0); + ASSERT((bvec->bv_offset & (i_blocksize(inode) - 1)) == 0); ASSERT(end < PAGE_SIZE); - ASSERT((bvec->bv_len & ((1 << inode->i_blkbits) - 1)) == 0); + ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0); bh = head = page_buffers(bvec->bv_page); @@ -349,7 +349,7 @@ xfs_map_blocks( { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - ssize_t count = 1 << inode->i_blkbits; + ssize_t count = i_blocksize(inode); xfs_fileoff_t offset_fsb, end_fsb; int error = 0; int bmapi_flags = XFS_BMAPI_ENTIRE; @@ -759,7 +759,7 @@ xfs_aops_discard_page( break; } next_buffer: - offset += 1 << inode->i_blkbits; + offset += i_blocksize(inode); } while ((bh = bh->b_this_page) != head); @@ -847,7 +847,7 @@ xfs_writepage_map( LIST_HEAD(submit_list); struct xfs_ioend *ioend, *next; struct buffer_head *bh, *head; - ssize_t len = 1 << inode->i_blkbits; + ssize_t len = i_blocksize(inode); int error = 0; int count = 0; int uptodate = 1; @@ -1250,7 +1250,7 @@ xfs_map_trim_size( offset + mapping_size >= i_size_read(inode)) { /* limit mapping to block that spans EOF */ mapping_size = roundup_64(i_size_read(inode) - offset, - 1 << inode->i_blkbits); + i_blocksize(inode)); } if (mapping_size > LONG_MAX) mapping_size = LONG_MAX; @@ -1286,7 +1286,7 @@ __xfs_get_blocks( return -EIO; offset = (xfs_off_t)iblock << inode->i_blkbits; - ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); + ASSERT(bh_result->b_size >= i_blocksize(inode)); size = bh_result->b_size; if (!create && offset >= i_size_read(inode)) @@ -1634,7 +1634,7 @@ xfs_vm_set_page_dirty( if (offset < end_offset) set_buffer_dirty(bh); bh = bh->b_this_page; - offset += 1 << inode->i_blkbits; + offset += i_blocksize(inode); } while (bh != head); } /* diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index a90ec3fad69f..df206cfc21f7 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -823,7 +823,7 @@ xfs_file_fallocate( if (error) goto out_unlock; } else if (mode & FALLOC_FL_COLLAPSE_RANGE) { - unsigned blksize_mask = (1 << inode->i_blkbits) - 1; + unsigned int blksize_mask = i_blocksize(inode) - 1; if (offset & blksize_mask || len & blksize_mask) { error = -EINVAL; @@ -845,7 +845,7 @@ xfs_file_fallocate( if (error) goto out_unlock; } else if (mode & FALLOC_FL_INSERT_RANGE) { - unsigned blksize_mask = (1 << inode->i_blkbits) - 1; + unsigned int blksize_mask = i_blocksize(inode) - 1; new_size = i_size_read(inode) + len; if (offset & blksize_mask || len & blksize_mask) { diff --git a/include/linux/fs.h b/include/linux/fs.h index dc0478c07b2a..2f63d44368bd 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -705,6 +705,11 @@ struct inode { void *i_private; /* fs or device private pointer */ }; +static inline unsigned int i_blocksize(const struct inode *node) +{ + return (1 << node->i_blkbits); +} + static inline int inode_unhashed(struct inode *inode) { return hlist_unhashed(&inode->i_hash); diff --git a/mm/truncate.c b/mm/truncate.c index 8d8c62d89e6d..9c809e7d73c3 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -753,7 +753,7 @@ EXPORT_SYMBOL(truncate_setsize); */ void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to) { - int bsize = 1 << inode->i_blkbits; + int bsize = i_blocksize(inode); loff_t rounded_from; struct page *page; pgoff_t index; From 4896c87d246423b0c18e69594115af2db3d956f0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 8 Jun 2017 02:42:03 -0400 Subject: [PATCH 185/235] ufs: restore proper tail allocation commit 8785d84d002c2ce0f68fbcd6c2c86be859802c7e upstream. Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/ufs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 190d64be22ed..27f342dc0afb 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -284,7 +284,7 @@ ufs_inode_getfrag(struct inode *inode, unsigned index, goal += uspi->s_fpb; } tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment), - goal, uspi->s_fpb, err, locked_page); + goal, nfrags, err, locked_page); if (!tmp) { *err = -ENOSPC; From bf7bfef3eee3ce45a801678817c06476036f5c6d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 8 Jun 2017 18:15:18 -0400 Subject: [PATCH 186/235] fix ufs_isblockset() commit 414cf7186dbec29bd946c138d6b5c09da5955a08 upstream. Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/ufs/util.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/ufs/util.h b/fs/ufs/util.h index b7fbf53dbc81..398019fb1448 100644 --- a/fs/ufs/util.h +++ b/fs/ufs/util.h @@ -473,15 +473,19 @@ static inline unsigned _ubh_find_last_zero_bit_( static inline int _ubh_isblockset_(struct ufs_sb_private_info * uspi, struct ufs_buffer_head * ubh, unsigned begin, unsigned block) { + u8 mask; switch (uspi->s_fpb) { case 8: return (*ubh_get_addr (ubh, begin + block) == 0xff); case 4: - return (*ubh_get_addr (ubh, begin + (block >> 1)) == (0x0f << ((block & 0x01) << 2))); + mask = 0x0f << ((block & 0x01) << 2); + return (*ubh_get_addr (ubh, begin + (block >> 1)) & mask) == mask; case 2: - return (*ubh_get_addr (ubh, begin + (block >> 2)) == (0x03 << ((block & 0x03) << 1))); + mask = 0x03 << ((block & 0x03) << 1); + return (*ubh_get_addr (ubh, begin + (block >> 2)) & mask) == mask; case 1: - return (*ubh_get_addr (ubh, begin + (block >> 3)) == (0x01 << (block & 0x07))); + mask = 0x01 << (block & 0x07); + return (*ubh_get_addr (ubh, begin + (block >> 3)) & mask) == mask; } return 0; } From aed005fb79560cd84369461252112ce465c86d76 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 8 Jun 2017 21:15:03 -0400 Subject: [PATCH 187/235] ufs: restore maintaining ->i_blocks commit eb315d2ae614493fd1ebb026c75a80573d84f7ad upstream. Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/stat.c | 1 + fs/ufs/balloc.c | 26 +++++++++++++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/fs/stat.c b/fs/stat.c index 293503202cac..068fdbcc9e26 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -454,6 +454,7 @@ void __inode_add_bytes(struct inode *inode, loff_t bytes) inode->i_bytes -= 512; } } +EXPORT_SYMBOL(__inode_add_bytes); void inode_add_bytes(struct inode *inode, loff_t bytes) { diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 67e085d591d8..a81b97013021 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -81,7 +81,8 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) ufs_error (sb, "ufs_free_fragments", "bit already cleared for fragment %u", i); } - + + inode_sub_bytes(inode, count << uspi->s_fshift); fs32_add(sb, &ucg->cg_cs.cs_nffree, count); uspi->cs_total.cs_nffree += count; fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); @@ -183,6 +184,7 @@ do_more: ufs_error(sb, "ufs_free_blocks", "freeing free fragment"); } ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); + inode_sub_bytes(inode, uspi->s_fpb << uspi->s_fshift); if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) ufs_clusteracct (sb, ucpi, blkno, 1); @@ -494,6 +496,20 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, return 0; } +static bool try_add_frags(struct inode *inode, unsigned frags) +{ + unsigned size = frags * i_blocksize(inode); + spin_lock(&inode->i_lock); + __inode_add_bytes(inode, size); + if (unlikely((u32)inode->i_blocks != inode->i_blocks)) { + __inode_sub_bytes(inode, size); + spin_unlock(&inode->i_lock); + return false; + } + spin_unlock(&inode->i_lock); + return true; +} + static u64 ufs_add_fragments(struct inode *inode, u64 fragment, unsigned oldcount, unsigned newcount) { @@ -530,6 +546,9 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment, for (i = oldcount; i < newcount; i++) if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i)) return 0; + + if (!try_add_frags(inode, count)) + return 0; /* * Block can be extended */ @@ -647,6 +666,7 @@ cg_found: ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i); i = uspi->s_fpb - count; + inode_sub_bytes(inode, i << uspi->s_fshift); fs32_add(sb, &ucg->cg_cs.cs_nffree, i); uspi->cs_total.cs_nffree += i; fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, i); @@ -657,6 +677,8 @@ cg_found: result = ufs_bitmap_search (sb, ucpi, goal, allocsize); if (result == INVBLOCK) return 0; + if (!try_add_frags(inode, count)) + return 0; for (i = 0; i < count; i++) ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i); @@ -716,6 +738,8 @@ norot: return INVBLOCK; ucpi->c_rotor = result; gotit: + if (!try_add_frags(inode, uspi->s_fpb)) + return 0; blkno = ufs_fragstoblks(result); ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) From 55a00f816bd7def1af57961170510edb929db40f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 8 Jun 2017 21:15:45 -0400 Subject: [PATCH 188/235] ufs: set correct ->s_maxsize commit 6b0d144fa758869bdd652c50aa41aaf601232550 upstream. Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/ufs/super.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/fs/ufs/super.c b/fs/ufs/super.c index f3469ad0fef2..351162ff1bfd 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -746,6 +746,23 @@ static void ufs_put_super(struct super_block *sb) return; } +static u64 ufs_max_bytes(struct super_block *sb) +{ + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + int bits = uspi->s_apbshift; + u64 res; + + if (bits > 21) + res = ~0ULL; + else + res = UFS_NDADDR + (1LL << bits) + (1LL << (2*bits)) + + (1LL << (3*bits)); + + if (res >= (MAX_LFS_FILESIZE >> uspi->s_bshift)) + return MAX_LFS_FILESIZE; + return res << uspi->s_bshift; +} + static int ufs_fill_super(struct super_block *sb, void *data, int silent) { struct ufs_sb_info * sbi; @@ -1211,6 +1228,7 @@ magic_found: "fast symlink size (%u)\n", uspi->s_maxsymlinklen); uspi->s_maxsymlinklen = maxsymlen; } + sb->s_maxbytes = ufs_max_bytes(sb); sb->s_max_links = UFS_LINK_MAX; inode = ufs_iget(sb, UFS_ROOTINO); From 3d4922b5bb5b8fdc4c57768312bab47450324a00 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 8 Jun 2017 23:27:12 -0400 Subject: [PATCH 189/235] ufs_extend_tail(): fix the braino in calling conventions of ufs_new_fragments() commit 940ef1a0ed939c2ca029fca715e25e7778ce1e34 upstream. ... and it really needs splitting into "new" and "extend" cases, but that's for later Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/ufs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 27f342dc0afb..27c994fb811a 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -235,7 +235,8 @@ ufs_extend_tail(struct inode *inode, u64 writes_to, p = ufs_get_direct_data_ptr(uspi, ufsi, block); tmp = ufs_new_fragments(inode, p, lastfrag, ufs_data_ptr_to_cpu(sb, p), - new_size, err, locked_page); + new_size - (lastfrag & uspi->s_fpbmask), err, + locked_page); return tmp != 0; } From 84bef90a454f818edd1abbe6b18d5238e6614971 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 8 Jun 2017 23:28:53 -0400 Subject: [PATCH 190/235] ufs_getfrag_block(): we only grab ->truncate_mutex on block creation path commit 006351ac8ead0d4a67dd3845e3ceffe650a23212 upstream. Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/ufs/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 27c994fb811a..a2760a2869f4 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -403,7 +403,9 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff if (!create) { phys64 = ufs_frag_map(inode, offsets, depth); - goto out; + if (phys64) + map_bh(bh_result, sb, phys64 + frag); + return 0; } /* This code entered only while writing ....? */ From 172c70d1cd82545fdef4717eb3fc8605af2ae749 Mon Sep 17 00:00:00 2001 From: Frederic Barrat Date: Tue, 6 Jun 2017 11:43:41 +0200 Subject: [PATCH 191/235] cxl: Fix error path on bad ioctl commit cec422c11caeeccae709e9942058b6b644ce434c upstream. Fix error path if we can't copy user structure on CXL_IOCTL_START_WORK ioctl. We shouldn't unlock the context status mutex as it was not locked (yet). Fixes: 0712dc7e73e5 ("cxl: Fix issues when unmapping contexts") Signed-off-by: Frederic Barrat Reviewed-by: Vaibhav Jain Reviewed-by: Andrew Donnellan Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- drivers/misc/cxl/file.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/misc/cxl/file.c b/drivers/misc/cxl/file.c index 77080cc5fa0a..afa211397048 100644 --- a/drivers/misc/cxl/file.c +++ b/drivers/misc/cxl/file.c @@ -155,11 +155,8 @@ static long afu_ioctl_start_work(struct cxl_context *ctx, /* Do this outside the status_mutex to avoid a circular dependency with * the locking in cxl_mmap_fault() */ - if (copy_from_user(&work, uwork, - sizeof(struct cxl_ioctl_start_work))) { - rc = -EFAULT; - goto out; - } + if (copy_from_user(&work, uwork, sizeof(work))) + return -EFAULT; mutex_lock(&ctx->status_mutex); if (ctx->status != OPENED) { From 3fd1233dabd5dc6d997b654552205d153f3caac4 Mon Sep 17 00:00:00 2001 From: Vaibhav Jain Date: Fri, 2 Jun 2017 22:26:48 +0530 Subject: [PATCH 192/235] cxl: Avoid double free_irq() for psl,slice interrupts commit b3aa20ba2ba8072b73bd799605b8c98927b7056c upstream. During an eeh call to cxl_remove can result in double free_irq of psl,slice interrupts. This can happen if perst_reloads_same_image == 1 and call to cxl_configure_adapter() fails during slot_reset callback. In such a case we see a kernel oops with following back-trace: Oops: Kernel access of bad area, sig: 11 [#1] Call Trace: free_irq+0x88/0xd0 (unreliable) cxl_unmap_irq+0x20/0x40 [cxl] cxl_native_release_psl_irq+0x78/0xd8 [cxl] pci_deconfigure_afu+0xac/0x110 [cxl] cxl_remove+0x104/0x210 [cxl] pci_device_remove+0x6c/0x110 device_release_driver_internal+0x204/0x2e0 pci_stop_bus_device+0xa0/0xd0 pci_stop_and_remove_bus_device+0x28/0x40 pci_hp_remove_devices+0xb0/0x150 pci_hp_remove_devices+0x68/0x150 eeh_handle_normal_event+0x140/0x580 eeh_handle_event+0x174/0x360 eeh_event_handler+0x1e8/0x1f0 This patch fixes the issue of double free_irq by checking that variables that hold the virqs (err_hwirq, serr_hwirq, psl_virq) are not '0' before un-mapping and resetting these variables to '0' when they are un-mapped. Signed-off-by: Vaibhav Jain Reviewed-by: Andrew Donnellan Acked-by: Frederic Barrat Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- drivers/misc/cxl/native.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c index a217a74ccc98..224c7103890c 100644 --- a/drivers/misc/cxl/native.c +++ b/drivers/misc/cxl/native.c @@ -1066,13 +1066,16 @@ int cxl_native_register_psl_err_irq(struct cxl *adapter) void cxl_native_release_psl_err_irq(struct cxl *adapter) { - if (adapter->native->err_virq != irq_find_mapping(NULL, adapter->native->err_hwirq)) + if (adapter->native->err_virq == 0 || + adapter->native->err_virq != + irq_find_mapping(NULL, adapter->native->err_hwirq)) return; cxl_p1_write(adapter, CXL_PSL_ErrIVTE, 0x0000000000000000); cxl_unmap_irq(adapter->native->err_virq, adapter); cxl_ops->release_one_irq(adapter, adapter->native->err_hwirq); kfree(adapter->irq_name); + adapter->native->err_virq = 0; } int cxl_native_register_serr_irq(struct cxl_afu *afu) @@ -1102,13 +1105,15 @@ int cxl_native_register_serr_irq(struct cxl_afu *afu) void cxl_native_release_serr_irq(struct cxl_afu *afu) { - if (afu->serr_virq != irq_find_mapping(NULL, afu->serr_hwirq)) + if (afu->serr_virq == 0 || + afu->serr_virq != irq_find_mapping(NULL, afu->serr_hwirq)) return; cxl_p1n_write(afu, CXL_PSL_SERR_An, 0x0000000000000000); cxl_unmap_irq(afu->serr_virq, afu); cxl_ops->release_one_irq(afu->adapter, afu->serr_hwirq); kfree(afu->err_irq_name); + afu->serr_virq = 0; } int cxl_native_register_psl_irq(struct cxl_afu *afu) @@ -1131,12 +1136,15 @@ int cxl_native_register_psl_irq(struct cxl_afu *afu) void cxl_native_release_psl_irq(struct cxl_afu *afu) { - if (afu->native->psl_virq != irq_find_mapping(NULL, afu->native->psl_hwirq)) + if (afu->native->psl_virq == 0 || + afu->native->psl_virq != + irq_find_mapping(NULL, afu->native->psl_hwirq)) return; cxl_unmap_irq(afu->native->psl_virq, afu); cxl_ops->release_one_irq(afu->adapter, afu->native->psl_hwirq); kfree(afu->psl_irq_name); + afu->native->psl_virq = 0; } static void recover_psl_err(struct cxl_afu *afu, u64 errstat) From 4d15ab90ec2bece7361b6afebebfbf83a381a529 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 12 May 2017 01:03:52 +0200 Subject: [PATCH 193/235] btrfs: use correct types for page indices in btrfs_page_exists_in_range commit cc2b702c52094b637a351d7491ac5200331d0445 upstream. Variables start_idx and end_idx are supposed to hold a page index derived from the file offsets. The int type is not the right one though, offsets larger than 1 << 44 will get silently trimmed off the high bits. (1 << 44 is 16TiB) What can go wrong, if start is below the boundary and end gets trimmed: - if there's a page after start, we'll find it (radix_tree_gang_lookup_slot) - the final check "if (page->index <= end_idx)" will unexpectedly fail The function will return false, ie. "there's no page in the range", although there is at least one. btrfs_page_exists_in_range is used to prevent races in: * in hole punching, where we make sure there are not pages in the truncated range, otherwise we'll wait for them to finish and redo truncation, but we're going to replace the pages with holes anyway so the only problem is the intermediate state * lock_extent_direct: we want to make sure there are no pages before we lock and start DIO, to prevent stale data reads For practical occurence of the bug, there are several constaints. The file must be quite large, the affected range must cross the 16TiB boundary and the internal state of the file pages and pending operations must match. Also, we must not have started any ordered data in the range, otherwise we don't even reach the buggy function check. DIO locking tries hard in several places to avoid deadlocks with buffered IO and avoids waiting for ranges. The worst consequence seems to be stale data read. CC: Liu Bo Fixes: fc4adbff823f7 ("btrfs: Drop EXTENT_UPTODATE check in hole punching and direct locking") Reviewed-by: Liu Bo Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index be4da91d880f..bddbae796941 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7435,8 +7435,8 @@ bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end) int found = false; void **pagep = NULL; struct page *page = NULL; - int start_idx; - int end_idx; + unsigned long start_idx; + unsigned long end_idx; start_idx = start >> PAGE_SHIFT; From 66d6448475c682099e4dd3c6b4ef6e94248c0b2e Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Wed, 17 May 2017 09:49:37 -0400 Subject: [PATCH 194/235] btrfs: fix memory leak in update_space_info failure path commit 896533a7da929136d0432713f02a3edffece2826 upstream. If we fail to add the space_info kobject, we'll leak the memory for the percpu counter. Fixes: 6ab0a2029c (btrfs: publish allocation data in sysfs) Signed-off-by: Jeff Mahoney Reviewed-by: Liu Bo Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/extent-tree.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5909ae8c6731..e46e7fbe1b34 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3984,6 +3984,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, info->space_info_kobj, "%s", alloc_name(found->flags)); if (ret) { + percpu_counter_destroy(&found->total_bytes_pinned); kfree(found); return ret; } From f75e09ebd3e414e3a1be5a11492702346d102489 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 5 Jun 2017 19:17:18 +0100 Subject: [PATCH 195/235] KVM: arm/arm64: Handle possible NULL stage2 pud when ageing pages commit d6dbdd3c8558cad3b6d74cc357b408622d122331 upstream. Under memory pressure, we start ageing pages, which amounts to parsing the page tables. Since we don't want to allocate any extra level, we pass NULL for our private allocation cache. Which means that stage2_get_pud() is allowed to fail. This results in the following splat: [ 1520.409577] Unable to handle kernel NULL pointer dereference at virtual address 00000008 [ 1520.417741] pgd = ffff810f52fef000 [ 1520.421201] [00000008] *pgd=0000010f636c5003, *pud=0000010f56f48003, *pmd=0000000000000000 [ 1520.429546] Internal error: Oops: 96000006 [#1] PREEMPT SMP [ 1520.435156] Modules linked in: [ 1520.438246] CPU: 15 PID: 53550 Comm: qemu-system-aar Tainted: G W 4.12.0-rc4-00027-g1885c397eaec #7205 [ 1520.448705] Hardware name: FOXCONN R2-1221R-A4/C2U4N_MB, BIOS G31FB12A 10/26/2016 [ 1520.463726] task: ffff800ac5fb4e00 task.stack: ffff800ce04e0000 [ 1520.469666] PC is at stage2_get_pmd+0x34/0x110 [ 1520.474119] LR is at kvm_age_hva_handler+0x44/0xf0 [ 1520.478917] pc : [] lr : [] pstate: 40000145 [ 1520.486325] sp : ffff800ce04e33d0 [ 1520.489644] x29: ffff800ce04e33d0 x28: 0000000ffff40064 [ 1520.494967] x27: 0000ffff27e00000 x26: 0000000000000000 [ 1520.500289] x25: ffff81051ba65008 x24: 0000ffff40065000 [ 1520.505618] x23: 0000ffff40064000 x22: 0000000000000000 [ 1520.510947] x21: ffff810f52b20000 x20: 0000000000000000 [ 1520.516274] x19: 0000000058264000 x18: 0000000000000000 [ 1520.521603] x17: 0000ffffa6fe7438 x16: ffff000008278b70 [ 1520.526940] x15: 000028ccd8000000 x14: 0000000000000008 [ 1520.532264] x13: ffff7e0018298000 x12: 0000000000000002 [ 1520.537582] x11: ffff000009241b93 x10: 0000000000000940 [ 1520.542908] x9 : ffff0000092ef800 x8 : 0000000000000200 [ 1520.548229] x7 : ffff800ce04e36a8 x6 : 0000000000000000 [ 1520.553552] x5 : 0000000000000001 x4 : 0000000000000000 [ 1520.558873] x3 : 0000000000000000 x2 : 0000000000000008 [ 1520.571696] x1 : ffff000008fd5000 x0 : ffff0000080b149c [ 1520.577039] Process qemu-system-aar (pid: 53550, stack limit = 0xffff800ce04e0000) [...] [ 1521.510735] [] stage2_get_pmd+0x34/0x110 [ 1521.516221] [] kvm_age_hva_handler+0x44/0xf0 [ 1521.522054] [] handle_hva_to_gpa+0xb8/0xe8 [ 1521.527716] [] kvm_age_hva+0x44/0xf0 [ 1521.532854] [] kvm_mmu_notifier_clear_flush_young+0x70/0xc0 [ 1521.539992] [] __mmu_notifier_clear_flush_young+0x88/0xd0 [ 1521.546958] [] page_referenced_one+0xf0/0x188 [ 1521.552881] [] rmap_walk_anon+0xec/0x250 [ 1521.558370] [] rmap_walk+0x78/0xa0 [ 1521.563337] [] page_referenced+0x164/0x180 [ 1521.569002] [] shrink_active_list+0x178/0x3b8 [ 1521.574922] [] shrink_node_memcg+0x328/0x600 [ 1521.580758] [] shrink_node+0xc4/0x328 [ 1521.585986] [] do_try_to_free_pages+0xc0/0x340 [ 1521.592000] [] try_to_free_pages+0xcc/0x240 [...] The trivial fix is to handle this NULL pud value early, rather than dereferencing it blindly. Signed-off-by: Marc Zyngier Reviewed-by: Christoffer Dall Signed-off-by: Christoffer Dall Signed-off-by: Greg Kroah-Hartman --- arch/arm/kvm/mmu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 2fd5c135e8a4..332ce3b5a34f 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -872,6 +872,9 @@ static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache pmd_t *pmd; pud = stage2_get_pud(kvm, cache, addr); + if (!pud) + return NULL; + if (stage2_pud_none(*pud)) { if (!cache) return NULL; From fe42472e53ebceec5fbb8611a4b4946c1a2e5c01 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Tue, 23 May 2017 16:50:47 +0200 Subject: [PATCH 196/235] scsi: qla2xxx: don't disable a not previously enabled PCI device commit ddff7ed45edce4a4c92949d3c61cd25d229c4a14 upstream. When pci_enable_device() or pci_enable_device_mem() fail in qla2x00_probe_one() we bail out but do a call to pci_disable_device(). This causes the dev_WARN_ON() in pci_disable_device() to trigger, as the device wasn't enabled previously. So instead of taking the 'probe_out' error path we can directly return *iff* one of the pci_enable_device() calls fails. Additionally rename the 'probe_out' goto label's name to the more descriptive 'disable_device'. Signed-off-by: Johannes Thumshirn Fixes: e315cd28b9ef ("[SCSI] qla2xxx: Code changes for qla data structure refactoring") Reviewed-by: Bart Van Assche Reviewed-by: Giridhar Malavali Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/qla2xxx/qla_os.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index f9b52a4b8c55..94630d4738e6 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -2420,10 +2420,10 @@ qla2x00_probe_one(struct pci_dev *pdev, const struct pci_device_id *id) if (mem_only) { if (pci_enable_device_mem(pdev)) - goto probe_out; + return ret; } else { if (pci_enable_device(pdev)) - goto probe_out; + return ret; } /* This may fail but that's ok */ @@ -2433,7 +2433,7 @@ qla2x00_probe_one(struct pci_dev *pdev, const struct pci_device_id *id) if (!ha) { ql_log_pci(ql_log_fatal, pdev, 0x0009, "Unable to allocate memory for ha.\n"); - goto probe_out; + goto disable_device; } ql_dbg_pci(ql_dbg_init, pdev, 0x000a, "Memory allocated for ha=%p.\n", ha); @@ -3039,7 +3039,7 @@ iospace_config_failed: kfree(ha); ha = NULL; -probe_out: +disable_device: pci_disable_device(pdev); return ret; } From ab2b484e80388726a1810f9525a078427f5da6d0 Mon Sep 17 00:00:00 2001 From: Joe Carnuccio Date: Wed, 24 May 2017 18:06:21 -0700 Subject: [PATCH 197/235] scsi: qla2xxx: Modify T262 FW dump template to specify same start/end to debug customer issues commit ce6c668b146cc4f4442111e2bcee4c3af94e1ddf upstream. Firmware dump allows for debugging customer issues. This patch fixes start/end pointer calculation to capture T262 template entry for dump tool. Signed-off-by: Joe Carnuccio Signed-off-by: Himanshu Madhani Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/qla2xxx/qla_tmpl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/qla2xxx/qla_tmpl.c b/drivers/scsi/qla2xxx/qla_tmpl.c index 36935c9ed669..9c2c7fe61280 100644 --- a/drivers/scsi/qla2xxx/qla_tmpl.c +++ b/drivers/scsi/qla2xxx/qla_tmpl.c @@ -371,7 +371,7 @@ qla27xx_fwdt_entry_t262(struct scsi_qla_host *vha, goto done; } - if (end <= start || start == 0 || end == 0) { + if (end < start || start == 0 || end == 0) { ql_dbg(ql_dbg_misc, vha, 0xd023, "%s: unusable range (start=%x end=%x)\n", __func__, ent->t262.end_addr, ent->t262.start_addr); From 64dc431432ec43c9b5da931dbcd2e55754988ea9 Mon Sep 17 00:00:00 2001 From: Joe Carnuccio Date: Wed, 24 May 2017 18:06:22 -0700 Subject: [PATCH 198/235] scsi: qla2xxx: Set bit 15 for DIAG_ECHO_TEST MBC commit 1d63496516c61e2e1351f10e6becbfc9ee511395 upstream. Set bit (BIT_15) to send right ECHO payload information for Diagnostic Echo Test command. Signed-off-by: Joe Carnuccio Signed-off-by: Himanshu Madhani Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/qla2xxx/qla_bsg.c | 9 +++++---- drivers/scsi/qla2xxx/qla_mbx.c | 4 ++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_bsg.c b/drivers/scsi/qla2xxx/qla_bsg.c index 643014f82f7d..4a6e086279f9 100644 --- a/drivers/scsi/qla2xxx/qla_bsg.c +++ b/drivers/scsi/qla2xxx/qla_bsg.c @@ -721,6 +721,8 @@ qla2x00_process_loopback(struct fc_bsg_job *bsg_job) return -EIO; } + memset(&elreq, 0, sizeof(elreq)); + elreq.req_sg_cnt = dma_map_sg(&ha->pdev->dev, bsg_job->request_payload.sg_list, bsg_job->request_payload.sg_cnt, DMA_TO_DEVICE); @@ -786,10 +788,9 @@ qla2x00_process_loopback(struct fc_bsg_job *bsg_job) if (atomic_read(&vha->loop_state) == LOOP_READY && (ha->current_topology == ISP_CFG_F || - ((IS_QLA81XX(ha) || IS_QLA8031(ha) || IS_QLA8044(ha)) && - le32_to_cpu(*(uint32_t *)req_data) == ELS_OPCODE_BYTE - && req_data_len == MAX_ELS_FRAME_PAYLOAD)) && - elreq.options == EXTERNAL_LOOPBACK) { + (le32_to_cpu(*(uint32_t *)req_data) == ELS_OPCODE_BYTE && + req_data_len == MAX_ELS_FRAME_PAYLOAD)) && + elreq.options == EXTERNAL_LOOPBACK) { type = "FC_BSG_HST_VENDOR_ECHO_DIAG"; ql_dbg(ql_dbg_user, vha, 0x701e, "BSG request type: %s.\n", type); diff --git a/drivers/scsi/qla2xxx/qla_mbx.c b/drivers/scsi/qla2xxx/qla_mbx.c index 23698c998699..a1b01d66c9ab 100644 --- a/drivers/scsi/qla2xxx/qla_mbx.c +++ b/drivers/scsi/qla2xxx/qla_mbx.c @@ -4783,9 +4783,9 @@ qla2x00_echo_test(scsi_qla_host_t *vha, struct msg_echo_lb *mreq, memset(mcp->mb, 0 , sizeof(mcp->mb)); mcp->mb[0] = MBC_DIAGNOSTIC_ECHO; - mcp->mb[1] = mreq->options | BIT_6; /* BIT_6 specifies 64bit address */ + /* BIT_6 specifies 64bit address */ + mcp->mb[1] = mreq->options | BIT_15 | BIT_6; if (IS_CNA_CAPABLE(ha)) { - mcp->mb[1] |= BIT_15; mcp->mb[2] = vha->fcoe_fcf_idx; } mcp->mb[16] = LSW(mreq->rcv_dma); From 59d9a40b5839d98e324cc78aab89d74bc4c7276f Mon Sep 17 00:00:00 2001 From: Joe Carnuccio Date: Wed, 24 May 2017 18:06:23 -0700 Subject: [PATCH 199/235] scsi: qla2xxx: Fix mailbox pointer error in fwdump capture commit 74939a0bc772d642b1c12827966c4c3a3c90ea2c upstream. Signed-off-by: Joe Carnuccio Signed-off-by: Himanshu Madhani Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/qla2xxx/qla_dbg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_dbg.c b/drivers/scsi/qla2xxx/qla_dbg.c index 45af34ddc432..658e4d15cb71 100644 --- a/drivers/scsi/qla2xxx/qla_dbg.c +++ b/drivers/scsi/qla2xxx/qla_dbg.c @@ -1131,7 +1131,7 @@ qla24xx_fw_dump(scsi_qla_host_t *vha, int hardware_locked) /* Mailbox registers. */ mbx_reg = ®->mailbox0; - for (cnt = 0; cnt < sizeof(fw->mailbox_reg) / 2; cnt++, dmp_reg++) + for (cnt = 0; cnt < sizeof(fw->mailbox_reg) / 2; cnt++, mbx_reg++) fw->mailbox_reg[cnt] = htons(RD_REG_WORD(mbx_reg)); /* Transfer sequence registers. */ @@ -2090,7 +2090,7 @@ qla83xx_fw_dump(scsi_qla_host_t *vha, int hardware_locked) /* Mailbox registers. */ mbx_reg = ®->mailbox0; - for (cnt = 0; cnt < sizeof(fw->mailbox_reg) / 2; cnt++, dmp_reg++) + for (cnt = 0; cnt < sizeof(fw->mailbox_reg) / 2; cnt++, mbx_reg++) fw->mailbox_reg[cnt] = htons(RD_REG_WORD(mbx_reg)); /* Transfer sequence registers. */ From bb0a300f18cef240373fc2820e2114331c5e9773 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 24 May 2017 10:01:55 +0200 Subject: [PATCH 200/235] powerpc/sysdev/simple_gpio: Fix oops in gpio save_regs function commit 6f553912eedafae13ff20b322a65e471fe7f5236 upstream. of_mm_gpiochip_add_data() generates an oops for NULL pointer dereference. of_mm_gpiochip_add_data() calls mm_gc->save_regs() before setting the data, therefore ->save_regs() cannot use gpiochip_get_data() Fixes: 937daafca774 ("powerpc: simple-gpio: use gpiochip data pointer") Signed-off-by: Christophe Leroy Reviewed-by: Linus Walleij Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/sysdev/simple_gpio.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/sysdev/simple_gpio.c b/arch/powerpc/sysdev/simple_gpio.c index ef470b470b04..6afddae2fb47 100644 --- a/arch/powerpc/sysdev/simple_gpio.c +++ b/arch/powerpc/sysdev/simple_gpio.c @@ -75,7 +75,8 @@ static int u8_gpio_dir_out(struct gpio_chip *gc, unsigned int gpio, int val) static void u8_gpio_save_regs(struct of_mm_gpio_chip *mm_gc) { - struct u8_gpio_chip *u8_gc = gpiochip_get_data(&mm_gc->gc); + struct u8_gpio_chip *u8_gc = + container_of(mm_gc, struct u8_gpio_chip, mm_gc); u8_gc->data = in_8(mm_gc->regs); } From b4624ff952ec7d268a9651cd9184a1995befc271 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 6 Jun 2017 20:23:57 +1000 Subject: [PATCH 201/235] powerpc/numa: Fix percpu allocations to be NUMA aware commit ba4a648f12f4cd0a8003dd229b6ca8a53348ee4b upstream. In commit 8c272261194d ("powerpc/numa: Enable USE_PERCPU_NUMA_NODE_ID"), we switched to the generic implementation of cpu_to_node(), which uses a percpu variable to hold the NUMA node for each CPU. Unfortunately we neglected to notice that we use cpu_to_node() in the allocation of our percpu areas, leading to a chicken and egg problem. In practice what happens is when we are setting up the percpu areas, cpu_to_node() reports that all CPUs are on node 0, so we allocate all percpu areas on node 0. This is visible in the dmesg output, as all pcpu allocs being in group 0: pcpu-alloc: [0] 00 01 02 03 [0] 04 05 06 07 pcpu-alloc: [0] 08 09 10 11 [0] 12 13 14 15 pcpu-alloc: [0] 16 17 18 19 [0] 20 21 22 23 pcpu-alloc: [0] 24 25 26 27 [0] 28 29 30 31 pcpu-alloc: [0] 32 33 34 35 [0] 36 37 38 39 pcpu-alloc: [0] 40 41 42 43 [0] 44 45 46 47 To fix it we need an early_cpu_to_node() which can run prior to percpu being setup. We already have the numa_cpu_lookup_table we can use, so just plumb it in. With the patch dmesg output shows two groups, 0 and 1: pcpu-alloc: [0] 00 01 02 03 [0] 04 05 06 07 pcpu-alloc: [0] 08 09 10 11 [0] 12 13 14 15 pcpu-alloc: [0] 16 17 18 19 [0] 20 21 22 23 pcpu-alloc: [1] 24 25 26 27 [1] 28 29 30 31 pcpu-alloc: [1] 32 33 34 35 [1] 36 37 38 39 pcpu-alloc: [1] 40 41 42 43 [1] 44 45 46 47 We can also check the data_offset in the paca of various CPUs, with the fix we see: CPU 0: data_offset = 0x0ffe8b0000 CPU 24: data_offset = 0x1ffe5b0000 And we can see from dmesg that CPU 24 has an allocation on node 1: node 0: [mem 0x0000000000000000-0x0000000fffffffff] node 1: [mem 0x0000001000000000-0x0000001fffffffff] Fixes: 8c272261194d ("powerpc/numa: Enable USE_PERCPU_NUMA_NODE_ID") Signed-off-by: Michael Ellerman Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/include/asm/topology.h | 14 ++++++++++++++ arch/powerpc/kernel/setup_64.c | 4 ++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index 8b3b46b7b0f2..329771559cbb 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -44,8 +44,22 @@ extern void __init dump_numa_cpu_topology(void); extern int sysfs_add_device_to_node(struct device *dev, int nid); extern void sysfs_remove_device_from_node(struct device *dev, int nid); +static inline int early_cpu_to_node(int cpu) +{ + int nid; + + nid = numa_cpu_lookup_table[cpu]; + + /* + * Fall back to node 0 if nid is unset (it should be, except bugs). + * This allows callers to safely do NODE_DATA(early_cpu_to_node(cpu)). + */ + return (nid < 0) ? 0 : nid; +} #else +static inline int early_cpu_to_node(int cpu) { return 0; } + static inline void dump_numa_cpu_topology(void) {} static inline int sysfs_add_device_to_node(struct device *dev, int nid) diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index a12be60181bf..ada71bee176d 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -595,7 +595,7 @@ void __init emergency_stack_init(void) static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) { - return __alloc_bootmem_node(NODE_DATA(cpu_to_node(cpu)), size, align, + return __alloc_bootmem_node(NODE_DATA(early_cpu_to_node(cpu)), size, align, __pa(MAX_DMA_ADDRESS)); } @@ -606,7 +606,7 @@ static void __init pcpu_fc_free(void *ptr, size_t size) static int pcpu_cpu_distance(unsigned int from, unsigned int to) { - if (cpu_to_node(from) == cpu_to_node(to)) + if (early_cpu_to_node(from) == early_cpu_to_node(to)) return LOCAL_DISTANCE; else return REMOTE_DISTANCE; From cbf687acc1e132bece665bc284d9abd32f1bdc11 Mon Sep 17 00:00:00 2001 From: Michael Bringmann Date: Mon, 22 May 2017 15:44:37 -0500 Subject: [PATCH 202/235] powerpc/hotplug-mem: Fix missing endian conversion of aa_index commit dc421b200f91930c9c6a9586810ff8c232cf10fc upstream. When adding or removing memory, the aa_index (affinity value) for the memblock must also be converted to match the endianness of the rest of the 'ibm,dynamic-memory' property. Otherwise, subsequent retrieval of the attribute will likely lead to non-existent nodes, followed by using the default node in the code inappropriately. Fixes: 5f97b2a0d176 ("powerpc/pseries: Implement memory hotplug add in the kernel") Signed-off-by: Michael Bringmann Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/platforms/pseries/hotplug-memory.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 76ec104e88be..c0a0947f43bb 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -124,6 +124,7 @@ static struct property *dlpar_clone_drconf_property(struct device_node *dn) for (i = 0; i < num_lmbs; i++) { lmbs[i].base_addr = be64_to_cpu(lmbs[i].base_addr); lmbs[i].drc_index = be32_to_cpu(lmbs[i].drc_index); + lmbs[i].aa_index = be32_to_cpu(lmbs[i].aa_index); lmbs[i].flags = be32_to_cpu(lmbs[i].flags); } @@ -147,6 +148,7 @@ static void dlpar_update_drconf_property(struct device_node *dn, for (i = 0; i < num_lmbs; i++) { lmbs[i].base_addr = cpu_to_be64(lmbs[i].base_addr); lmbs[i].drc_index = cpu_to_be32(lmbs[i].drc_index); + lmbs[i].aa_index = cpu_to_be32(lmbs[i].aa_index); lmbs[i].flags = cpu_to_be32(lmbs[i].flags); } From 2cfdf4fd3292d05eeca5d59307bf231b94df1d4b Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 2 Jun 2017 18:43:30 -0300 Subject: [PATCH 203/235] powerpc/kernel: Fix FP and vector register restoration commit 1195892c091a15cc862f4e202482a36adc924e12 upstream. Currently tsk->thread->load_vec and load_fp are not initialized during task creation, which can lead to garbage values in these variables (non-zero values). These variables will be checked later in restore_math() to validate if the FP and vector registers are being utilized. Since these values might be non-zero, the restore_math() will continue to save the FP and vectors even if they were never utilized by the userspace application. load_fp and load_vec counters will then overflow (they wrap at 255) and the FP and Altivec will be finally disabled, but before that condition is reached (counter overflow) several context switches will have restored FP and vector registers without need, causing a performance degradation. Fixes: 70fe3d980f5f ("powerpc: Restore FPU/VEC/VSX if previously used") Signed-off-by: Breno Leitao Signed-off-by: Gustavo Romero Acked-by: Anton Blanchard Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kernel/process.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index c7164739dc75..dc46ca1950c2 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1659,6 +1659,7 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp) #ifdef CONFIG_VSX current->thread.used_vsr = 0; #endif + current->thread.load_fp = 0; memset(¤t->thread.fp_state, 0, sizeof(current->thread.fp_state)); current->thread.fp_save_area = NULL; #ifdef CONFIG_ALTIVEC @@ -1667,6 +1668,7 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp) current->thread.vr_save_area = NULL; current->thread.vrsave = 0; current->thread.used_vr = 0; + current->thread.load_vec = 0; #endif /* CONFIG_ALTIVEC */ #ifdef CONFIG_SPE memset(current->thread.evr, 0, sizeof(current->thread.evr)); From 6e6d89e18e53062987b2d14223f34abf64046ab7 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 5 Jun 2017 11:40:59 -0300 Subject: [PATCH 204/235] powerpc/kernel: Initialize load_tm on task creation commit 7f22ced4377628074e2ac25f41a88f98eb3b03f1 upstream. Currently tsk->thread.load_tm is not initialized in the task creation and can contain garbage on a new task. This is an undesired behaviour, since it affects the timing to enable and disable the transactional memory laziness (disabling and enabling the MSR TM bit, which affects TM reclaim and recheckpoint in the scheduling process). Fixes: 5d176f751ee3 ("powerpc: tm: Enable transactional memory (TM) lazily for userspace") Signed-off-by: Breno Leitao Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kernel/process.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index dc46ca1950c2..b249c2fb99c8 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1680,6 +1680,7 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp) current->thread.tm_tfhar = 0; current->thread.tm_texasr = 0; current->thread.tm_tfiar = 0; + current->thread.load_tm = 0; #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ } EXPORT_SYMBOL(start_thread); From 3743c0e1276d73351e5764a0b6ef006039b65235 Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Thu, 25 May 2017 18:09:07 +0800 Subject: [PATCH 205/235] perf/core: Drop kernel samples even though :u is specified commit cc1582c231ea041fbc68861dfaf957eaf902b829 upstream. When doing sampling, for example: perf record -e cycles:u ... On workloads that do a lot of kernel entry/exits we see kernel samples, even though :u is specified. This is due to skid existing. This might be a security issue because it can leak kernel addresses even though kernel sampling support is disabled. The patch drops the kernel samples if exclude_kernel is specified. For example, test on Haswell desktop: perf record -e cycles:u perf report --stdio Before patch applied: 99.77% mgen mgen [.] buf_read 0.20% mgen mgen [.] rand_buf_init 0.01% mgen [kernel.vmlinux] [k] apic_timer_interrupt 0.00% mgen mgen [.] last_free_elem 0.00% mgen libc-2.23.so [.] __random_r 0.00% mgen libc-2.23.so [.] _int_malloc 0.00% mgen mgen [.] rand_array_init 0.00% mgen [kernel.vmlinux] [k] page_fault 0.00% mgen libc-2.23.so [.] __random 0.00% mgen libc-2.23.so [.] __strcasestr 0.00% mgen ld-2.23.so [.] strcmp 0.00% mgen ld-2.23.so [.] _dl_start 0.00% mgen libc-2.23.so [.] sched_setaffinity@@GLIBC_2.3.4 0.00% mgen ld-2.23.so [.] _start We can see kernel symbols apic_timer_interrupt and page_fault. After patch applied: 99.79% mgen mgen [.] buf_read 0.19% mgen mgen [.] rand_buf_init 0.00% mgen libc-2.23.so [.] __random_r 0.00% mgen mgen [.] rand_array_init 0.00% mgen mgen [.] last_free_elem 0.00% mgen libc-2.23.so [.] vfprintf 0.00% mgen libc-2.23.so [.] rand 0.00% mgen libc-2.23.so [.] __random 0.00% mgen libc-2.23.so [.] _int_malloc 0.00% mgen libc-2.23.so [.] _IO_doallocbuf 0.00% mgen ld-2.23.so [.] do_lookup_x 0.00% mgen ld-2.23.so [.] open_verify.constprop.7 0.00% mgen ld-2.23.so [.] _dl_important_hwcaps 0.00% mgen libc-2.23.so [.] sched_setaffinity@@GLIBC_2.3.4 0.00% mgen ld-2.23.so [.] _start There are only userspace symbols. Signed-off-by: Jin Yao Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: jolsa@kernel.org Cc: kan.liang@intel.com Cc: mark.rutland@arm.com Cc: will.deacon@arm.com Cc: yao.jin@intel.com Link: http://lkml.kernel.org/r/1495706947-3744-1-git-send-email-yao.jin@linux.intel.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- kernel/events/core.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/kernel/events/core.c b/kernel/events/core.c index 07c0dc806dfc..11cc1d83c770 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7062,6 +7062,21 @@ static void perf_log_itrace_start(struct perf_event *event) perf_output_end(&handle); } +static bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs) +{ + /* + * Due to interrupt latency (AKA "skid"), we may enter the + * kernel before taking an overflow, even if the PMU is only + * counting user events. + * To avoid leaking information to userspace, we must always + * reject kernel samples when exclude_kernel is set. + */ + if (event->attr.exclude_kernel && !user_mode(regs)) + return false; + + return true; +} + /* * Generic event overflow handling, sampling. */ @@ -7108,6 +7123,12 @@ static int __perf_event_overflow(struct perf_event *event, perf_adjust_period(event, delta, hwc->last_period, true); } + /* + * For security, drop the skid kernel samples if necessary. + */ + if (!sample_is_allowed(event, regs)) + return ret; + /* * XXX event_limit might not quite work as expected on inherited * events From 64c21af51d719b41b8c43203fed650f3de4fc4bf Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 27 Apr 2017 12:12:08 +0300 Subject: [PATCH 206/235] drm/vmwgfx: Handle vmalloc() failure in vmw_local_fifo_reserve() commit f0c62e9878024300319ba2438adc7b06c6b9c448 upstream. If vmalloc() fails then we need to a bit of cleanup before returning. Fixes: fb1d9738ca05 ("drm/vmwgfx: Add DRM driver for VMware Virtual GPU") Signed-off-by: Dan Carpenter Reviewed-by: Sinclair Yeh Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c index b6a0806b06bf..a1c68e6a689e 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c @@ -368,6 +368,8 @@ static void *vmw_local_fifo_reserve(struct vmw_private *dev_priv, return fifo_state->static_buffer; else { fifo_state->dynamic_buffer = vmalloc(bytes); + if (!fifo_state->dynamic_buffer) + goto out_err; return fifo_state->dynamic_buffer; } } From a76ff847013a7f6b1cd328381ca263ddcca12061 Mon Sep 17 00:00:00 2001 From: Vladis Dronov Date: Fri, 2 Jun 2017 07:42:09 +0200 Subject: [PATCH 207/235] drm/vmwgfx: limit the number of mip levels in vmw_gb_surface_define_ioctl() commit ee9c4e681ec4f58e42a83cb0c22a0289ade1aacf upstream. The 'req->mip_levels' parameter in vmw_gb_surface_define_ioctl() is a user-controlled 'uint32_t' value which is used as a loop count limit. This can lead to a kernel lockup and DoS. Add check for 'req->mip_levels'. References: https://bugzilla.redhat.com/show_bug.cgi?id=1437431 Signed-off-by: Vladis Dronov Reviewed-by: Sinclair Yeh Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/vmwgfx/vmwgfx_surface.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c index 05fa092c942b..6fed5a8da005 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c @@ -1280,6 +1280,9 @@ int vmw_gb_surface_define_ioctl(struct drm_device *dev, void *data, if (req->multisample_count != 0) return -EINVAL; + if (req->mip_levels > DRM_VMW_MAX_MIP_LEVELS) + return -EINVAL; + if (unlikely(vmw_user_surface_size == 0)) vmw_user_surface_size = ttm_round_pot(sizeof(*user_srf)) + 128; From 7860d0e5e2bf986d4bd06e7b029786747b5dc766 Mon Sep 17 00:00:00 2001 From: Sinclair Yeh Date: Fri, 2 Jun 2017 07:50:57 +0200 Subject: [PATCH 208/235] drm/vmwgfx: Make sure backup_handle is always valid commit 07678eca2cf9c9a18584e546c2b2a0d0c9a3150c upstream. When vmw_gb_surface_define_ioctl() is called with an existing buffer, we end up returning an uninitialized variable in the backup_handle. The fix is to first initialize backup_handle to 0 just to be sure, and second, when a user-provided buffer is found, we will use the req->buffer_handle as the backup_handle. Reported-by: Murray McAllister Signed-off-by: Sinclair Yeh Reviewed-by: Deepak Rawat Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/vmwgfx/vmwgfx_surface.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c index 6fed5a8da005..56b803384ea2 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c @@ -1275,7 +1275,7 @@ int vmw_gb_surface_define_ioctl(struct drm_device *dev, void *data, struct ttm_object_file *tfile = vmw_fpriv(file_priv)->tfile; int ret; uint32_t size; - uint32_t backup_handle; + uint32_t backup_handle = 0; if (req->multisample_count != 0) return -EINVAL; @@ -1318,12 +1318,16 @@ int vmw_gb_surface_define_ioctl(struct drm_device *dev, void *data, ret = vmw_user_dmabuf_lookup(tfile, req->buffer_handle, &res->backup, &user_srf->backup_base); - if (ret == 0 && res->backup->base.num_pages * PAGE_SIZE < - res->backup_size) { - DRM_ERROR("Surface backup buffer is too small.\n"); - vmw_dmabuf_unreference(&res->backup); - ret = -EINVAL; - goto out_unlock; + if (ret == 0) { + if (res->backup->base.num_pages * PAGE_SIZE < + res->backup_size) { + DRM_ERROR("Surface backup buffer is too small.\n"); + vmw_dmabuf_unreference(&res->backup); + ret = -EINVAL; + goto out_unlock; + } else { + backup_handle = req->buffer_handle; + } } } else if (req->drm_surface_flags & drm_vmw_surface_flag_create_buffer) ret = vmw_user_dmabuf_alloc(dev_priv, tfile, From aae14f569f5dbd13f9a7f34db8110e1658ef2c48 Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Mon, 5 Jun 2017 17:23:32 +1000 Subject: [PATCH 209/235] drm/nouveau/tmr: fully separate alarm execution/pending lists commit b4e382ca7586a63b6c1e5221ce0863ff867c2df6 upstream. Reusing the list_head for both is a bad idea. Callback execution is done with the lock dropped so that alarms can be rescheduled from the callback, which means that with some unfortunate timing, lists can get corrupted. The execution list should not require its own locking, the single function that uses it can only be called from a single context. Signed-off-by: Ben Skeggs Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/nouveau/include/nvkm/subdev/timer.h | 1 + drivers/gpu/drm/nouveau/nvkm/subdev/timer/base.c | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/nouveau/include/nvkm/subdev/timer.h b/drivers/gpu/drm/nouveau/include/nvkm/subdev/timer.h index 82d3e28918fd..7e4f24ae7de8 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/subdev/timer.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/subdev/timer.h @@ -4,6 +4,7 @@ struct nvkm_alarm { struct list_head head; + struct list_head exec; u64 timestamp; void (*func)(struct nvkm_alarm *); }; diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/timer/base.c b/drivers/gpu/drm/nouveau/nvkm/subdev/timer/base.c index f2a86eae0a0d..2437f7d41ca2 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/timer/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/timer/base.c @@ -50,7 +50,8 @@ nvkm_timer_alarm_trigger(struct nvkm_timer *tmr) /* Move to completed list. We'll drop the lock before * executing the callback so it can reschedule itself. */ - list_move_tail(&alarm->head, &exec); + list_del_init(&alarm->head); + list_add(&alarm->exec, &exec); } /* Shut down interrupt if no more pending alarms. */ @@ -59,8 +60,8 @@ nvkm_timer_alarm_trigger(struct nvkm_timer *tmr) spin_unlock_irqrestore(&tmr->lock, flags); /* Execute completed callbacks. */ - list_for_each_entry_safe(alarm, atemp, &exec, head) { - list_del_init(&alarm->head); + list_for_each_entry_safe(alarm, atemp, &exec, exec) { + list_del(&alarm->exec); alarm->func(alarm); } } From 66e982d8f1a1f5e151377fe37612e9151e552dc9 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 2 Jun 2017 15:03:38 +0200 Subject: [PATCH 210/235] ALSA: timer: Fix race between read and ioctl commit d11662f4f798b50d8c8743f433842c3e40fe3378 upstream. The read from ALSA timer device, the function snd_timer_user_tread(), may access to an uninitialized struct snd_timer_user fields when the read is concurrently performed while the ioctl like snd_timer_user_tselect() is invoked. We have already fixed the races among ioctls via a mutex, but we seem to have forgotten the race between read vs ioctl. This patch simply applies (more exactly extends the already applied range of) tu->ioctl_lock in snd_timer_user_tread() for closing the race window. Reported-by: Alexander Potapenko Tested-by: Alexander Potapenko Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/timer.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sound/core/timer.c b/sound/core/timer.c index ad153149b231..3c11a6983f54 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -1963,6 +1963,7 @@ static ssize_t snd_timer_user_read(struct file *file, char __user *buffer, tu = file->private_data; unit = tu->tread ? sizeof(struct snd_timer_tread) : sizeof(struct snd_timer_read); + mutex_lock(&tu->ioctl_lock); spin_lock_irq(&tu->qlock); while ((long)count - result >= unit) { while (!tu->qused) { @@ -1978,7 +1979,9 @@ static ssize_t snd_timer_user_read(struct file *file, char __user *buffer, add_wait_queue(&tu->qchange_sleep, &wait); spin_unlock_irq(&tu->qlock); + mutex_unlock(&tu->ioctl_lock); schedule(); + mutex_lock(&tu->ioctl_lock); spin_lock_irq(&tu->qlock); remove_wait_queue(&tu->qchange_sleep, &wait); @@ -1998,7 +2001,6 @@ static ssize_t snd_timer_user_read(struct file *file, char __user *buffer, tu->qused--; spin_unlock_irq(&tu->qlock); - mutex_lock(&tu->ioctl_lock); if (tu->tread) { if (copy_to_user(buffer, &tu->tqueue[qhead], sizeof(struct snd_timer_tread))) @@ -2008,7 +2010,6 @@ static ssize_t snd_timer_user_read(struct file *file, char __user *buffer, sizeof(struct snd_timer_read))) err = -EFAULT; } - mutex_unlock(&tu->ioctl_lock); spin_lock_irq(&tu->qlock); if (err < 0) @@ -2018,6 +2019,7 @@ static ssize_t snd_timer_user_read(struct file *file, char __user *buffer, } _error: spin_unlock_irq(&tu->qlock); + mutex_unlock(&tu->ioctl_lock); return result > 0 ? result : err; } From 82ecd2f054bd778eeb97c775c915b052e2941c79 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 2 Jun 2017 17:26:56 +0200 Subject: [PATCH 211/235] ALSA: timer: Fix missing queue indices reset at SNDRV_TIMER_IOCTL_SELECT commit ba3021b2c79b2fa9114f92790a99deb27a65b728 upstream. snd_timer_user_tselect() reallocates the queue buffer dynamically, but it forgot to reset its indices. Since the read may happen concurrently with ioctl and snd_timer_user_tselect() allocates the buffer via kmalloc(), this may lead to the leak of uninitialized kernel-space data, as spotted via KMSAN: BUG: KMSAN: use of unitialized memory in snd_timer_user_read+0x6c4/0xa10 CPU: 0 PID: 1037 Comm: probe Not tainted 4.11.0-rc5+ #2739 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 dump_stack+0x143/0x1b0 lib/dump_stack.c:52 kmsan_report+0x12a/0x180 mm/kmsan/kmsan.c:1007 kmsan_check_memory+0xc2/0x140 mm/kmsan/kmsan.c:1086 copy_to_user ./arch/x86/include/asm/uaccess.h:725 snd_timer_user_read+0x6c4/0xa10 sound/core/timer.c:2004 do_loop_readv_writev fs/read_write.c:716 __do_readv_writev+0x94c/0x1380 fs/read_write.c:864 do_readv_writev fs/read_write.c:894 vfs_readv fs/read_write.c:908 do_readv+0x52a/0x5d0 fs/read_write.c:934 SYSC_readv+0xb6/0xd0 fs/read_write.c:1021 SyS_readv+0x87/0xb0 fs/read_write.c:1018 This patch adds the missing reset of queue indices. Together with the previous fix for the ioctl/read race, we cover the whole problem. Reported-by: Alexander Potapenko Tested-by: Alexander Potapenko Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/timer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/core/timer.c b/sound/core/timer.c index 3c11a6983f54..e5ddc475dca4 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -1622,6 +1622,7 @@ static int snd_timer_user_tselect(struct file *file, if (err < 0) goto __err; + tu->qhead = tu->qtail = tu->qused = 0; kfree(tu->queue); tu->queue = NULL; kfree(tu->tqueue); From eb8fa317cb0184f5179e29bab8c62a4a4087cd4a Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 24 May 2017 10:19:45 +0200 Subject: [PATCH 212/235] ASoC: Fix use-after-free at card unregistration commit 4efda5f2130da033aeedc5b3205569893b910de2 upstream. soc_cleanup_card_resources() call snd_card_free() at the last of its procedure. This turned out to lead to a use-after-free. PCM runtimes have been already removed via soc_remove_pcm_runtimes(), while it's dereferenced later in soc_pcm_free() called via snd_card_free(). The fix is simple: just move the snd_card_free() call to the beginning of the whole procedure. This also gives another benefit: it guarantees that all operations have been shut down before actually releasing the resources, which was racy until now. Reported-and-tested-by: Robert Jarzmik Signed-off-by: Takashi Iwai Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- sound/soc/soc-core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index c0bbcd903261..4e3de566809c 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -2076,6 +2076,9 @@ static int soc_cleanup_card_resources(struct snd_soc_card *card) list_for_each_entry(rtd, &card->rtd_list, list) flush_delayed_work(&rtd->delayed_work); + /* free the ALSA card at first; this syncs with pending operations */ + snd_card_free(card->snd_card); + /* remove and free each DAI */ soc_remove_dai_links(card); soc_remove_pcm_runtimes(card); @@ -2090,9 +2093,7 @@ static int soc_cleanup_card_resources(struct snd_soc_card *card) if (card->remove) card->remove(card); - snd_card_free(card->snd_card); return 0; - } /* removes a socdev */ From 106c77e82572921fa53483235a55adb7cb9452c0 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 2 Jun 2017 16:27:14 +0200 Subject: [PATCH 213/235] cpu/hotplug: Drop the device lock on error commit 40da1b11f01e43aad1aa6cea64681b6125e8a2a7 upstream. If a custom CPU target is specified and that one is not available _or_ can't be interrupted then the code returns to userland without dropping a lock as notices by lockdep: |echo 133 > /sys/devices/system/cpu/cpu7/hotplug/target | ================================================ | [ BUG: lock held when returning to user space! ] | ------------------------------------------------ | bash/503 is leaving the kernel with locks still held! | 1 lock held by bash/503: | #0: (device_hotplug_lock){+.+...}, at: [] lock_device_hotplug_sysfs+0x10/0x40 So release the lock then. Fixes: 757c989b9994 ("cpu/hotplug: Make target state writeable") Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/20170602142714.3ogo25f2wbq6fjpj@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/cpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index 99c6c568bc55..8f52977aad59 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1765,13 +1765,13 @@ static ssize_t write_cpuhp_target(struct device *dev, ret = !sp->name || sp->cant_stop ? -EINVAL : 0; mutex_unlock(&cpuhp_state_mutex); if (ret) - return ret; + goto out; if (st->state < target) ret = do_cpu_up(dev->id, target); else ret = do_cpu_down(dev->id, target); - +out: unlock_device_hotplug(); return ret ? ret : count; } From 9ff4a1a36a587b0343a02518dbc1d5bf4c0c99e8 Mon Sep 17 00:00:00 2001 From: Julius Werner Date: Fri, 2 Jun 2017 15:36:39 -0700 Subject: [PATCH 214/235] drivers: char: mem: Fix wraparound check to allow mappings up to the end commit 32829da54d9368103a2f03269a5120aa9ee4d5da upstream. A recent fix to /dev/mem prevents mappings from wrapping around the end of physical address space. However, the check was written in a way that also prevents a mapping reaching just up to the end of physical address space, which may be a valid use case (especially on 32-bit systems). This patch fixes it by checking the last mapped address (instead of the first address behind that) for overflow. Fixes: b299cde245 ("drivers: char: mem: Check for address space wraparound with mmap()") Reported-by: Nico Huber Signed-off-by: Julius Werner Signed-off-by: Greg Kroah-Hartman --- drivers/char/mem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 6e0cbe092220..593a8818aca9 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -343,7 +343,7 @@ static int mmap_mem(struct file *file, struct vm_area_struct *vma) phys_addr_t offset = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT; /* It's illegal to wrap around the end of the physical address space. */ - if (offset + (phys_addr_t)size < offset) + if (offset + (phys_addr_t)size - 1 < offset) return -EINVAL; if (!valid_mmap_phys_addr_range(vma->vm_pgoff, size)) From d9520248733414e96fb534b99534ef1e761eda6e Mon Sep 17 00:00:00 2001 From: Takatoshi Akiyama Date: Mon, 27 Feb 2017 15:56:31 +0900 Subject: [PATCH 215/235] serial: sh-sci: Fix panic when serial console and DMA are enabled commit 3c9101766b502a0163d1d437fada5801cf616be2 upstream. This patch fixes an issue that kernel panic happens when DMA is enabled and we press enter key while the kernel booting on the serial console. * An interrupt may occur after sci_request_irq(). * DMA transfer area is initialized by setup_timer() in sci_request_dma() and used in interrupt. If an interrupt occurred between sci_request_irq() and setup_timer() in sci_request_dma(), DMA transfer area has not been initialized yet. So, this patch changes the order of sci_request_irq() and sci_request_dma(). Fixes: 73a19e4c0301 ("serial: sh-sci: Add DMA support.") Signed-off-by: Takatoshi Akiyama [Shimoda changes the commit log] Signed-off-by: Yoshihiro Shimoda Cc: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/sh-sci.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/tty/serial/sh-sci.c b/drivers/tty/serial/sh-sci.c index 4b26252c2885..ee84f89391ca 100644 --- a/drivers/tty/serial/sh-sci.c +++ b/drivers/tty/serial/sh-sci.c @@ -1976,12 +1976,14 @@ static int sci_startup(struct uart_port *port) dev_dbg(port->dev, "%s(%d)\n", __func__, port->line); - ret = sci_request_irq(s); - if (unlikely(ret < 0)) - return ret; - sci_request_dma(port); + ret = sci_request_irq(s); + if (unlikely(ret < 0)) { + sci_free_dma(port); + return ret; + } + return 0; } @@ -2012,8 +2014,8 @@ static void sci_shutdown(struct uart_port *port) } #endif - sci_free_dma(port); sci_free_irq(s); + sci_free_dma(port); } static int sci_sck_calc(struct sci_port *s, unsigned int bps, From 791d94ef40f6b9c2e809bd118d1b3423e7514d37 Mon Sep 17 00:00:00 2001 From: Kristina Martsenko Date: Tue, 6 Jun 2017 20:14:08 +0100 Subject: [PATCH 216/235] arm64: traps: fix userspace cache maintenance emulation on a tagged pointer commit 81cddd65b5c82758ea5571a25e31ff6f1f89ff02 upstream. This backport has a minor difference from the upstream commit, as v4.9 did not yet have the refactoring done by commit 8b6e70fccff2 ("arm64: traps: correctly handle MRS/MSR with XZR"). Original patch description: When we emulate userspace cache maintenance in the kernel, we can currently send the task a SIGSEGV even though the maintenance was done on a valid address. This happens if the address has a non-zero address tag, and happens to not be mapped in. When we get the address from a user register, we don't currently remove the address tag before performing cache maintenance on it. If the maintenance faults, we end up in either __do_page_fault, where find_vma can't find the VMA if the address has a tag, or in do_translation_fault, where the tagged address will appear to be above TASK_SIZE. In both cases, the address is not mapped in, and the task is sent a SIGSEGV. This patch removes the tag from the address before using it. With this patch, the fault is handled correctly, the address gets mapped in, and the cache maintenance succeeds. As a second bug, if cache maintenance (correctly) fails on an invalid tagged address, the address gets passed into arm64_notify_segfault, where find_vma fails to find the VMA due to the tag, and the wrong si_code may be sent as part of the siginfo_t of the segfault. With this patch, the correct si_code is sent. Fixes: 7dd01aef0557 ("arm64: trap userspace "dc cvau" cache operation on errata-affected core") Acked-by: Will Deacon Signed-off-by: Kristina Martsenko Signed-off-by: Catalin Marinas Signed-off-by: Greg Kroah-Hartman --- arch/arm64/kernel/traps.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 11e5eae088ab..f22826135c73 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -435,7 +435,7 @@ int cpu_enable_cache_maint_trap(void *__unused) } #define __user_cache_maint(insn, address, res) \ - if (untagged_addr(address) >= user_addr_max()) \ + if (address >= user_addr_max()) \ res = -EFAULT; \ else \ asm volatile ( \ @@ -458,7 +458,7 @@ static void user_cache_maint_handler(unsigned int esr, struct pt_regs *regs) int crm = (esr & ESR_ELx_SYS64_ISS_CRM_MASK) >> ESR_ELx_SYS64_ISS_CRM_SHIFT; int ret = 0; - address = (rt == 31) ? 0 : regs->regs[rt]; + address = (rt == 31) ? 0 : untagged_addr(regs->regs[rt]); switch (crm) { case ESR_ELx_SYS64_ISS_CRM_DC_CVAU: /* DC CVAU, gets promoted */ From 1d61ccb5ac272b6cddc50c542646d2b35f87de8d Mon Sep 17 00:00:00 2001 From: Kristina Martsenko Date: Tue, 6 Jun 2017 20:14:09 +0100 Subject: [PATCH 217/235] arm64: hw_breakpoint: fix watchpoint matching for tagged pointers commit 7dcd9dd8cebe9fa626af7e2358d03a37041a70fb upstream. This backport has a small difference from the upstream commit: - The address tag is removed in watchpoint_handler() instead of get_distance_from_watchpoint(), because 4.9 does not have commit fdfeff0f9e3d ("arm64: hw_breakpoint: Handle inexact watchpoint addresses"). Original patch description: When we take a watchpoint exception, the address that triggered the watchpoint is found in FAR_EL1. We compare it to the address of each configured watchpoint to see which one was hit. The configured watchpoint addresses are untagged, while the address in FAR_EL1 will have an address tag if the data access was done using a tagged address. The tag needs to be removed to compare the address to the watchpoints. Currently we don't remove it, and as a result can report the wrong watchpoint as being hit (specifically, always either the highest TTBR0 watchpoint or lowest TTBR1 watchpoint). This patch removes the tag. Fixes: d50240a5f6ce ("arm64: mm: permit use of tagged pointers at EL0") Acked-by: Mark Rutland Acked-by: Will Deacon Signed-off-by: Kristina Martsenko Signed-off-by: Catalin Marinas Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/uaccess.h | 6 +++--- arch/arm64/kernel/hw_breakpoint.c | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 14cca10aeb4e..811cf16a65f9 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -105,9 +105,9 @@ static inline void set_fs(mm_segment_t fs) }) /* - * When dealing with data aborts or instruction traps we may end up with - * a tagged userland pointer. Clear the tag to get a sane pointer to pass - * on to access_ok(), for instance. + * When dealing with data aborts, watchpoints, or instruction traps we may end + * up with a tagged userland pointer. Clear the tag to get a sane pointer to + * pass on to access_ok(), for instance. */ #define untagged_addr(addr) sign_extend64(addr, 55) diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index 948b73148d56..0b9e5f6290f9 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -36,6 +36,7 @@ #include #include #include +#include /* Breakpoint currently in use for each BRP. */ static DEFINE_PER_CPU(struct perf_event *, bp_on_reg[ARM_MAX_BRP]); @@ -696,7 +697,7 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr, /* Check if the watchpoint value matches. */ val = read_wb_reg(AARCH64_DBG_REG_WVR, i); - if (val != (addr & ~alignment_mask)) + if (val != (untagged_addr(addr) & ~alignment_mask)) goto unlock; /* Possible match, check the byte address select to confirm. */ From 9e09d90ac5ac929ab0f817f81f958dfa817d5bfb Mon Sep 17 00:00:00 2001 From: Kristina Martsenko Date: Tue, 6 Jun 2017 20:14:10 +0100 Subject: [PATCH 218/235] arm64: entry: improve data abort handling of tagged pointers commit 276e93279a630657fff4b086ba14c95955912dfa upstream. This backport has a minor difference from the upstream commit: it adds the asm-uaccess.h file, which is not present in 4.9, because 4.9 does not have commit b4b8664d291a ("arm64: don't pull uaccess.h into *.S"). Original patch description: When handling a data abort from EL0, we currently zero the top byte of the faulting address, as we assume the address is a TTBR0 address, which may contain a non-zero address tag. However, the address may be a TTBR1 address, in which case we should not zero the top byte. This patch fixes that. The effect is that the full TTBR1 address is passed to the task's signal handler (or printed out in the kernel log). When handling a data abort from EL1, we leave the faulting address intact, as we assume it's either a TTBR1 address or a TTBR0 address with tag 0x00. This is true as far as I'm aware, we don't seem to access a tagged TTBR0 address anywhere in the kernel. Regardless, it's easy to forget about address tags, and code added in the future may not always remember to remove tags from addresses before accessing them. So add tag handling to the EL1 data abort handler as well. This also makes it consistent with the EL0 data abort handler. Fixes: d50240a5f6ce ("arm64: mm: permit use of tagged pointers at EL0") Reviewed-by: Dave Martin Acked-by: Will Deacon Signed-off-by: Kristina Martsenko Signed-off-by: Catalin Marinas Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/asm-uaccess.h | 13 +++++++++++++ arch/arm64/kernel/entry.S | 6 ++++-- 2 files changed, 17 insertions(+), 2 deletions(-) create mode 100644 arch/arm64/include/asm/asm-uaccess.h diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h new file mode 100644 index 000000000000..be2d2347d995 --- /dev/null +++ b/arch/arm64/include/asm/asm-uaccess.h @@ -0,0 +1,13 @@ +#ifndef __ASM_ASM_UACCESS_H +#define __ASM_ASM_UACCESS_H + +/* + * Remove the address tag from a virtual address, if present. + */ + .macro clear_address_tag, dst, addr + tst \addr, #(1 << 55) + bic \dst, \addr, #(0xff << 56) + csel \dst, \dst, \addr, eq + .endm + +#endif diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 79b0fe24d5b7..b4c7db434654 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -30,6 +30,7 @@ #include #include #include +#include #include /* @@ -369,12 +370,13 @@ el1_da: /* * Data abort handling */ - mrs x0, far_el1 + mrs x3, far_el1 enable_dbg // re-enable interrupts if they were enabled in the aborted context tbnz x23, #7, 1f // PSR_I_BIT enable_irq 1: + clear_address_tag x0, x3 mov x2, sp // struct pt_regs bl do_mem_abort @@ -535,7 +537,7 @@ el0_da: // enable interrupts before calling the main handler enable_dbg_and_irq ct_user_exit - bic x0, x26, #(0xff << 56) + clear_address_tag x0, x26 mov x1, x25 mov x2, sp bl do_mem_abort From 1df21f45fd55274128c61908765d65837a30f199 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Fri, 13 Jan 2017 22:51:08 +0100 Subject: [PATCH 219/235] ARM: 8636/1: Cleanup sanity_check_meminfo commit 374d446d25d6271ee615952a3b7f123ba4983c35 upstream. The logic for sanity_check_meminfo has become difficult to follow. Clean up the code so it's more obvious what the code is actually trying to do. Additionally, meminfo is now removed so rename the function to better describe its purpose. Tested-by: Magnus Lilja Reviewed-by: Nicolas Pitre Signed-off-by: Laura Abbott Signed-off-by: Laura Abbott Signed-off-by: Russell King Cc: Julien Grall Signed-off-by: Greg Kroah-Hartman --- arch/arm/kernel/setup.c | 4 +-- arch/arm/mm/mmu.c | 66 +++++++++++++++-------------------------- arch/arm/mm/nommu.c | 8 ++--- 3 files changed, 30 insertions(+), 48 deletions(-) diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index 34e3f3c45634..8a8051cf57d1 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -81,7 +81,7 @@ __setup("fpe=", fpe_setup); extern void init_default_cache_policy(unsigned long); extern void paging_init(const struct machine_desc *desc); extern void early_paging_init(const struct machine_desc *); -extern void sanity_check_meminfo(void); +extern void adjust_lowmem_bounds(void); extern enum reboot_mode reboot_mode; extern void setup_dma_zone(const struct machine_desc *desc); @@ -1093,7 +1093,7 @@ void __init setup_arch(char **cmdline_p) setup_dma_zone(mdesc); xen_early_init(); efi_init(); - sanity_check_meminfo(); + adjust_lowmem_bounds(); arm_memblock_init(mdesc); early_ioremap_reset(); diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 4001dd15818d..b8f70a3bb7f8 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -1152,13 +1152,11 @@ early_param("vmalloc", early_vmalloc); phys_addr_t arm_lowmem_limit __initdata = 0; -void __init sanity_check_meminfo(void) +void __init adjust_lowmem_bounds(void) { phys_addr_t memblock_limit = 0; - int highmem = 0; u64 vmalloc_limit; struct memblock_region *reg; - bool should_use_highmem = false; /* * Let's use our own (unoptimized) equivalent of __pa() that is @@ -1172,43 +1170,18 @@ void __init sanity_check_meminfo(void) for_each_memblock(memory, reg) { phys_addr_t block_start = reg->base; phys_addr_t block_end = reg->base + reg->size; - phys_addr_t size_limit = reg->size; - if (reg->base >= vmalloc_limit) - highmem = 1; - else - size_limit = vmalloc_limit - reg->base; - - - if (!IS_ENABLED(CONFIG_HIGHMEM) || cache_is_vipt_aliasing()) { - - if (highmem) { - pr_notice("Ignoring RAM at %pa-%pa (!CONFIG_HIGHMEM)\n", - &block_start, &block_end); - memblock_remove(reg->base, reg->size); - should_use_highmem = true; - continue; - } - - if (reg->size > size_limit) { - phys_addr_t overlap_size = reg->size - size_limit; - - pr_notice("Truncating RAM at %pa-%pa", - &block_start, &block_end); - block_end = vmalloc_limit; - pr_cont(" to -%pa", &block_end); - memblock_remove(vmalloc_limit, overlap_size); - should_use_highmem = true; - } - } - - if (!highmem) { - if (block_end > arm_lowmem_limit) { - if (reg->size > size_limit) - arm_lowmem_limit = vmalloc_limit; - else - arm_lowmem_limit = block_end; - } + if (reg->base < vmalloc_limit) { + if (block_end > arm_lowmem_limit) + /* + * Compare as u64 to ensure vmalloc_limit does + * not get truncated. block_end should always + * fit in phys_addr_t so there should be no + * issue with assignment. + */ + arm_lowmem_limit = min_t(u64, + vmalloc_limit, + block_end); /* * Find the first non-pmd-aligned page, and point @@ -1233,9 +1206,6 @@ void __init sanity_check_meminfo(void) } } - if (should_use_highmem) - pr_notice("Consider using a HIGHMEM enabled kernel.\n"); - high_memory = __va(arm_lowmem_limit - 1) + 1; /* @@ -1248,6 +1218,18 @@ void __init sanity_check_meminfo(void) if (!memblock_limit) memblock_limit = arm_lowmem_limit; + if (!IS_ENABLED(CONFIG_HIGHMEM) || cache_is_vipt_aliasing()) { + if (memblock_end_of_DRAM() > arm_lowmem_limit) { + phys_addr_t end = memblock_end_of_DRAM(); + + pr_notice("Ignoring RAM at %pa-%pa\n", + &memblock_limit, &end); + pr_notice("Consider using a HIGHMEM enabled kernel.\n"); + + memblock_remove(memblock_limit, end - memblock_limit); + } + } + memblock_set_current_limit(memblock_limit); } diff --git a/arch/arm/mm/nommu.c b/arch/arm/mm/nommu.c index 2740967727e2..13a25d6282f8 100644 --- a/arch/arm/mm/nommu.c +++ b/arch/arm/mm/nommu.c @@ -85,7 +85,7 @@ static unsigned long irbar_read(void) } /* MPU initialisation functions */ -void __init sanity_check_meminfo_mpu(void) +void __init adjust_lowmem_bounds_mpu(void) { phys_addr_t phys_offset = PHYS_OFFSET; phys_addr_t aligned_region_size, specified_mem_size, rounded_mem_size; @@ -274,7 +274,7 @@ void __init mpu_setup(void) } } #else -static void sanity_check_meminfo_mpu(void) {} +static void adjust_lowmem_bounds_mpu(void) {} static void __init mpu_setup(void) {} #endif /* CONFIG_ARM_MPU */ @@ -295,10 +295,10 @@ void __init arm_mm_memblock_reserve(void) #endif } -void __init sanity_check_meminfo(void) +void __init adjust_lowmem_bounds(void) { phys_addr_t end; - sanity_check_meminfo_mpu(); + adjust_lowmem_bounds_mpu(); end = memblock_end_of_DRAM(); high_memory = __va(end - 1) + 1; memblock_set_current_limit(end); From eefa5e13dff94000af79a6ec173376f6eb629bc1 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Fri, 13 Jan 2017 22:51:45 +0100 Subject: [PATCH 220/235] ARM: 8637/1: Adjust memory boundaries after reservations commit 985626564eedc470ce2866e53938303368ad41b7 upstream. adjust_lowmem_bounds is responsible for setting up the boundary for lowmem/highmem. This needs to be setup before memblock reservations can occur. At the time memblock reservations can occur, memory can also be removed from the system. The lowmem/highmem boundary and end of memory may be affected by this but it is currently not recalculated. On some systems this may be harmless, on others this may result in incorrect ranges being passed to the main memory allocator. Correct this by recalculating the lowmem/highmem boundary after all reservations have been made. Tested-by: Magnus Lilja Signed-off-by: Laura Abbott Signed-off-by: Russell King Cc: Julien Grall Signed-off-by: Greg Kroah-Hartman --- arch/arm/kernel/setup.c | 6 ++++++ arch/arm/mm/mmu.c | 9 ++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index 8a8051cf57d1..f4e54503afa9 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -1093,8 +1093,14 @@ void __init setup_arch(char **cmdline_p) setup_dma_zone(mdesc); xen_early_init(); efi_init(); + /* + * Make sure the calculation for lowmem/highmem is set appropriately + * before reserving/allocating any mmeory + */ adjust_lowmem_bounds(); arm_memblock_init(mdesc); + /* Memory may have been removed so recalculate the bounds. */ + adjust_lowmem_bounds(); early_ioremap_reset(); diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index b8f70a3bb7f8..5cbfd9f86412 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -1157,6 +1157,7 @@ void __init adjust_lowmem_bounds(void) phys_addr_t memblock_limit = 0; u64 vmalloc_limit; struct memblock_region *reg; + phys_addr_t lowmem_limit = 0; /* * Let's use our own (unoptimized) equivalent of __pa() that is @@ -1172,14 +1173,14 @@ void __init adjust_lowmem_bounds(void) phys_addr_t block_end = reg->base + reg->size; if (reg->base < vmalloc_limit) { - if (block_end > arm_lowmem_limit) + if (block_end > lowmem_limit) /* * Compare as u64 to ensure vmalloc_limit does * not get truncated. block_end should always * fit in phys_addr_t so there should be no * issue with assignment. */ - arm_lowmem_limit = min_t(u64, + lowmem_limit = min_t(u64, vmalloc_limit, block_end); @@ -1200,12 +1201,14 @@ void __init adjust_lowmem_bounds(void) if (!IS_ALIGNED(block_start, PMD_SIZE)) memblock_limit = block_start; else if (!IS_ALIGNED(block_end, PMD_SIZE)) - memblock_limit = arm_lowmem_limit; + memblock_limit = lowmem_limit; } } } + arm_lowmem_limit = lowmem_limit; + high_memory = __va(arm_lowmem_limit - 1) + 1; /* From a6a7d8ade88ce634f7ff14143714a1301d171001 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 13 Feb 2017 11:25:26 -0800 Subject: [PATCH 221/235] usercopy: Adjust tests to deal with SMAP/PAN commit f5f893c57e37ca730808cb2eee3820abd05e7507 upstream. Under SMAP/PAN/etc, we cannot write directly to userspace memory, so this rearranges the test bytes to get written through copy_to_user(). Additionally drops the bad copy_from_user() test that would trigger a memcpy() against userspace on failure. [arnd: the test module was added in 3.14, and this backported patch should apply cleanly on all version from 3.14 to 4.10. The original patch was in 4.11 on top of a context change I saw the bug triggered with kselftest on a 4.4.y stable kernel] Signed-off-by: Kees Cook Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- lib/test_user_copy.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/lib/test_user_copy.c b/lib/test_user_copy.c index 0ecef3e4690e..5e6db6b1e3bd 100644 --- a/lib/test_user_copy.c +++ b/lib/test_user_copy.c @@ -58,7 +58,9 @@ static int __init test_user_copy_init(void) usermem = (char __user *)user_addr; bad_usermem = (char *)user_addr; - /* Legitimate usage: none of these should fail. */ + /* + * Legitimate usage: none of these copies should fail. + */ ret |= test(copy_from_user(kmem, usermem, PAGE_SIZE), "legitimate copy_from_user failed"); ret |= test(copy_to_user(usermem, kmem, PAGE_SIZE), @@ -68,19 +70,33 @@ static int __init test_user_copy_init(void) ret |= test(put_user(value, (unsigned long __user *)usermem), "legitimate put_user failed"); - /* Invalid usage: none of these should succeed. */ + /* + * Invalid usage: none of these copies should succeed. + */ + + /* Reject kernel-to-kernel copies through copy_from_user(). */ ret |= test(!copy_from_user(kmem, (char __user *)(kmem + PAGE_SIZE), PAGE_SIZE), "illegal all-kernel copy_from_user passed"); + +#if 0 + /* + * When running with SMAP/PAN/etc, this will Oops the kernel + * due to the zeroing of userspace memory on failure. This needs + * to be tested in LKDTM instead, since this test module does not + * expect to explode. + */ ret |= test(!copy_from_user(bad_usermem, (char __user *)kmem, PAGE_SIZE), "illegal reversed copy_from_user passed"); +#endif ret |= test(!copy_to_user((char __user *)kmem, kmem + PAGE_SIZE, PAGE_SIZE), "illegal all-kernel copy_to_user passed"); ret |= test(!copy_to_user((char __user *)kmem, bad_usermem, PAGE_SIZE), "illegal reversed copy_to_user passed"); + ret |= test(!get_user(value, (unsigned long __user *)kmem), "illegal get_user passed"); ret |= test(!put_user(value, (unsigned long __user *)kmem), From 555c443a1ab9f6f6935ad23fa24a795d64220730 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Fri, 10 Mar 2017 15:27:57 +0200 Subject: [PATCH 222/235] drm/i915/vbt: don't propagate errors from intel_bios_init() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 665788572c6410b7efadc2e3009c5d830b6d8ef9 upstream. We don't use the error return for anything other than reporting and logging that there is no VBT. We can pull the logging in the function, and remove the error status return. Moreover, if we needed the information for something later on, we'd probably be better off storing the bit in dev_priv, and using it where it's needed, instead of using the error return. While at it, improve the comments. Cc: Manasi Navare Cc: Ville Syrjälä Reviewed-by: Chris Wilson Signed-off-by: Jani Nikula Link: http://patchwork.freedesktop.org/patch/msgid/438ebbb0d5f0d321c625065b9cc78532a1dab24f.1489152288.git.jani.nikula@intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/i915_drv.c | 4 +--- drivers/gpu/drm/i915/i915_drv.h | 2 +- drivers/gpu/drm/i915/intel_bios.c | 31 ++++++++++++++++--------------- 3 files changed, 18 insertions(+), 19 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index 923150de46cb..ca6efb69ef66 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -573,9 +573,7 @@ static int i915_load_modeset_init(struct drm_device *dev) if (i915_inject_load_failure()) return -ENODEV; - ret = intel_bios_init(dev_priv); - if (ret) - DRM_INFO("failed to find VBIOS tables\n"); + intel_bios_init(dev_priv); /* If we have > 1 VGA cards, then we need to arbitrate access * to the common VGA resources. diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index e0d72457b23c..36a665f0e5c9 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -3584,7 +3584,7 @@ static inline bool intel_gmbus_is_forced_bit(struct i2c_adapter *adapter) extern void intel_i2c_reset(struct drm_device *dev); /* intel_bios.c */ -int intel_bios_init(struct drm_i915_private *dev_priv); +void intel_bios_init(struct drm_i915_private *dev_priv); bool intel_bios_is_valid_vbt(const void *buf, size_t size); bool intel_bios_is_tv_present(struct drm_i915_private *dev_priv); bool intel_bios_is_lvds_present(struct drm_i915_private *dev_priv, u8 *i2c_pin); diff --git a/drivers/gpu/drm/i915/intel_bios.c b/drivers/gpu/drm/i915/intel_bios.c index cf2560708e03..954eabbf0347 100644 --- a/drivers/gpu/drm/i915/intel_bios.c +++ b/drivers/gpu/drm/i915/intel_bios.c @@ -1450,36 +1450,35 @@ static const struct vbt_header *find_vbt(void __iomem *bios, size_t size) * intel_bios_init - find VBT and initialize settings from the BIOS * @dev_priv: i915 device instance * - * Loads the Video BIOS and checks that the VBT exists. Sets scratch registers - * to appropriate values. - * - * Returns 0 on success, nonzero on failure. + * Parse and initialize settings from the Video BIOS Tables (VBT). If the VBT + * was not found in ACPI OpRegion, try to find it in PCI ROM first. Also + * initialize some defaults if the VBT is not present at all. */ -int -intel_bios_init(struct drm_i915_private *dev_priv) +void intel_bios_init(struct drm_i915_private *dev_priv) { struct pci_dev *pdev = dev_priv->drm.pdev; const struct vbt_header *vbt = dev_priv->opregion.vbt; const struct bdb_header *bdb; u8 __iomem *bios = NULL; - if (HAS_PCH_NOP(dev_priv)) - return -ENODEV; + if (HAS_PCH_NOP(dev_priv)) { + DRM_DEBUG_KMS("Skipping VBT init due to disabled display.\n"); + return; + } init_vbt_defaults(dev_priv); + /* If the OpRegion does not have VBT, look in PCI ROM. */ if (!vbt) { size_t size; bios = pci_map_rom(pdev, &size); if (!bios) - return -1; + goto out; vbt = find_vbt(bios, size); - if (!vbt) { - pci_unmap_rom(pdev, bios); - return -1; - } + if (!vbt) + goto out; DRM_DEBUG_KMS("Found valid VBT in PCI ROM\n"); } @@ -1504,10 +1503,12 @@ intel_bios_init(struct drm_i915_private *dev_priv) parse_mipi_sequence(dev_priv, bdb); parse_ddi_ports(dev_priv, bdb); +out: + if (!vbt) + DRM_INFO("Failed to find VBIOS tables (VBT)\n"); + if (bios) pci_unmap_rom(pdev, bios); - - return 0; } /** From 09fcb3561d9ede40610d8f7b7fd64d2651179403 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Fri, 10 Mar 2017 15:27:58 +0200 Subject: [PATCH 223/235] drm/i915/vbt: split out defaults that are set when there is no VBT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit bb1d132935c2f87cd261eb559759fe49d5e5dc43 upstream. The main thing are the DDI ports. If there's a VBT that says there are no outputs, we should trust that, and not have semi-random defaults. Unfortunately, the defaults have resulted in some Chromebooks without VBT to rely on this behaviour, so we split out the defaults for the missing VBT case. Reviewed-by: Manasi Navare Cc: Manasi Navare Cc: Ville Syrjälä Signed-off-by: Jani Nikula Link: http://patchwork.freedesktop.org/patch/msgid/95c26079ff640d43f53b944f17e9fc356b36daec.1489152288.git.jani.nikula@intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/intel_bios.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/intel_bios.c b/drivers/gpu/drm/i915/intel_bios.c index 954eabbf0347..4ac36e3c341f 100644 --- a/drivers/gpu/drm/i915/intel_bios.c +++ b/drivers/gpu/drm/i915/intel_bios.c @@ -1332,6 +1332,7 @@ parse_device_mapping(struct drm_i915_private *dev_priv, return; } +/* Common defaults which may be overridden by VBT. */ static void init_vbt_defaults(struct drm_i915_private *dev_priv) { @@ -1368,6 +1369,18 @@ init_vbt_defaults(struct drm_i915_private *dev_priv) &dev_priv->vbt.ddi_port_info[port]; info->hdmi_level_shift = HDMI_LEVEL_SHIFT_UNKNOWN; + } +} + +/* Defaults to initialize only if there is no VBT. */ +static void +init_vbt_missing_defaults(struct drm_i915_private *dev_priv) +{ + enum port port; + + for (port = PORT_A; port < I915_MAX_PORTS; port++) { + struct ddi_vbt_port_info *info = + &dev_priv->vbt.ddi_port_info[port]; info->supports_dvi = (port != PORT_A && port != PORT_E); info->supports_hdmi = info->supports_dvi; @@ -1504,8 +1517,10 @@ void intel_bios_init(struct drm_i915_private *dev_priv) parse_ddi_ports(dev_priv, bdb); out: - if (!vbt) + if (!vbt) { DRM_INFO("Failed to find VBIOS tables (VBT)\n"); + init_vbt_missing_defaults(dev_priv); + } if (bios) pci_unmap_rom(pdev, bios); From afe8d4a51c763b9da32c993295af71252bbb7a2c Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 2 Mar 2017 14:03:20 +0530 Subject: [PATCH 224/235] cpufreq: schedutil: move cached_raw_freq to struct sugov_policy commit 6c4f0fa643cb9e775dcc976e3db00d649468ff1d upstream. cached_raw_freq applies to the entire cpufreq policy and not individual CPUs. Apart from wasting per-cpu memory, it is actually wrong to keep it in struct sugov_cpu as we may end up comparing next_freq with a stale cached_raw_freq of a random CPU. Move cached_raw_freq to struct sugov_policy. Fixes: 5cbea46984d6 (cpufreq: schedutil: map raw required frequency to driver frequency) Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- kernel/sched/cpufreq_schedutil.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 69e06898997d..6415c04be3f8 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -32,6 +32,7 @@ struct sugov_policy { u64 last_freq_update_time; s64 freq_update_delay_ns; unsigned int next_freq; + unsigned int cached_raw_freq; /* The next fields are only needed if fast switch cannot be used. */ struct irq_work irq_work; @@ -46,7 +47,6 @@ struct sugov_cpu { struct update_util_data update_util; struct sugov_policy *sg_policy; - unsigned int cached_raw_freq; unsigned long iowait_boost; unsigned long iowait_boost_max; u64 last_update; @@ -140,9 +140,9 @@ static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util, freq = (freq + (freq >> 2)) * util / max; - if (freq == sg_cpu->cached_raw_freq && sg_policy->next_freq != UINT_MAX) + if (freq == sg_policy->cached_raw_freq && sg_policy->next_freq != UINT_MAX) return sg_policy->next_freq; - sg_cpu->cached_raw_freq = freq; + sg_policy->cached_raw_freq = freq; return cpufreq_driver_resolve_freq(policy, freq); } @@ -502,6 +502,7 @@ static int sugov_start(struct cpufreq_policy *policy) sg_policy->next_freq = UINT_MAX; sg_policy->work_in_progress = false; sg_policy->need_freq_update = false; + sg_policy->cached_raw_freq = 0; for_each_cpu(cpu, policy->cpus) { struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); @@ -512,7 +513,6 @@ static int sugov_start(struct cpufreq_policy *policy) sg_cpu->max = 0; sg_cpu->flags = SCHED_CPUFREQ_RT; sg_cpu->last_update = 0; - sg_cpu->cached_raw_freq = 0; sg_cpu->iowait_boost = 0; sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, From a8fc3159ee2c5aa0f557bc4581f8a32461f74407 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 19 Mar 2017 14:30:02 +0100 Subject: [PATCH 225/235] cpufreq: schedutil: Fix per-CPU structure initialization in sugov_start() commit 4296f23ed49a15d36949458adcc66ff993dee2a8 upstream. sugov_start() only initializes struct sugov_cpu per-CPU structures for shared policies, but it should do that for single-CPU policies too. That in particular makes the IO-wait boost mechanism work in the cases when cpufreq policies correspond to individual CPUs. Fixes: 21ca6d2c52f8 (cpufreq: schedutil: Add iowait boosting) Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- kernel/sched/cpufreq_schedutil.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 6415c04be3f8..cb771c76682e 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -507,20 +507,14 @@ static int sugov_start(struct cpufreq_policy *policy) for_each_cpu(cpu, policy->cpus) { struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); + memset(sg_cpu, 0, sizeof(*sg_cpu)); sg_cpu->sg_policy = sg_policy; - if (policy_is_shared(policy)) { - sg_cpu->util = 0; - sg_cpu->max = 0; - sg_cpu->flags = SCHED_CPUFREQ_RT; - sg_cpu->last_update = 0; - sg_cpu->iowait_boost = 0; - sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; - cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, - sugov_update_shared); - } else { - cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, - sugov_update_single); - } + sg_cpu->flags = SCHED_CPUFREQ_RT; + sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; + cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, + policy_is_shared(policy) ? + sugov_update_shared : + sugov_update_single); } return 0; } From 3eb235a1af142e09f48d022970cfbde20603af9e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 21 May 2017 00:37:10 +0200 Subject: [PATCH 226/235] netfilter: nft_set_rbtree: handle element re-addition after deletion commit d2df92e98a34a5619dadd29c6291113c009181e7 upstream. The existing code selects no next branch to be inspected when re-inserting an inactive element into the rb-tree, looping endlessly. This patch restricts the check for active elements to the EEXIST case only. Fixes: e701001e7cbe ("netfilter: nft_rbtree: allow adjacent intervals with dynamic updates") Reported-by: Wolfgang Bumiller Tested-by: Wolfgang Bumiller Signed-off-by: Pablo Neira Ayuso Signed-off-by: Greg Kroah-Hartman --- net/netfilter/nft_set_rbtree.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 36493a7cae88..93820e0d8814 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -118,17 +118,17 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, else if (d > 0) p = &parent->rb_right; else { - if (nft_set_elem_active(&rbe->ext, genmask)) { - if (nft_rbtree_interval_end(rbe) && - !nft_rbtree_interval_end(new)) - p = &parent->rb_left; - else if (!nft_rbtree_interval_end(rbe) && - nft_rbtree_interval_end(new)) - p = &parent->rb_right; - else { - *ext = &rbe->ext; - return -EEXIST; - } + if (nft_rbtree_interval_end(rbe) && + !nft_rbtree_interval_end(new)) { + p = &parent->rb_left; + } else if (!nft_rbtree_interval_end(rbe) && + nft_rbtree_interval_end(new)) { + p = &parent->rb_right; + } else if (nft_set_elem_active(&rbe->ext, genmask)) { + *ext = &rbe->ext; + return -EEXIST; + } else { + p = &parent->rb_left; } } } From 05afd4c0af6a43f6bda7caaacb01bc0116d50d3b Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 14 Jun 2017 15:06:16 +0200 Subject: [PATCH 227/235] Linux 4.9.32 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 3601995f63f9..3d8781997968 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 9 -SUBLEVEL = 31 +SUBLEVEL = 32 EXTRAVERSION = NAME = Roaring Lionus From 2b3da4a6cae657b4b5595f63f4352430fd061929 Mon Sep 17 00:00:00 2001 From: Ganesh Mahendran Date: Thu, 25 May 2017 15:20:29 +0800 Subject: [PATCH 228/235] ANDROID: uid_sys_stats: check previous uid_entry before call find_or_register_uid Theads in a process are stored in list struct task_struct->thread_group, so it will be visited continiously in below loop: do_each_thread(temp, task) { ... } while_each_thread(temp, task); I add some log in the loop, we can see below information: [ 65.033561] uid 1000, uid_entry ffffffc0f2761600 [ 65.033567] uid 1000, uid_entry ffffffc0f2761600 [ 65.033574] uid 1000, uid_entry ffffffc0f2761600 [ 65.033581] uid 1000, uid_entry ffffffc0f2761600 [ 65.033588] uid 1000, uid_entry ffffffc0f2761600 [ 65.033595] uid 1000, uid_entry ffffffc0f2761600 [ 65.033602] uid 1000, uid_entry ffffffc0f2761600 [ 65.033609] uid 1000, uid_entry ffffffc0f2761600 [ 65.033615] uid 1000, uid_entry ffffffc0f2761600 [ 65.033622] uid 1000, uid_entry ffffffc0f2761600 [ 65.033629] uid 1000, uid_entry ffffffc0f2761600 [ 65.033637] uid 1000, uid_entry ffffffc0f2761600 [ 65.033644] uid 1000, uid_entry ffffffc0f2761600 [ 65.033651] uid 1000, uid_entry ffffffc0f2761600 [ 65.033658] uid 1000, uid_entry ffffffc0f2761600 [ 65.033665] uid 1000, uid_entry ffffffc0f2761600 [ 65.033672] uid 1000, uid_entry ffffffc0f2761600 [ 65.033680] uid 1000, uid_entry ffffffc0f2761600 [ 65.033687] uid 1000, uid_entry ffffffc0f2761600 [ 65.033694] uid 1000, uid_entry ffffffc0f2761600 [ 65.033701] uid 1000, uid_entry ffffffc0f2761600 [ 65.033708] uid 1000, uid_entry ffffffc0f2761600 [ 65.033715] uid 1000, uid_entry ffffffc0f2761600 [ 65.033722] uid 1000, uid_entry ffffffc0f2761600 [ 65.033729] uid 1000, uid_entry ffffffc0f2761600 [ 65.033736] uid 1000, uid_entry ffffffc0f2761600 [ 65.033743] uid 1000, uid_entry ffffffc0f2761600 [ 65.033750] uid 1000, uid_entry ffffffc0f2761600 [ 65.033757] uid 1000, uid_entry ffffffc0f2761600 [ 65.033763] uid 1000, uid_entry ffffffc0f2761600 [ 65.033770] uid 1000, uid_entry ffffffc0f2761600 [ 65.033777] uid 1000, uid_entry ffffffc0f2761600 [ 65.033784] uid 1000, uid_entry ffffffc0f2761600 [ 65.033791] uid 1000, uid_entry ffffffc0f2761600 [ 65.033798] uid 1000, uid_entry ffffffc0f2761600 So we can check the previous uid_entry before calling find_or_register_uid to save time. Change-Id: I05ec1a1405a80c0a620cb4b4b2f6483dbfde7829 Signed-off-by: Ganesh Mahendran --- drivers/misc/uid_sys_stats.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/misc/uid_sys_stats.c b/drivers/misc/uid_sys_stats.c index 871040e17b6b..8bf4c57dba5c 100644 --- a/drivers/misc/uid_sys_stats.c +++ b/drivers/misc/uid_sys_stats.c @@ -95,7 +95,7 @@ static struct uid_entry *find_or_register_uid(uid_t uid) static int uid_cputime_show(struct seq_file *m, void *v) { - struct uid_entry *uid_entry; + struct uid_entry *uid_entry = NULL; struct task_struct *task, *temp; struct user_namespace *user_ns = current_user_ns(); cputime_t utime; @@ -113,7 +113,8 @@ static int uid_cputime_show(struct seq_file *m, void *v) read_lock(&tasklist_lock); do_each_thread(temp, task) { uid = from_kuid_munged(user_ns, task_uid(task)); - uid_entry = find_or_register_uid(uid); + if (!uid_entry || uid_entry->uid != uid) + uid_entry = find_or_register_uid(uid); if (!uid_entry) { read_unlock(&tasklist_lock); rt_mutex_unlock(&uid_lock); @@ -252,7 +253,7 @@ static void compute_uid_io_bucket_stats(struct io_stats *io_bucket, static void update_io_stats_all_locked(void) { - struct uid_entry *uid_entry; + struct uid_entry *uid_entry = NULL; struct task_struct *task, *temp; struct user_namespace *user_ns = current_user_ns(); unsigned long bkt; @@ -265,7 +266,8 @@ static void update_io_stats_all_locked(void) rcu_read_lock(); do_each_thread(temp, task) { uid = from_kuid_munged(user_ns, task_uid(task)); - uid_entry = find_or_register_uid(uid); + if (!uid_entry || uid_entry->uid != uid) + uid_entry = find_or_register_uid(uid); if (!uid_entry) continue; add_uid_io_stats(uid_entry, task, UID_STATE_TOTAL_CURR); From edb6da11b95c34269d21e99e3b738da1114bdb5f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 6 Jun 2017 18:38:04 +0200 Subject: [PATCH 229/235] FROMLIST: bpf: cgroup skb progs cannot access ld_abs/ind Commit fb9a307d11d6 ("bpf: Allow CGROUP_SKB eBPF program to access sk_buff") enabled programs of BPF_PROG_TYPE_CGROUP_SKB type to use ld_abs/ind instructions. However, at this point, we cannot use them, since offsets relative to SKF_LL_OFF will end up pointing skb_mac_header(skb) out of bounds since in the egress path it is not yet set at that point in time, but only after __dev_queue_xmit() did a general reset on the mac header. bpf_internal_load_pointer_neg_helper() will then end up reading data from a wrong offset. BPF_PROG_TYPE_CGROUP_SKB programs can use bpf_skb_load_bytes() already to access packet data, which is also more flexible than the insns carried over from cBPF. Fixes: fb9a307d11d6 ("bpf: Allow CGROUP_SKB eBPF program to access sk_buff") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Cc: Chenbo Feng Signed-off-by: David S. Miller (url: http://patchwork.ozlabs.org/patch/771946/) Signed-off-by: Chenbo Feng Bug: 30950746 Change-Id: Ia32ac79d8c0d18f811ec101897284a8b60cb042a --- kernel/bpf/verifier.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fe158bd01dc6..44c17f47d94c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2164,7 +2164,6 @@ static bool may_access_skb(enum bpf_prog_type type) case BPF_PROG_TYPE_SOCKET_FILTER: case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: - case BPF_PROG_TYPE_CGROUP_SKB: return true; default: return false; From 2c4f58af89edfb798e2212618a5c4337010dd431 Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Fri, 9 Jun 2017 12:06:07 -0700 Subject: [PATCH 230/235] FROMLIST: ipv6: Initial skb->dev and skb->protocol in ip6_output Move the initialization of skb->dev and skb->protocol from ip6_finish_output2 to ip6_output. This can make the skb->dev and skb->protocol information avalaible to the CGROUP eBPF filter. Signed-off-by: Chenbo Feng Acked-by: Eric Dumazet Signed-off-by: David S. Miller (url: http://patchwork.ozlabs.org/patch/774124/) Bug: 30950746 Change-Id: Iac2304f7ba8cd769ee01a062cde2deb50562c3ad --- net/ipv6/ip6_output.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index d472a5fd4da4..1b2ed0ea5eb1 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -67,9 +67,6 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * struct in6_addr *nexthop; int ret; - skb->protocol = htons(ETH_P_IPV6); - skb->dev = dev; - if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) { struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); @@ -153,6 +150,9 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) struct net_device *dev = skb_dst(skb)->dev; struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); + skb->protocol = htons(ETH_P_IPV6); + skb->dev = dev; + if (unlikely(idev->cnf.disable_ipv6)) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); From e22ff4d119bc7708d51af23cb78b41b8c6310515 Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Fri, 9 Jun 2017 12:17:37 -0700 Subject: [PATCH 231/235] FROMLIST: bpf: Remove duplicate tcp_filter hook in ipv6 There are two tcp_filter hooks in tcp_ipv6 ingress path currently. One is at tcp_v6_rcv and another is in tcp_v6_do_rcv. It seems the tcp_filter() call inside tcp_v6_do_rcv is redundent and some packet will be filtered twice in this situation. This will cause trouble when using eBPF filters to account traffic data. Signed-off-by: Chenbo Feng Acked-by: Eric Dumazet Signed-off-by: David S. Miller (url: http://patchwork.ozlabs.org/patch/774126/) Bug: 30950746 Change-Id: Id4fe8cd5b7bac11a4d4141e203dd4b9fa59f3d6c --- net/ipv6/tcp_ipv6.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 368c23a44607..f54c7d3469ad 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1237,9 +1237,6 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_do_rcv(sk, skb); - if (tcp_filter(sk, skb)) - goto discard; - /* * socket locking is here for SMP purposes as backlog rcv * is currently called with bh processing disabled. From d937a80c5f34fb2e932c36cf27c82f5b0b00d05f Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Sat, 10 Jun 2017 12:35:38 -0700 Subject: [PATCH 232/235] FROMLIST: Remove the redundant skb->dev initialization in ip6_fragment After moves the skb->dev and skb->protocol initialization into ip6_output, setting the skb->dev inside ip6_fragment is unnecessary. Fixes: 97a7a37a7b7b("ipv6: Initial skb->dev and skb->protocol in ip6_output") Signed-off-by: Chenbo Feng Signed-off-by: David S. Miller (url: http://patchwork.ozlabs.org/patch/774260/) Bug: 30950746 Change-Id: I6ab42ecca2e2ab57f2c5988edf19d584de35e007 --- net/ipv6/ip6_output.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 1b2ed0ea5eb1..7cec18062244 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -867,7 +867,6 @@ fail_toobig: if (skb->sk && dst_allfrag(skb_dst(skb))) sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK); - skb->dev = skb_dst(skb)->dev; icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); err = -EMSGSIZE; From f3a511912a7b56768101244a632c95d2841714d3 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 8 Dec 2016 19:55:22 +0800 Subject: [PATCH 233/235] usb: gadget: f_fs: Fix possibe deadlock When system try to close /dev/usb-ffs/adb/ep0 on one core, at the same time another core try to attach new UDC, which will cause deadlock as below scenario. Thus we should release ffs lock before issuing unregister_gadget_item(). [ 52.642225] c1 ====================================================== [ 52.642228] c1 [ INFO: possible circular locking dependency detected ] [ 52.642236] c1 4.4.6+ #1 Tainted: G W O [ 52.642241] c1 ------------------------------------------------------- [ 52.642245] c1 usb ffs open/2808 is trying to acquire lock: [ 52.642270] c0 (udc_lock){+.+.+.}, at: [] usb_gadget_unregister_driver+0x3c/0xc8 [ 52.642272] c1 but task is already holding lock: [ 52.642283] c0 (ffs_lock){+.+.+.}, at: [] ffs_data_clear+0x30/0x140 [ 52.642285] c1 which lock already depends on the new lock. [ 52.642287] c1 the existing dependency chain (in reverse order) is: [ 52.642295] c0 -> #1 (ffs_lock){+.+.+.}: [ 52.642307] c0 [] __lock_acquire+0x20f0/0x2238 [ 52.642314] c0 [] lock_acquire+0xe4/0x298 [ 52.642322] c0 [] mutex_lock_nested+0x7c/0x3cc [ 52.642328] c0 [] ffs_func_bind+0x504/0x6e8 [ 52.642334] c0 [] usb_add_function+0x84/0x184 [ 52.642340] c0 [] configfs_composite_bind+0x264/0x39c [ 52.642346] c0 [] udc_bind_to_driver+0x58/0x11c [ 52.642352] c0 [] usb_udc_attach_driver+0x90/0xc8 [ 52.642358] c0 [] gadget_dev_desc_UDC_store+0xd4/0x128 [ 52.642369] c0 [] configfs_write_file+0xd0/0x13c [ 52.642376] c0 [] vfs_write+0xb8/0x214 [ 52.642381] c0 [] SyS_write+0x54/0xb0 [ 52.642388] c0 [] el0_svc_naked+0x24/0x28 [ 52.642395] c0 -> #0 (udc_lock){+.+.+.}: [ 52.642401] c0 [] print_circular_bug+0x84/0x2e4 [ 52.642407] c0 [] __lock_acquire+0x2138/0x2238 [ 52.642412] c0 [] lock_acquire+0xe4/0x298 [ 52.642420] c0 [] mutex_lock_nested+0x7c/0x3cc [ 52.642427] c0 [] usb_gadget_unregister_driver+0x3c/0xc8 [ 52.642432] c0 [] unregister_gadget_item+0x28/0x44 [ 52.642439] c0 [] ffs_data_clear+0x138/0x140 [ 52.642444] c0 [] ffs_data_reset+0x20/0x6c [ 52.642450] c0 [] ffs_data_closed+0xac/0x12c [ 52.642454] c0 [] ffs_ep0_release+0x20/0x2c [ 52.642460] c0 [] __fput+0xb0/0x1f4 [ 52.642466] c0 [] ____fput+0x20/0x2c [ 52.642473] c0 [] task_work_run+0xb4/0xe8 [ 52.642482] c0 [] do_exit+0x360/0xb9c [ 52.642487] c0 [] do_group_exit+0x4c/0xb0 [ 52.642494] c0 [] get_signal+0x380/0x89c [ 52.642501] c0 [] do_signal+0x154/0x518 [ 52.642507] c0 [] do_notify_resume+0x70/0x78 [ 52.642512] c0 [] work_pending+0x1c/0x20 [ 52.642514] c1 other info that might help us debug this: [ 52.642517] c1 Possible unsafe locking scenario: [ 52.642518] c1 CPU0 CPU1 [ 52.642520] c1 ---- ---- [ 52.642525] c0 lock(ffs_lock); [ 52.642529] c0 lock(udc_lock); [ 52.642533] c0 lock(ffs_lock); [ 52.642537] c0 lock(udc_lock); [ 52.642539] c1 *** DEADLOCK *** [ 52.642543] c1 1 lock held by usb ffs open/2808: [ 52.642555] c0 #0: (ffs_lock){+.+.+.}, at: [] ffs_data_clear+0x30/0x140 [ 52.642557] c1 stack backtrace: [ 52.642563] c1 CPU: 1 PID: 2808 Comm: usb ffs open Tainted: G [ 52.642565] c1 Hardware name: Spreadtrum SP9860g Board (DT) [ 52.642568] c1 Call trace: [ 52.642573] c1 [] dump_backtrace+0x0/0x170 [ 52.642577] c1 [] show_stack+0x20/0x28 [ 52.642583] c1 [] dump_stack+0xa8/0xe0 [ 52.642587] c1 [] print_circular_bug+0x1fc/0x2e4 [ 52.642591] c1 [] __lock_acquire+0x2138/0x2238 [ 52.642595] c1 [] lock_acquire+0xe4/0x298 [ 52.642599] c1 [] mutex_lock_nested+0x7c/0x3cc [ 52.642604] c1 [] usb_gadget_unregister_driver+0x3c/0xc8 [ 52.642608] c1 [] unregister_gadget_item+0x28/0x44 [ 52.642613] c1 [] ffs_data_clear+0x138/0x140 [ 52.642618] c1 [] ffs_data_reset+0x20/0x6c [ 52.642621] c1 [] ffs_data_closed+0xac/0x12c [ 52.642625] c1 [] ffs_ep0_release+0x20/0x2c [ 52.642629] c1 [] __fput+0xb0/0x1f4 [ 52.642633] c1 [] ____fput+0x20/0x2c [ 52.642636] c1 [] task_work_run+0xb4/0xe8 [ 52.642640] c1 [] do_exit+0x360/0xb9c [ 52.642644] c1 [] do_group_exit+0x4c/0xb0 [ 52.642647] c1 [] get_signal+0x380/0x89c [ 52.642651] c1 [] do_signal+0x154/0x518 [ 52.642656] c1 [] do_notify_resume+0x70/0x78 [ 52.642659] c1 [] work_pending+0x1c/0x20 Bug: 62572621 Change-Id: I78adea011c3a87dfeb5c15750b7737975fda1742 Acked-by: Michal Nazarewicz Signed-off-by: Baolin Wang Signed-off-by: Felipe Balbi (cherry picked from commit b3ce3ce02d146841af012d08506b4071db8ffde3) Signed-off-by: Jerry Zhang --- drivers/usb/gadget/function/f_fs.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 89081b834615..8a788d69e4d5 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -3688,6 +3688,7 @@ static void ffs_closed(struct ffs_data *ffs) { struct ffs_dev *ffs_obj; struct f_fs_opts *opts; + struct config_item *ci; ENTER(); ffs_dev_lock(); @@ -3711,8 +3712,11 @@ static void ffs_closed(struct ffs_data *ffs) || !atomic_read(&opts->func_inst.group.cg_item.ci_kref.refcount)) goto done; - unregister_gadget_item(ffs_obj->opts-> - func_inst.group.cg_item.ci_parent->ci_parent); + ci = opts->func_inst.group.cg_item.ci_parent->ci_parent; + ffs_dev_unlock(); + + unregister_gadget_item(ci); + return; done: ffs_dev_unlock(); } From dc7257223f0d18e9a99a7e523fd7c50fc9e0f7fc Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Tue, 13 Jun 2017 09:23:25 -0700 Subject: [PATCH 234/235] ANDROID: android-base.cfg: split out arm64-specific configs These config options are specific to arm64 so should not be universally required. Bug: 62523096 Change-Id: Ic5f35db71d73919f2958120f45dd717f5d05f4c5 Signed-off-by: Steve Muckle --- kernel/configs/android-base-arm64.cfg | 5 +++++ kernel/configs/android-base.config | 4 ---- 2 files changed, 5 insertions(+), 4 deletions(-) create mode 100644 kernel/configs/android-base-arm64.cfg diff --git a/kernel/configs/android-base-arm64.cfg b/kernel/configs/android-base-arm64.cfg new file mode 100644 index 000000000000..43f23d6b5391 --- /dev/null +++ b/kernel/configs/android-base-arm64.cfg @@ -0,0 +1,5 @@ +# KEEP ALPHABETICALLY SORTED +CONFIG_ARMV8_DEPRECATED=y +CONFIG_CP15_BARRIER_EMULATION=y +CONFIG_SETEND_EMULATION=y +CONFIG_SWP_EMULATION=y diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index fb6017e1a869..301e1a6c33b1 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config @@ -12,7 +12,6 @@ CONFIG_ANDROID=y CONFIG_ANDROID_BINDER_DEVICES=binder,hwbinder,vndbinder CONFIG_ANDROID_BINDER_IPC=y CONFIG_ANDROID_LOW_MEMORY_KILLER=y -CONFIG_ARMV8_DEPRECATED=y CONFIG_ASHMEM=y CONFIG_AUDIT=y CONFIG_BLK_DEV_INITRD=y @@ -22,7 +21,6 @@ CONFIG_CGROUP_DEBUG=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_SCHED=y CONFIG_CGROUP_BPF=y -CONFIG_CP15_BARRIER_EMULATION=y CONFIG_DEFAULT_SECURITY_SELINUX=y CONFIG_EMBEDDED=y CONFIG_FB=y @@ -156,9 +154,7 @@ CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y CONFIG_SECURITY_SELINUX=y -CONFIG_SETEND_EMULATION=y CONFIG_STAGING=y -CONFIG_SWP_EMULATION=y CONFIG_SYNC=y CONFIG_TUN=y CONFIG_UID_SYS_STATS=y From 0bdd09ad31a1068589c94444105fa64581b0d794 Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Tue, 16 May 2017 16:48:49 -0700 Subject: [PATCH 235/235] ANDROID: sdcardfs: remove dead function open_flags_to_access_mode() smatch warns about the suspicious formatting in the last line of open_flags_to_access_mode(). It turns out the only caller was deleted over a year ago by "ANDROID: sdcardfs: Bring up to date with Android M permissions:", so we can "fix" the function's formatting by deleting it. Change-Id: Id85946f3eb01722eef35b1815f405a6fda3aa4ff Signed-off-by: Greg Hackmann --- fs/sdcardfs/packagelist.c | 13 ------------- fs/sdcardfs/sdcardfs.h | 1 - 2 files changed, 14 deletions(-) diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index 00a0f656acc7..6da0c2186d39 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -174,19 +174,6 @@ int check_caller_access_to_name(struct inode *parent_node, const struct qstr *na return 1; } -/* This function is used when file opening. The open flags must be - * checked before calling check_caller_access_to_name() - */ -int open_flags_to_access_mode(int open_flags) -{ - if ((open_flags & O_ACCMODE) == O_RDONLY) - return 0; /* R_OK */ - if ((open_flags & O_ACCMODE) == O_WRONLY) - return 1; /* W_OK */ - /* Probably O_RDRW, but treat as default to be safe */ - return 1; /* R_OK | W_OK */ -} - static struct hashtable_entry *alloc_hashtable_entry(const struct qstr *key, appid_t value) { diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index 3687b22a2e6b..4e0ce49a906d 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -499,7 +499,6 @@ extern appid_t get_appid(const char *app_name); extern appid_t get_ext_gid(const char *app_name); extern appid_t is_excluded(const char *app_name, userid_t userid); extern int check_caller_access_to_name(struct inode *parent_node, const struct qstr *name); -extern int open_flags_to_access_mode(int open_flags); extern int packagelist_init(void); extern void packagelist_exit(void);