From 608d3348e0abac289a11ce2b650b59eb39f8f967 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 19 Mar 2015 14:17:42 +0100 Subject: [PATCH 001/236] iio: mxs-lradc: fix merge error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit e7f3db14eacaf1993a70b1517582603dfdf34988 (89bb35e200bee745c539a96666e0792301ca40f1 upstream) was backported incorrectly by me, so fix it up, as the driver is now broken. Sorry about that. Reported-by: Kristina Martšenko Cc: Marek Vasut Cc: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/staging/iio/adc/mxs-lradc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/staging/iio/adc/mxs-lradc.c b/drivers/staging/iio/adc/mxs-lradc.c index be89260c23a6..27e1a6e62d06 100644 --- a/drivers/staging/iio/adc/mxs-lradc.c +++ b/drivers/staging/iio/adc/mxs-lradc.c @@ -1159,7 +1159,6 @@ static irqreturn_t mxs_lradc_handle_irq(int irq, void *data) LRADC_CTRL1_LRADC_IRQ(TOUCHSCREEN_VCHANNEL2)); } - if (iio_buffer_enabled(iio)) if (iio_buffer_enabled(iio)) { if (reg & lradc->buffer_vchans) iio_trigger_poll(iio->trig, iio_get_time_ns()); From 8df06a353b6ac3ceb942364a343ae44fcc93c363 Mon Sep 17 00:00:00 2001 From: Andreas Larsson Date: Thu, 18 Dec 2014 13:23:23 +0100 Subject: [PATCH 002/236] sparc32: destroy_context() and switch_mm() needs to disable interrupts. [ Upstream commit 66d0f7ec9f1038452178b1993fc07fd96d30fd38 ] Load balancing can be triggered in the critical sections protected by srmmu_context_spinlock in destroy_context() and switch_mm() and can hang the cpu waiting for the rq lock of another cpu that in turn has called switch_mm hangning on srmmu_context_spinlock leading to deadlock. So, disable interrupt while taking srmmu_context_spinlock in destroy_context() and switch_mm() so we don't deadlock. See also commit 77b838fa1ef0 ("[SPARC64]: destroy_context() needs to disable interrupts.") Signed-off-by: Andreas Larsson Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- arch/sparc/mm/srmmu.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c index cfbe53c17b0d..09daebdee552 100644 --- a/arch/sparc/mm/srmmu.c +++ b/arch/sparc/mm/srmmu.c @@ -460,10 +460,12 @@ static void __init sparc_context_init(int numctx) void switch_mm(struct mm_struct *old_mm, struct mm_struct *mm, struct task_struct *tsk) { + unsigned long flags; + if (mm->context == NO_CONTEXT) { - spin_lock(&srmmu_context_spinlock); + spin_lock_irqsave(&srmmu_context_spinlock, flags); alloc_context(old_mm, mm); - spin_unlock(&srmmu_context_spinlock); + spin_unlock_irqrestore(&srmmu_context_spinlock, flags); srmmu_ctxd_set(&srmmu_context_table[mm->context], mm->pgd); } @@ -988,14 +990,15 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) void destroy_context(struct mm_struct *mm) { + unsigned long flags; if (mm->context != NO_CONTEXT) { flush_cache_mm(mm); srmmu_ctxd_set(&srmmu_context_table[mm->context], srmmu_swapper_pg_dir); flush_tlb_mm(mm); - spin_lock(&srmmu_context_spinlock); + spin_lock_irqsave(&srmmu_context_spinlock, flags); free_context(mm->context); - spin_unlock(&srmmu_context_spinlock); + spin_unlock_irqrestore(&srmmu_context_spinlock, flags); mm->context = NO_CONTEXT; } } From 61b30f78cfbcc44800857e0da1a9d2372ffab86e Mon Sep 17 00:00:00 2001 From: Rob Gardner Date: Mon, 2 Mar 2015 23:16:55 -0700 Subject: [PATCH 003/236] sparc: semtimedop() unreachable due to comparison error [ Upstream commit 53eb2516972b8c4628651dfcb926cb9ef8b2864a ] A bug was reported that the semtimedop() system call was always failing eith ENOSYS. Since SEMCTL is defined as 3, and SEMTIMEDOP is defined as 4, the comparison "call <= SEMCTL" will always prevent SEMTIMEDOP from getting through to the semaphore ops switch statement. This is corrected by changing the comparison to "call <= SEMTIMEDOP". Orabug: 20633375 Signed-off-by: Rob Gardner Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- arch/sparc/kernel/sys_sparc_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index beb0b5a5f21f..25db14a33d03 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -332,7 +332,7 @@ SYSCALL_DEFINE6(sparc_ipc, unsigned int, call, int, first, unsigned long, second long err; /* No need for backward compatibility. We can start fresh... */ - if (call <= SEMCTL) { + if (call <= SEMTIMEDOP) { switch (call) { case SEMOP: err = sys_semtimedop(first, ptr, From e6f1c6bb4214265610cd351f91ba27feffa67ffa Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 19 Mar 2015 16:05:57 -0400 Subject: [PATCH 004/236] sparc: perf: Remove redundant perf_pmu_{en|dis}able calls [ Upstream commit 5b0d4b5514bbcce69b516d0742f2cfc84ebd6db3 ] perf_pmu_disable is called by core perf code before pmu->del and the enable function is called by core perf code afterwards. No need to call again within sparc_pmu_del. Ditto for pmu->add and sparc_pmu_add. Signed-off-by: David Ahern Acked-by: Bob Picco Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- arch/sparc/kernel/perf_event.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index 617b9fe33771..102199579d67 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -1101,7 +1101,6 @@ static void sparc_pmu_del(struct perf_event *event, int _flags) int i; local_irq_save(flags); - perf_pmu_disable(event->pmu); for (i = 0; i < cpuc->n_events; i++) { if (event == cpuc->event[i]) { @@ -1127,7 +1126,6 @@ static void sparc_pmu_del(struct perf_event *event, int _flags) } } - perf_pmu_enable(event->pmu); local_irq_restore(flags); } @@ -1361,7 +1359,6 @@ static int sparc_pmu_add(struct perf_event *event, int ef_flags) unsigned long flags; local_irq_save(flags); - perf_pmu_disable(event->pmu); n0 = cpuc->n_events; if (n0 >= sparc_pmu->max_hw_events) @@ -1394,7 +1391,6 @@ nocheck: ret = 0; out: - perf_pmu_enable(event->pmu); local_irq_restore(flags); return ret; } From c6bf6b54c72ecadd23107f83d8d23aac2b8bac83 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 19 Mar 2015 16:06:17 -0400 Subject: [PATCH 005/236] sparc: perf: Make counting mode actually work [ Upstream commit d51291cb8f32bfae6b331e1838651f3ddefa73a5 ] Currently perf-stat (aka, counting mode) does not work: $ perf stat ls ... Performance counter stats for 'ls': 1.585665 task-clock (msec) # 0.580 CPUs utilized 24 context-switches # 0.015 M/sec 0 cpu-migrations # 0.000 K/sec 86 page-faults # 0.054 M/sec cycles stalled-cycles-frontend stalled-cycles-backend instructions branches branch-misses 0.002735100 seconds time elapsed The reason is that state is never reset (stays with PERF_HES_UPTODATE set). Add a call to sparc_pmu_enable_event during the added_event handling. Clean up the encoding since pmu_start calls sparc_pmu_enable_event which does the same. Passing PERF_EF_RELOAD to sparc_pmu_start means the call to sparc_perf_event_set_period can be removed as well. With this patch: $ perf stat ls ... Performance counter stats for 'ls': 1.552890 task-clock (msec) # 0.552 CPUs utilized 24 context-switches # 0.015 M/sec 0 cpu-migrations # 0.000 K/sec 86 page-faults # 0.055 M/sec 5,748,997 cycles # 3.702 GHz stalled-cycles-frontend:HG stalled-cycles-backend:HG 1,684,362 instructions:HG # 0.29 insns per cycle 295,133 branches:HG # 190.054 M/sec 28,007 branch-misses:HG # 9.49% of all branches 0.002815665 seconds time elapsed Signed-off-by: David Ahern Acked-by: Bob Picco Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- arch/sparc/kernel/perf_event.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index 102199579d67..3ccb6777a7e1 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -960,6 +960,8 @@ out: cpuc->pcr[0] |= cpuc->event[0]->hw.config_base; } +static void sparc_pmu_start(struct perf_event *event, int flags); + /* On this PMU each PIC has it's own PCR control register. */ static void calculate_multiple_pcrs(struct cpu_hw_events *cpuc) { @@ -972,20 +974,13 @@ static void calculate_multiple_pcrs(struct cpu_hw_events *cpuc) struct perf_event *cp = cpuc->event[i]; struct hw_perf_event *hwc = &cp->hw; int idx = hwc->idx; - u64 enc; if (cpuc->current_idx[i] != PIC_NO_INDEX) continue; - sparc_perf_event_set_period(cp, hwc, idx); cpuc->current_idx[i] = idx; - enc = perf_event_get_enc(cpuc->events[i]); - cpuc->pcr[idx] &= ~mask_for_index(idx); - if (hwc->state & PERF_HES_STOPPED) - cpuc->pcr[idx] |= nop_for_index(idx); - else - cpuc->pcr[idx] |= event_encoding(enc, idx); + sparc_pmu_start(cp, PERF_EF_RELOAD); } out: for (i = 0; i < cpuc->n_events; i++) { From cdbe631ba36015f619ba716f48dc46e4455d836d Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 19 Mar 2015 16:06:53 -0400 Subject: [PATCH 006/236] sparc: Touch NMI watchdog when walking cpus and calling printk [ Upstream commit 31aaa98c248da766ece922bbbe8cc78cfd0bc920 ] With the increase in number of CPUs calls to functions that dump output to console (e.g., arch_trigger_all_cpu_backtrace) can take a long time to complete. If IRQs are disabled eventually the NMI watchdog kicks in and creates more havoc. Avoid by telling the NMI watchdog everything is ok. Signed-off-by: David Ahern Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- arch/sparc/kernel/process_64.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index c6f7113b6e2f..1a79d6877981 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -281,6 +281,8 @@ void arch_trigger_all_cpu_backtrace(void) printk(" TPC[%lx] O7[%lx] I7[%lx] RPC[%lx]\n", gp->tpc, gp->o7, gp->i7, gp->rpc); } + + touch_nmi_watchdog(); } memset(global_cpu_snapshot, 0, sizeof(global_cpu_snapshot)); @@ -356,6 +358,8 @@ static void pmu_snapshot_all_cpus(void) (cpu == this_cpu ? '*' : ' '), cpu, pp->pcr[0], pp->pcr[1], pp->pcr[2], pp->pcr[3], pp->pic[0], pp->pic[1], pp->pic[2], pp->pic[3]); + + touch_nmi_watchdog(); } memset(global_cpu_snapshot, 0, sizeof(global_cpu_snapshot)); From 82a8ba712c44e003903245100db356ef203f78fb Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 23 Mar 2015 09:22:10 -0700 Subject: [PATCH 007/236] sparc64: Fix several bugs in memmove(). [ Upstream commit 2077cef4d5c29cf886192ec32066f783d6a80db8 ] Firstly, handle zero length calls properly. Believe it or not there are a few of these happening during early boot. Next, we can't just drop to a memcpy() call in the forward copy case where dst <= src. The reason is that the cache initializing stores used in the Niagara memcpy() implementations can end up clearing out cache lines before we've sourced their original contents completely. For example, considering NG4memcpy, the main unrolled loop begins like this: load src + 0x00 load src + 0x08 load src + 0x10 load src + 0x18 load src + 0x20 store dst + 0x00 Assume dst is 64 byte aligned and let's say that dst is src - 8 for this memcpy() call. That store at the end there is the one to the first line in the cache line, thus clearing the whole line, which thus clobbers "src + 0x28" before it even gets loaded. To avoid this, just fall through to a simple copy only mildly optimized for the case where src and dst are 8 byte aligned and the length is a multiple of 8 as well. We could get fancy and call GENmemcpy() but this is good enough for how this thing is actually used. Reported-by: David Ahern Reported-by: Bob Picco Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- arch/sparc/lib/memmove.S | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/arch/sparc/lib/memmove.S b/arch/sparc/lib/memmove.S index b7f6334e159f..857ad4f8905f 100644 --- a/arch/sparc/lib/memmove.S +++ b/arch/sparc/lib/memmove.S @@ -8,9 +8,11 @@ .text ENTRY(memmove) /* o0=dst o1=src o2=len */ - mov %o0, %g1 + brz,pn %o2, 99f + mov %o0, %g1 + cmp %o0, %o1 - bleu,pt %xcc, memcpy + bleu,pt %xcc, 2f add %o1, %o2, %g7 cmp %g7, %o0 bleu,pt %xcc, memcpy @@ -24,7 +26,34 @@ ENTRY(memmove) /* o0=dst o1=src o2=len */ stb %g7, [%o0] bne,pt %icc, 1b sub %o0, 1, %o0 - +99: retl mov %g1, %o0 + + /* We can't just call memcpy for these memmove cases. On some + * chips the memcpy uses cache initializing stores and when dst + * and src are close enough, those can clobber the source data + * before we've loaded it in. + */ +2: or %o0, %o1, %g7 + or %o2, %g7, %g7 + andcc %g7, 0x7, %g0 + bne,pn %xcc, 4f + nop + +3: ldx [%o1], %g7 + add %o1, 8, %o1 + subcc %o2, 8, %o2 + add %o0, 8, %o0 + bne,pt %icc, 3b + stx %g7, [%o0 - 0x8] + ba,a,pt %xcc, 99b + +4: ldub [%o1], %g7 + add %o1, 1, %o1 + subcc %o2, 1, %o2 + add %o0, 1, %o0 + bne,pt %icc, 4b + stb %g7, [%o0 - 0x1] + ba,a,pt %xcc, 99b ENDPROC(memmove) From a1d55b36de6bf21d0d5405c5123d055ff46e8978 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Wed, 11 Mar 2015 14:29:17 +0300 Subject: [PATCH 008/236] net: sysctl_net_core: check SNDBUF and RCVBUF for min length [ Upstream commit b1cb59cf2efe7971d3d72a7b963d09a512d994c9 ] sysctl has sysctl.net.core.rmem_*/wmem_* parameters which can be set to incorrect values. Given that 'struct sk_buff' allocates from rcvbuf, incorrectly set buffer length could result to memory allocation failures. For example, set them as follows: # sysctl net.core.rmem_default=64 net.core.wmem_default = 64 # sysctl net.core.wmem_default=64 net.core.wmem_default = 64 # ping localhost -s 1024 -i 0 > /dev/null This could result to the following failure: skbuff: skb_over_panic: text:ffffffff81628db4 len:-32 put:-32 head:ffff88003a1cc200 data:ffff88003a1cc200 tail:0xffffffe0 end:0xc0 dev: kernel BUG at net/core/skbuff.c:102! invalid opcode: 0000 [#1] SMP ... task: ffff88003b7f5550 ti: ffff88003ae88000 task.ti: ffff88003ae88000 RIP: 0010:[] [] skb_put+0xa1/0xb0 RSP: 0018:ffff88003ae8bc68 EFLAGS: 00010296 RAX: 000000000000008d RBX: 00000000ffffffe0 RCX: 0000000000000000 RDX: ffff88003fdcf598 RSI: ffff88003fdcd9c8 RDI: ffff88003fdcd9c8 RBP: ffff88003ae8bc88 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000001 R11: 00000000000002b2 R12: 0000000000000000 R13: 0000000000000000 R14: ffff88003d3f7300 R15: ffff88000012a900 FS: 00007fa0e2b4a840(0000) GS:ffff88003fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000d0f7e0 CR3: 000000003b8fb000 CR4: 00000000000006f0 Stack: ffff88003a1cc200 00000000ffffffe0 00000000000000c0 ffffffff818cab1d ffff88003ae8bd68 ffffffff81628db4 ffff88003ae8bd48 ffff88003b7f5550 ffff880031a09408 ffff88003b7f5550 ffff88000012aa48 ffff88000012ab00 Call Trace: [] unix_stream_sendmsg+0x2c4/0x470 [] sock_write_iter+0x146/0x160 [] new_sync_write+0x92/0xd0 [] vfs_write+0xd6/0x180 [] SyS_write+0x59/0xd0 [] system_call_fastpath+0x12/0x17 Code: 00 00 48 89 44 24 10 8b 87 c8 00 00 00 48 89 44 24 08 48 8b 87 d8 00 00 00 48 c7 c7 30 db 91 81 48 89 04 24 31 c0 e8 4f a8 0e 00 <0f> 0b eb fe 66 66 2e 0f 1f 84 00 00 00 00 00 55 48 89 e5 48 83 RIP [] skb_put+0xa1/0xb0 RSP Kernel panic - not syncing: Fatal exception Moreover, the possible minimum is 1, so we can get another kernel panic: ... BUG: unable to handle kernel paging request at ffff88013caee5c0 IP: [] __alloc_skb+0x12f/0x1f0 ... Signed-off-by: Alexey Kodanev Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/sysctl_net_core.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index cf9cd13509a7..e731c96eac4b 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -25,6 +25,8 @@ static int zero = 0; static int one = 1; static int ushort_max = USHRT_MAX; +static int min_sndbuf = SOCK_MIN_SNDBUF; +static int min_rcvbuf = SOCK_MIN_RCVBUF; #ifdef CONFIG_RPS static int rps_sock_flow_sysctl(struct ctl_table *table, int write, @@ -223,7 +225,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = &min_sndbuf, }, { .procname = "rmem_max", @@ -231,7 +233,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = &min_rcvbuf, }, { .procname = "wmem_default", @@ -239,7 +241,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = &min_sndbuf, }, { .procname = "rmem_default", @@ -247,7 +249,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = &min_rcvbuf, }, { .procname = "dev_weight", From 03d7e82103b42e2c3135e6f1f36371c4d21a5ff5 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 11 Mar 2015 22:46:59 +0100 Subject: [PATCH 009/236] rds: avoid potential stack overflow [ Upstream commit f862e07cf95d5b62a5fc5e981dd7d0dbaf33a501 ] The rds_iw_update_cm_id function stores a large 'struct rds_sock' object on the stack in order to pass a pair of addresses. This happens to just fit withint the 1024 byte stack size warning limit on x86, but just exceed that limit on ARM, which gives us this warning: net/rds/iw_rdma.c:200:1: warning: the frame size of 1056 bytes is larger than 1024 bytes [-Wframe-larger-than=] As the use of this large variable is basically bogus, we can rearrange the code to not do that. Instead of passing an rds socket into rds_iw_get_device, we now just pass the two addresses that we have available in rds_iw_update_cm_id, and we change rds_iw_get_mr accordingly, to create two address structures on the stack there. Signed-off-by: Arnd Bergmann Acked-by: Sowmini Varadhan Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/rds/iw_rdma.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c index a817705ce2d0..dba8d0864f18 100644 --- a/net/rds/iw_rdma.c +++ b/net/rds/iw_rdma.c @@ -88,7 +88,9 @@ static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool, int *unpinned); static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr); -static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwdev, struct rdma_cm_id **cm_id) +static int rds_iw_get_device(struct sockaddr_in *src, struct sockaddr_in *dst, + struct rds_iw_device **rds_iwdev, + struct rdma_cm_id **cm_id) { struct rds_iw_device *iwdev; struct rds_iw_cm_id *i_cm_id; @@ -112,15 +114,15 @@ static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwd src_addr->sin_port, dst_addr->sin_addr.s_addr, dst_addr->sin_port, - rs->rs_bound_addr, - rs->rs_bound_port, - rs->rs_conn_addr, - rs->rs_conn_port); + src->sin_addr.s_addr, + src->sin_port, + dst->sin_addr.s_addr, + dst->sin_port); #ifdef WORKING_TUPLE_DETECTION - if (src_addr->sin_addr.s_addr == rs->rs_bound_addr && - src_addr->sin_port == rs->rs_bound_port && - dst_addr->sin_addr.s_addr == rs->rs_conn_addr && - dst_addr->sin_port == rs->rs_conn_port) { + if (src_addr->sin_addr.s_addr == src->sin_addr.s_addr && + src_addr->sin_port == src->sin_port && + dst_addr->sin_addr.s_addr == dst->sin_addr.s_addr && + dst_addr->sin_port == dst->sin_port) { #else /* FIXME - needs to compare the local and remote * ipaddr/port tuple, but the ipaddr is the only @@ -128,7 +130,7 @@ static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwd * zero'ed. It doesn't appear to be properly populated * during connection setup... */ - if (src_addr->sin_addr.s_addr == rs->rs_bound_addr) { + if (src_addr->sin_addr.s_addr == src->sin_addr.s_addr) { #endif spin_unlock_irq(&iwdev->spinlock); *rds_iwdev = iwdev; @@ -180,19 +182,13 @@ int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_i { struct sockaddr_in *src_addr, *dst_addr; struct rds_iw_device *rds_iwdev_old; - struct rds_sock rs; struct rdma_cm_id *pcm_id; int rc; src_addr = (struct sockaddr_in *)&cm_id->route.addr.src_addr; dst_addr = (struct sockaddr_in *)&cm_id->route.addr.dst_addr; - rs.rs_bound_addr = src_addr->sin_addr.s_addr; - rs.rs_bound_port = src_addr->sin_port; - rs.rs_conn_addr = dst_addr->sin_addr.s_addr; - rs.rs_conn_port = dst_addr->sin_port; - - rc = rds_iw_get_device(&rs, &rds_iwdev_old, &pcm_id); + rc = rds_iw_get_device(src_addr, dst_addr, &rds_iwdev_old, &pcm_id); if (rc) rds_iw_remove_cm_id(rds_iwdev, cm_id); @@ -598,9 +594,17 @@ void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents, struct rds_iw_device *rds_iwdev; struct rds_iw_mr *ibmr = NULL; struct rdma_cm_id *cm_id; + struct sockaddr_in src = { + .sin_addr.s_addr = rs->rs_bound_addr, + .sin_port = rs->rs_bound_port, + }; + struct sockaddr_in dst = { + .sin_addr.s_addr = rs->rs_conn_addr, + .sin_port = rs->rs_conn_port, + }; int ret; - ret = rds_iw_get_device(rs, &rds_iwdev, &cm_id); + ret = rds_iw_get_device(&src, &dst, &rds_iwdev, &cm_id); if (ret || !cm_id) { ret = -ENODEV; goto out; From 5e1c79561ca426ad8cc757204eff7baf75fa7eb4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 13 Mar 2015 09:49:59 -0700 Subject: [PATCH 010/236] inet_diag: fix possible overflow in inet_diag_dump_one_icsk() [ Upstream commit c8e2c80d7ec00d020320f905822bf49c5ad85250 ] inet_diag_dump_one_icsk() allocates too small skb. Add inet_sk_attr_size() helper right before inet_sk_diag_fill() so that it can be updated if/when new attributes are added. iproute2/ss currently does not use this dump_one() interface, this might explain nobody noticed this problem yet. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/inet_diag.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index e34dccbc4d70..4eeba4e497a0 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -71,6 +71,20 @@ static inline void inet_diag_unlock_handler( mutex_unlock(&inet_diag_table_mutex); } +static size_t inet_sk_attr_size(void) +{ + return nla_total_size(sizeof(struct tcp_info)) + + nla_total_size(1) /* INET_DIAG_SHUTDOWN */ + + nla_total_size(1) /* INET_DIAG_TOS */ + + nla_total_size(1) /* INET_DIAG_TCLASS */ + + nla_total_size(sizeof(struct inet_diag_meminfo)) + + nla_total_size(sizeof(struct inet_diag_msg)) + + nla_total_size(SK_MEMINFO_VARS * sizeof(u32)) + + nla_total_size(TCP_CA_NAME_MAX) + + nla_total_size(sizeof(struct tcpvegas_info)) + + 64; +} + int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct sk_buff *skb, struct inet_diag_req_v2 *req, struct user_namespace *user_ns, @@ -324,9 +338,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s if (err) goto out; - rep = nlmsg_new(sizeof(struct inet_diag_msg) + - sizeof(struct inet_diag_meminfo) + - sizeof(struct tcp_info) + 64, GFP_KERNEL); + rep = nlmsg_new(inet_sk_attr_size(), GFP_KERNEL); if (!rep) { err = -ENOMEM; goto out; From e83f0e31c85cce4d0071aa2acf8852fc0207abd3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 14 Mar 2015 05:22:21 +0000 Subject: [PATCH 011/236] caif: fix MSG_OOB test in caif_seqpkt_recvmsg() [ Upstream commit 3eeff778e00c956875c70b145c52638c313dfb23 ] It should be checking flags, not msg->msg_flags. It's ->sendmsg() instances that need to look for that in ->msg_flags, ->recvmsg() ones (including the other ->recvmsg() instance in that file, as well as unix_dgram_recvmsg() this one claims to be imitating) check in flags. Braino had been introduced in commit dcda13 ("caif: Bugfix - use MSG_TRUNC in receive") back in 2010, so it goes quite a while back. Signed-off-by: Al Viro Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/caif/caif_socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index d6be3edb7a43..526bf56f4d31 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -283,7 +283,7 @@ static int caif_seqpkt_recvmsg(struct kiocb *iocb, struct socket *sock, int copylen; ret = -EOPNOTSUPP; - if (m->msg_flags&MSG_OOB) + if (flags & MSG_OOB) goto read_error; skb = skb_recv_datagram(sk, flags, 0 , &ret); From 559a177e5bd74e1621a11fb11a7abb2d4ae38944 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 14 Mar 2015 05:34:56 +0000 Subject: [PATCH 012/236] rxrpc: bogus MSG_PEEK test in rxrpc_recvmsg() [ Upstream commit 7d985ed1dca5c90535d67ce92ef6ca520302340a ] [I would really like an ACK on that one from dhowells; it appears to be quite straightforward, but...] MSG_PEEK isn't passed to ->recvmsg() via msg->msg_flags; as the matter of fact, neither the kernel users of rxrpc, nor the syscalls ever set that bit in there. It gets passed via flags; in fact, another such check in the same function is done correctly - as flags & MSG_PEEK. It had been that way (effectively disabled) for 8 years, though, so the patch needs beating up - that case had never been tested. If it is correct, it's -stable fodder. Signed-off-by: Al Viro Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/rxrpc/ar-recvmsg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/rxrpc/ar-recvmsg.c b/net/rxrpc/ar-recvmsg.c index 34b5490dde65..4949f753686c 100644 --- a/net/rxrpc/ar-recvmsg.c +++ b/net/rxrpc/ar-recvmsg.c @@ -87,7 +87,7 @@ int rxrpc_recvmsg(struct kiocb *iocb, struct socket *sock, if (!skb) { /* nothing remains on the queue */ if (copied && - (msg->msg_flags & MSG_PEEK || timeo == 0)) + (flags & MSG_PEEK || timeo == 0)) goto out; /* wait for a message to turn up */ From 280683cb38e7faa963e50e58286b8b00e8a8b953 Mon Sep 17 00:00:00 2001 From: Ondrej Zary Date: Wed, 18 Mar 2015 23:01:01 +0100 Subject: [PATCH 013/236] Revert "net: cx82310_eth: use common match macro" [ Upstream commit 8d006e0105978619fb472e150c88b0d49337fe2b ] This reverts commit 11ad714b98f6d9ca0067568442afe3e70eb94845 because it breaks cx82310_eth. The custom USB_DEVICE_CLASS macro matches bDeviceClass, bDeviceSubClass and bDeviceProtocol but the common USB_DEVICE_AND_INTERFACE_INFO matches bInterfaceClass, bInterfaceSubClass and bInterfaceProtocol instead, which are not specified. Signed-off-by: Ondrej Zary Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/usb/cx82310_eth.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/net/usb/cx82310_eth.c b/drivers/net/usb/cx82310_eth.c index 3eed708a6182..fe48f4c51373 100644 --- a/drivers/net/usb/cx82310_eth.c +++ b/drivers/net/usb/cx82310_eth.c @@ -300,9 +300,18 @@ static const struct driver_info cx82310_info = { .tx_fixup = cx82310_tx_fixup, }; +#define USB_DEVICE_CLASS(vend, prod, cl, sc, pr) \ + .match_flags = USB_DEVICE_ID_MATCH_DEVICE | \ + USB_DEVICE_ID_MATCH_DEV_INFO, \ + .idVendor = (vend), \ + .idProduct = (prod), \ + .bDeviceClass = (cl), \ + .bDeviceSubClass = (sc), \ + .bDeviceProtocol = (pr) + static const struct usb_device_id products[] = { { - USB_DEVICE_AND_INTERFACE_INFO(0x0572, 0xcb01, 0xff, 0, 0), + USB_DEVICE_CLASS(0x0572, 0xcb01, 0xff, 0, 0), .driver_info = (unsigned long) &cx82310_info }, { }, From b7460a1fe397d5bc434f5f8c3b8bac61f4cec6a1 Mon Sep 17 00:00:00 2001 From: Steven Barth Date: Thu, 19 Mar 2015 16:16:04 +0100 Subject: [PATCH 014/236] ipv6: fix backtracking for throw routes [ Upstream commit 73ba57bfae4a1914f6a6dac71e3168dd900e00af ] for throw routes to trigger evaluation of other policy rules EAGAIN needs to be propagated up to fib_rules_lookup similar to how its done for IPv4 A simple testcase for verification is: ip -6 rule add lookup 33333 priority 33333 ip -6 route add throw 2001:db8::1 ip -6 route add 2001:db8::1 via fe80::1 dev wlan0 table 33333 ip route get 2001:db8::1 Signed-off-by: Steven Barth Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/fib6_rules.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index b4d5e1d97c1b..27ca79682efb 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -104,6 +104,7 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, goto again; flp6->saddr = saddr; } + err = rt->dst.error; goto out; } again: From be41c4fc268e06af8be72ec598bef7ab3f155208 Mon Sep 17 00:00:00 2001 From: Josh Hunt Date: Thu, 19 Mar 2015 19:19:30 -0400 Subject: [PATCH 015/236] tcp: fix tcp fin memory accounting [ Upstream commit d22e1537181188e5dc8cbc51451832625035bdc2 ] tcp_send_fin() does not account for the memory it allocates properly, so sk_forward_alloc can be negative in cases where we've sent a FIN: ss example output (ss -amn | grep -B1 f4294): tcp FIN-WAIT-1 0 1 192.168.0.1:45520 192.0.2.1:8080 skmem:(r0,rb87380,t0,tb87380,f4294966016,w1280,o0,bl0) Acked-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_output.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 7efa26bb872c..acd3ef51a4d1 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2617,15 +2617,11 @@ void tcp_send_fin(struct sock *sk) } else { /* Socket is locked, keep trying until memory is available. */ for (;;) { - skb = alloc_skb_fclone(MAX_TCP_HEADER, - sk->sk_allocation); + skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); if (skb) break; yield(); } - - /* Reserve space for headers and prepare control bits. */ - skb_reserve(skb, MAX_TCP_HEADER); /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ tcp_init_nondata_skb(skb, tp->write_seq, TCPHDR_ACK | TCPHDR_FIN); From b16f59e616f0377c3f7f54630a1d8a631b054282 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 20 Mar 2015 16:48:13 +0000 Subject: [PATCH 016/236] net: compat: Update get_compat_msghdr() to match copy_msghdr_from_user() behaviour [ Upstream commit 91edd096e224941131f896b86838b1e59553696a ] Commit db31c55a6fb2 (net: clamp ->msg_namelen instead of returning an error) introduced the clamping of msg_namelen when the unsigned value was larger than sizeof(struct sockaddr_storage). This caused a msg_namelen of -1 to be valid. The native code was subsequently fixed by commit dbb490b96584 (net: socket: error on a negative msg_namelen). In addition, the native code sets msg_namelen to 0 when msg_name is NULL. This was done in commit (6a2a2b3ae075 net:socket: set msg_namelen to 0 if msg_name is passed as NULL in msghdr struct from userland) and subsequently updated by 08adb7dabd48 (fold verify_iovec() into copy_msghdr_from_user()). This patch brings the get_compat_msghdr() in line with copy_msghdr_from_user(). Fixes: db31c55a6fb2 (net: clamp ->msg_namelen instead of returning an error) Cc: David S. Miller Cc: Dan Carpenter Signed-off-by: Catalin Marinas Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/compat.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/compat.c b/net/compat.c index 275af79c131b..d12529050b29 100644 --- a/net/compat.c +++ b/net/compat.c @@ -71,6 +71,13 @@ int get_compat_msghdr(struct msghdr *kmsg, struct compat_msghdr __user *umsg) __get_user(kmsg->msg_controllen, &umsg->msg_controllen) || __get_user(kmsg->msg_flags, &umsg->msg_flags)) return -EFAULT; + + if (!tmp1) + kmsg->msg_namelen = 0; + + if (kmsg->msg_namelen < 0) + return -EINVAL; + if (kmsg->msg_namelen > sizeof(struct sockaddr_storage)) kmsg->msg_namelen = sizeof(struct sockaddr_storage); kmsg->msg_name = compat_ptr(tmp1); From 5a8e8f482b4a0a058cf886d7da5be6827959d100 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 17 Nov 2014 23:06:20 -0800 Subject: [PATCH 017/236] tcp: make connect() mem charging friendly [ Upstream commit 355a901e6cf1b2b763ec85caa2a9f04fbcc4ab4a ] While working on sk_forward_alloc problems reported by Denys Fedoryshchenko, we found that tcp connect() (and fastopen) do not call sk_wmem_schedule() for SYN packet (and/or SYN/DATA packet), so sk_forward_alloc is negative while connect is in progress. We can fix this by calling regular sk_stream_alloc_skb() both for the SYN packet (in tcp_connect()) and the syn_data packet in tcp_send_syn_data() Then, tcp_send_syn_data() can avoid copying syn_data as we simply can manipulate syn_data->cb[] to remove SYN flag (and increment seq) Instead of open coding memcpy_fromiovecend(), simply use this helper. This leaves in socket write queue clean fast clone skbs. This was tested against our fastopen packetdrill tests. Reported-by: Denys Fedoryshchenko Signed-off-by: Eric Dumazet Acked-by: Yuchung Cheng Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_output.c | 64 +++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 36 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index acd3ef51a4d1..d0c310801479 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2895,9 +2895,9 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_fastopen_request *fo = tp->fastopen_req; - int syn_loss = 0, space, i, err = 0, iovlen = fo->data->msg_iovlen; - struct sk_buff *syn_data = NULL, *data; + int syn_loss = 0, space, err = 0; unsigned long last_syn_loss = 0; + struct sk_buff *syn_data; tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */ tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie, @@ -2928,42 +2928,38 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) /* limit to order-0 allocations */ space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER)); - syn_data = skb_copy_expand(syn, MAX_TCP_HEADER, space, - sk->sk_allocation); - if (syn_data == NULL) + syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation); + if (!syn_data) + goto fallback; + syn_data->ip_summed = CHECKSUM_PARTIAL; + memcpy(syn_data->cb, syn->cb, sizeof(syn->cb)); + if (unlikely(memcpy_fromiovecend(skb_put(syn_data, space), + fo->data->msg_iov, 0, space))) { + kfree_skb(syn_data); goto fallback; - - for (i = 0; i < iovlen && syn_data->len < space; ++i) { - struct iovec *iov = &fo->data->msg_iov[i]; - unsigned char __user *from = iov->iov_base; - int len = iov->iov_len; - - if (syn_data->len + len > space) - len = space - syn_data->len; - else if (i + 1 == iovlen) - /* No more data pending in inet_wait_for_connect() */ - fo->data = NULL; - - if (skb_add_data(syn_data, from, len)) - goto fallback; } - /* Queue a data-only packet after the regular SYN for retransmission */ - data = pskb_copy(syn_data, sk->sk_allocation); - if (data == NULL) - goto fallback; - TCP_SKB_CB(data)->seq++; - TCP_SKB_CB(data)->tcp_flags &= ~TCPHDR_SYN; - TCP_SKB_CB(data)->tcp_flags = (TCPHDR_ACK|TCPHDR_PSH); - tcp_connect_queue_skb(sk, data); - fo->copied = data->len; + /* No more data pending in inet_wait_for_connect() */ + if (space == fo->size) + fo->data = NULL; + fo->copied = space; - if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) { + tcp_connect_queue_skb(sk, syn_data); + + err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation); + + /* Now full SYN+DATA was cloned and sent (or not), + * remove the SYN from the original skb (syn_data) + * we keep in write queue in case of a retransmit, as we + * also have the SYN packet (with no data) in the same queue. + */ + TCP_SKB_CB(syn_data)->seq++; + TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH; + if (!err) { tp->syn_data = (fo->copied > 0); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); goto done; } - syn_data = NULL; fallback: /* Send a regular SYN with Fast Open cookie request option */ @@ -2972,7 +2968,6 @@ fallback: err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation); if (err) tp->syn_fastopen = 0; - kfree_skb(syn_data); done: fo->cookie.len = -1; /* Exclude Fast Open option for SYN retries */ return err; @@ -2992,13 +2987,10 @@ int tcp_connect(struct sock *sk) return 0; } - buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation); - if (unlikely(buff == NULL)) + buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); + if (unlikely(!buff)) return -ENOBUFS; - /* Reserve space for headers. */ - skb_reserve(buff, MAX_TCP_HEADER); - tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); tp->retrans_stamp = TCP_SKB_CB(buff)->when = tcp_time_stamp; tcp_connect_queue_skb(sk, buff); From 896bda003c628a5ca26b8f878d1f2e87ce132fae Mon Sep 17 00:00:00 2001 From: Tommi Rantala Date: Mon, 2 Mar 2015 21:36:07 +0200 Subject: [PATCH 018/236] drm/radeon: fix DRM_IOCTL_RADEON_CS oops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit a28b2a47edcd0cb7c051b445f71a426000394606 upstream. Passing zeroed drm_radeon_cs struct to DRM_IOCTL_RADEON_CS produces the following oops. Fix by always calling INIT_LIST_HEAD() to avoid the crash in list_sort(). ---------------------------------- #include #include #include #include #include static const struct drm_radeon_cs cs; int main(int argc, char **argv) { return ioctl(open(argv[1], O_RDWR), DRM_IOCTL_RADEON_CS, &cs); } ---------------------------------- [ttrantal@test2 ~]$ ./main /dev/dri/card0 [ 46.904650] BUG: unable to handle kernel NULL pointer dereference at (null) [ 46.905022] IP: [] list_sort+0x42/0x240 [ 46.905022] PGD 68f29067 PUD 688b5067 PMD 0 [ 46.905022] Oops: 0002 [#1] SMP [ 46.905022] CPU: 0 PID: 2413 Comm: main Not tainted 4.0.0-rc1+ #58 [ 46.905022] Hardware name: Hewlett-Packard HP Compaq dc5750 Small Form Factor/0A64h, BIOS 786E3 v02.10 01/25/2007 [ 46.905022] task: ffff880058e2bcc0 ti: ffff880058e64000 task.ti: ffff880058e64000 [ 46.905022] RIP: 0010:[] [] list_sort+0x42/0x240 [ 46.905022] RSP: 0018:ffff880058e67998 EFLAGS: 00010246 [ 46.905022] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 [ 46.905022] RDX: ffffffff81644410 RSI: ffff880058e67b40 RDI: ffff880058e67a58 [ 46.905022] RBP: ffff880058e67a88 R08: 0000000000000000 R09: 0000000000000000 [ 46.905022] R10: ffff880058e2bcc0 R11: ffffffff828e6ca0 R12: ffffffff81644410 [ 46.905022] R13: ffff8800694b8018 R14: 0000000000000000 R15: ffff880058e679b0 [ 46.905022] FS: 00007fdc65a65700(0000) GS:ffff88006d600000(0000) knlGS:0000000000000000 [ 46.905022] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 46.905022] CR2: 0000000000000000 CR3: 0000000058dd9000 CR4: 00000000000006f0 [ 46.905022] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 46.905022] DR3: 0000000000000000 DR6: 00000000ffff4ff0 DR7: 0000000000000400 [ 46.905022] Stack: [ 46.905022] ffff880058e67b40 ffff880058e2bcc0 ffff880058e67a78 0000000000000000 [ 46.905022] 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [ 46.905022] 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [ 46.905022] Call Trace: [ 46.905022] [] radeon_cs_parser_fini+0x195/0x220 [ 46.905022] [] radeon_cs_ioctl+0xa9/0x960 [ 46.905022] [] drm_ioctl+0x19c/0x640 [ 46.905022] [] ? trace_hardirqs_on_caller+0xfd/0x1c0 [ 46.905022] [] ? trace_hardirqs_on+0xd/0x10 [ 46.905022] [] radeon_drm_ioctl+0x46/0x80 [ 46.905022] [] do_vfs_ioctl+0x318/0x570 [ 46.905022] [] ? selinux_file_ioctl+0x56/0x110 [ 46.905022] [] SyS_ioctl+0x81/0xa0 [ 46.905022] [] system_call_fastpath+0x12/0x17 [ 46.905022] Code: 48 89 b5 10 ff ff ff 0f 84 03 01 00 00 4c 8d bd 28 ff ff ff 31 c0 48 89 fb b9 15 00 00 00 49 89 d4 4c 89 ff f3 48 ab 48 8b 46 08 <48> c7 00 00 00 00 00 48 8b 0e 48 85 c9 0f 84 7d 00 00 00 c7 85 [ 46.905022] RIP [] list_sort+0x42/0x240 [ 46.905022] RSP [ 46.905022] CR2: 0000000000000000 [ 47.149253] ---[ end trace 09576b4e8b2c20b8 ]--- Reviewed-by: Christian König Signed-off-by: Tommi Rantala Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/radeon/radeon_cs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c index 7f2d6c0d11c1..2f2d2ce34709 100644 --- a/drivers/gpu/drm/radeon/radeon_cs.c +++ b/drivers/gpu/drm/radeon/radeon_cs.c @@ -179,11 +179,13 @@ int radeon_cs_parser_init(struct radeon_cs_parser *p, void *data) u32 ring = RADEON_CS_RING_GFX; s32 priority = 0; + INIT_LIST_HEAD(&p->validated); + if (!cs->num_chunks) { return 0; } + /* get chunks */ - INIT_LIST_HEAD(&p->validated); p->idx = 0; p->ib.sa_bo = NULL; p->ib.semaphore = NULL; From 336163cf1efa4ecb05ce33d64e4b32c93aa10e04 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 2 Mar 2015 20:42:53 -0500 Subject: [PATCH 019/236] drm/radeon: do a posting read in evergreen_set_irq commit c320bb5f6dc0cb88a811cbaf839303e0a3916a92 upstream. To make sure the writes go through the pci bridge. bug: https://bugzilla.kernel.org/show_bug.cgi?id=90741 Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/radeon/evergreen.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/radeon/evergreen.c b/drivers/gpu/drm/radeon/evergreen.c index 7138f3e31b7c..ee9f0b4e90d6 100644 --- a/drivers/gpu/drm/radeon/evergreen.c +++ b/drivers/gpu/drm/radeon/evergreen.c @@ -4596,6 +4596,9 @@ int evergreen_irq_set(struct radeon_device *rdev) WREG32(AFMT_AUDIO_PACKET_CONTROL + EVERGREEN_CRTC4_REGISTER_OFFSET, afmt5); WREG32(AFMT_AUDIO_PACKET_CONTROL + EVERGREEN_CRTC5_REGISTER_OFFSET, afmt6); + /* posting read */ + RREG32(SRBM_STATUS); + return 0; } From 36358b3f64dab4e9372a520901a7636cf6e01a1e Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 2 Mar 2015 20:36:26 -0500 Subject: [PATCH 020/236] drm/radeon: do a posting read in r100_set_irq commit f957063fee6392bb9365370db6db74dc0b2dce0a upstream. To make sure the writes go through the pci bridge. bug: https://bugzilla.kernel.org/show_bug.cgi?id=90741 Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/radeon/r100.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c index 07620e198a6d..e28de20d469a 100644 --- a/drivers/gpu/drm/radeon/r100.c +++ b/drivers/gpu/drm/radeon/r100.c @@ -742,6 +742,10 @@ int r100_irq_set(struct radeon_device *rdev) tmp |= RADEON_FP2_DETECT_MASK; } WREG32(RADEON_GEN_INT_CNTL, tmp); + + /* read back to post the write */ + RREG32(RADEON_GEN_INT_CNTL); + return 0; } From 82e7798ff40b5b9a6723092c4314c23a49362115 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 2 Mar 2015 20:41:31 -0500 Subject: [PATCH 021/236] drm/radeon: do a posting read in r600_set_irq commit 9d1393f23d5656cdd5f368efd60694d4aeed81d3 upstream. To make sure the writes go through the pci bridge. bug: https://bugzilla.kernel.org/show_bug.cgi?id=90741 Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/radeon/r600.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c index 788f602e8989..74a8a38db4d5 100644 --- a/drivers/gpu/drm/radeon/r600.c +++ b/drivers/gpu/drm/radeon/r600.c @@ -3647,6 +3647,9 @@ int r600_irq_set(struct radeon_device *rdev) WREG32(RV770_CG_THERMAL_INT, thermal_int); } + /* posting read */ + RREG32(R_000E50_SRBM_STATUS); + return 0; } From 028e9163a3b794a67c21d8a1519a4a1ad80bec99 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 2 Mar 2015 20:45:24 -0500 Subject: [PATCH 022/236] drm/radeon: do a posting read in cik_set_irq commit cffefd9bb31cd35ab745d3b49005d10616d25bdc upstream. To make sure the writes go through the pci bridge. bug: https://bugzilla.kernel.org/show_bug.cgi?id=90741 Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/radeon/cik.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c index f0ed0baddf70..c3664bc05acf 100644 --- a/drivers/gpu/drm/radeon/cik.c +++ b/drivers/gpu/drm/radeon/cik.c @@ -7069,6 +7069,9 @@ int cik_irq_set(struct radeon_device *rdev) WREG32(DC_HPD5_INT_CONTROL, hpd5); WREG32(DC_HPD6_INT_CONTROL, hpd6); + /* posting read */ + RREG32(SRBM_STATUS); + return 0; } From 85c5b0d3c4c9a9a724d058a4f8fc53e45e142c0a Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 2 Mar 2015 20:43:53 -0500 Subject: [PATCH 023/236] drm/radeon: do a posting read in si_set_irq commit 0586915ec10d0ae60de5cd3381ad25a704760402 upstream. To make sure the writes go through the pci bridge. bug: https://bugzilla.kernel.org/show_bug.cgi?id=90741 Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/radeon/si.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/radeon/si.c b/drivers/gpu/drm/radeon/si.c index 52b64ad285d6..d21247e9fd10 100644 --- a/drivers/gpu/drm/radeon/si.c +++ b/drivers/gpu/drm/radeon/si.c @@ -5958,6 +5958,9 @@ int si_irq_set(struct radeon_device *rdev) WREG32(CG_THERMAL_INT, thermal_int); + /* posting read */ + RREG32(SRBM_STATUS); + return 0; } From bb0677edcd41fe506e02b48a3df49cd9a6048507 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 2 Mar 2015 20:39:56 -0500 Subject: [PATCH 024/236] drm/radeon: do a posting read in rs600_set_irq commit 54acf107e4e66d1f4a697e08a7f60dba9fcf07c3 upstream. To make sure the writes go through the pci bridge. bug: https://bugzilla.kernel.org/show_bug.cgi?id=90741 Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/radeon/rs600.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/radeon/rs600.c b/drivers/gpu/drm/radeon/rs600.c index e5619d5e2a30..4261b3865cb0 100644 --- a/drivers/gpu/drm/radeon/rs600.c +++ b/drivers/gpu/drm/radeon/rs600.c @@ -700,6 +700,10 @@ int rs600_irq_set(struct radeon_device *rdev) WREG32(R_007D18_DC_HOT_PLUG_DETECT2_INT_CONTROL, hpd2); if (ASIC_IS_DCE2(rdev)) WREG32(R_007408_HDMI0_AUDIO_PACKET_CONTROL, hdmi0); + + /* posting read */ + RREG32(R_000040_GEN_INT_CNTL); + return 0; } From b2e7ca9055d8c3733b5f41d9cce5a761fb53f3d0 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 3 Mar 2015 17:00:43 -0500 Subject: [PATCH 025/236] drm/radeon: fix interlaced modes on DCE8 commit 77ae5f4b48a0445426c9c1ef7c0f28b717e35d55 upstream. Need to double the viewport height. Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/radeon/atombios_crtc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/radeon/atombios_crtc.c b/drivers/gpu/drm/radeon/atombios_crtc.c index 0cca5f24196a..663394f0c166 100644 --- a/drivers/gpu/drm/radeon/atombios_crtc.c +++ b/drivers/gpu/drm/radeon/atombios_crtc.c @@ -1306,6 +1306,9 @@ static int dce4_crtc_do_set_base(struct drm_crtc *crtc, (x << 16) | y); viewport_w = crtc->mode.hdisplay; viewport_h = (crtc->mode.vdisplay + 1) & ~1; + if ((rdev->family >= CHIP_BONAIRE) && + (crtc->mode.flags & DRM_MODE_FLAG_INTERLACE)) + viewport_h *= 2; WREG32(EVERGREEN_VIEWPORT_SIZE + radeon_crtc->crtc_offset, (viewport_w << 16) | viewport_h); From 12a95096f440beead7b10fb3335890114bdbf67f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Thu, 19 Feb 2015 09:40:28 +0100 Subject: [PATCH 026/236] drm/radeon: drop setting UPLL to sleep mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit a17d4996e051e78d164989b894608cf37cd5110b upstream. Just keep it working, seems to fix some PLL problems. Bug: https://bugs.freedesktop.org/show_bug.cgi?id=73378 Signed-off-by: Christian König Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/radeon/si.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/radeon/si.c b/drivers/gpu/drm/radeon/si.c index d21247e9fd10..2f2deccb3b78 100644 --- a/drivers/gpu/drm/radeon/si.c +++ b/drivers/gpu/drm/radeon/si.c @@ -6878,8 +6878,7 @@ int si_set_uvd_clocks(struct radeon_device *rdev, u32 vclk, u32 dclk) WREG32_P(CG_UPLL_FUNC_CNTL, UPLL_BYPASS_EN_MASK, ~UPLL_BYPASS_EN_MASK); if (!vclk || !dclk) { - /* keep the Bypass mode, put PLL to sleep */ - WREG32_P(CG_UPLL_FUNC_CNTL, UPLL_SLEEP_MASK, ~UPLL_SLEEP_MASK); + /* keep the Bypass mode */ return 0; } @@ -6895,8 +6894,7 @@ int si_set_uvd_clocks(struct radeon_device *rdev, u32 vclk, u32 dclk) /* set VCO_MODE to 1 */ WREG32_P(CG_UPLL_FUNC_CNTL, UPLL_VCO_MODE_MASK, ~UPLL_VCO_MODE_MASK); - /* toggle UPLL_SLEEP to 1 then back to 0 */ - WREG32_P(CG_UPLL_FUNC_CNTL, UPLL_SLEEP_MASK, ~UPLL_SLEEP_MASK); + /* disable sleep mode */ WREG32_P(CG_UPLL_FUNC_CNTL, 0, ~UPLL_SLEEP_MASK); /* deassert UPLL_RESET */ From d34369c74089215e0eb4d18697fd69591f4f2cdb Mon Sep 17 00:00:00 2001 From: JeHyeon Yeon Date: Mon, 16 Mar 2015 01:03:19 +0000 Subject: [PATCH 027/236] LZ4 : fix the data abort issue commit d5e7cafd69da24e6d6cc988fab6ea313a2577efc upstream. If the part of the compression data are corrupted, or the compression data is totally fake, the memory access over the limit is possible. This is the log from my system usning lz4 decompression. [6502]data abort, halting [6503]r0 0x00000000 r1 0x00000000 r2 0xdcea0ffc r3 0xdcea0ffc [6509]r4 0xb9ab0bfd r5 0xdcea0ffc r6 0xdcea0ff8 r7 0xdce80000 [6515]r8 0x00000000 r9 0x00000000 r10 0x00000000 r11 0xb9a98000 [6522]r12 0xdcea1000 usp 0x00000000 ulr 0x00000000 pc 0x820149bc [6528]spsr 0x400001f3 and the memory addresses of some variables at the moment are ref:0xdcea0ffc, op:0xdcea0ffc, oend:0xdcea1000 As you can see, COPYLENGH is 8bytes, so @ref and @op can access the momory over @oend. Signed-off-by: JeHyeon Yeon Reviewed-by: David Sterba Signed-off-by: Greg Kroah-Hartman Signed-off-by: Greg Kroah-Hartman --- lib/lz4/lz4_decompress.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c index 7a85967060a5..f0f5c5c3de12 100644 --- a/lib/lz4/lz4_decompress.c +++ b/lib/lz4/lz4_decompress.c @@ -139,6 +139,9 @@ static int lz4_uncompress(const char *source, char *dest, int osize) /* Error: request to write beyond destination buffer */ if (cpy > oend) goto _output_error; + if ((ref + COPYLENGTH) > oend || + (op + COPYLENGTH) > oend) + goto _output_error; LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH)); while (op < cpy) *op++ = *ref++; From 343a1ba48be881067a96fb22fb426dfa8c8db10a Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 26 Feb 2015 11:45:47 +0100 Subject: [PATCH 028/236] fuse: set stolen page uptodate commit aa991b3b267e24f578bac7b09cc57579b660304b upstream. Regular pipe buffers' ->steal method (generic_pipe_buf_steal()) doesn't set PG_uptodate. Don't warn on this condition, just set the uptodate flag. Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman --- fs/fuse/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 6eb13c621a14..5d4b2cc28936 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -819,8 +819,8 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) newpage = buf->page; - if (WARN_ON(!PageUptodate(newpage))) - return -EIO; + if (!PageUptodate(newpage)) + SetPageUptodate(newpage); ClearPageMappedToDisk(newpage); From 07bb18dc02092b45e8ea508adf899c35cc181b65 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 26 Feb 2015 11:45:47 +0100 Subject: [PATCH 029/236] fuse: notify: don't move pages commit 0d2783626a53d4c922f82d51fa675cb5d13f0d36 upstream. fuse_try_move_page() is not prepared for replacing pages that have already been read. Reported-by: Al Viro Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman --- fs/fuse/dev.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 5d4b2cc28936..499155ca3e84 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1726,6 +1726,9 @@ copy_finish: static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, unsigned int size, struct fuse_copy_state *cs) { + /* Don't try to move pages (yet) */ + cs->move_pages = 0; + switch (code) { case FUSE_NOTIFY_POLL: return fuse_notify_poll(fc, size, cs); From c04885d5db4c67e086cbbc3cbafe3f8902ff9dae Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Sun, 1 Mar 2015 10:11:05 -0500 Subject: [PATCH 030/236] console: Fix console name size mismatch commit 30a22c215a0007603ffc08021f2e8b64018517dd upstream. commit 6ae9200f2cab7 ("enlarge console.name") increased the storage for the console name to 16 bytes, but not the corresponding struct console_cmdline::name storage. Console names longer than 8 bytes cause read beyond end-of-string and failure to match console; I'm not sure if there are other unexpected consequences. Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- kernel/printk/console_cmdline.h | 2 +- kernel/printk/printk.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/printk/console_cmdline.h b/kernel/printk/console_cmdline.h index cbd69d842341..2ca4a8b5fe57 100644 --- a/kernel/printk/console_cmdline.h +++ b/kernel/printk/console_cmdline.h @@ -3,7 +3,7 @@ struct console_cmdline { - char name[8]; /* Name of the driver */ + char name[16]; /* Name of the driver */ int index; /* Minor dev. to use */ char *options; /* Options for the driver */ #ifdef CONFIG_A11Y_BRAILLE_CONSOLE diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 8c086e6049b9..a755ad70fe8f 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2280,6 +2280,7 @@ void register_console(struct console *newcon) for (i = 0, c = console_cmdline; i < MAX_CMDLINECONSOLES && c->name[0]; i++, c++) { + BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name)); if (strcmp(c->name, newcon->name) != 0) continue; if (newcon->index >= 0 && From 8b0c4ab4edfb6b8f4d04ab662a116e90de27695a Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 5 Mar 2015 10:45:30 +1030 Subject: [PATCH 031/236] virtio_console: init work unconditionally commit 4f6e24ed9de8634d6471ef86b382cba6d4e57ca8 upstream. when multiport is off, we don't initialize config work, but we then cancel uninitialized control_work on freeze. Signed-off-by: Michael S. Tsirkin Reviewed-by: Amit Shah Signed-off-by: Rusty Russell Signed-off-by: Greg Kroah-Hartman --- drivers/char/virtio_console.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index 6928d094451d..063b2d690187 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -2024,12 +2024,13 @@ static int virtcons_probe(struct virtio_device *vdev) spin_lock_init(&portdev->ports_lock); INIT_LIST_HEAD(&portdev->ports); + INIT_WORK(&portdev->control_work, &control_work_handler); + if (multiport) { unsigned int nr_added_bufs; spin_lock_init(&portdev->c_ivq_lock); spin_lock_init(&portdev->c_ovq_lock); - INIT_WORK(&portdev->control_work, &control_work_handler); nr_added_bufs = fill_queue(portdev->c_ivq, &portdev->c_ivq_lock); From 59cdf18af738eee76a68d0c3557855465e2977a5 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 5 Mar 2015 10:45:49 +1030 Subject: [PATCH 032/236] virtio_console: avoid config access from irq commit eeb8a7e8bb123e84daeef84f5a2eab99ad2839a2 upstream. when multiport is off, virtio console invokes config access from irq context, config access is blocking on s390. Fix this up by scheduling work from config irq - similar to what we do for multiport configs. Signed-off-by: Michael S. Tsirkin Reviewed-by: Amit Shah Signed-off-by: Rusty Russell Signed-off-by: Greg Kroah-Hartman --- drivers/char/virtio_console.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index 063b2d690187..b08eadb4b1d2 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -142,6 +142,7 @@ struct ports_device { * notification */ struct work_struct control_work; + struct work_struct config_work; struct list_head ports; @@ -1832,10 +1833,21 @@ static void config_intr(struct virtio_device *vdev) portdev = vdev->priv; + if (!use_multiport(portdev)) + schedule_work(&portdev->config_work); +} + +static void config_work_handler(struct work_struct *work) +{ + struct ports_device *portdev; + + portdev = container_of(work, struct ports_device, control_work); if (!use_multiport(portdev)) { + struct virtio_device *vdev; struct port *port; u16 rows, cols; + vdev = portdev->vdev; virtio_cread(vdev, struct virtio_console_config, cols, &cols); virtio_cread(vdev, struct virtio_console_config, rows, &rows); @@ -2024,6 +2036,7 @@ static int virtcons_probe(struct virtio_device *vdev) spin_lock_init(&portdev->ports_lock); INIT_LIST_HEAD(&portdev->ports); + INIT_WORK(&portdev->config_work, &config_work_handler); INIT_WORK(&portdev->control_work, &control_work_handler); if (multiport) { @@ -2098,6 +2111,8 @@ static void virtcons_remove(struct virtio_device *vdev) /* Finish up work that's lined up */ if (use_multiport(portdev)) cancel_work_sync(&portdev->control_work); + else + cancel_work_sync(&portdev->config_work); list_for_each_entry_safe(port, port2, &portdev->ports, list) unplug_port(port); @@ -2149,6 +2164,7 @@ static int virtcons_freeze(struct virtio_device *vdev) virtqueue_disable_cb(portdev->c_ivq); cancel_work_sync(&portdev->control_work); + cancel_work_sync(&portdev->config_work); /* * Once more: if control_work_handler() was running, it would * enable the cb as the last step. From 49f3df5bdeb22ca0258b9ce90365aeb0e2ba041e Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 6 Mar 2015 10:49:21 +0000 Subject: [PATCH 033/236] Change email address for 8250_pci commit f2e0ea861117bda073d1d7ffbd3120c07c0d5d34 upstream. I'm still receiving reports to my email address, so let's point this at the linux-serial mailing list instead. Signed-off-by: Russell King Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/8250/8250_pci.c b/drivers/tty/serial/8250/8250_pci.c index feda34404ed0..5892eab03874 100644 --- a/drivers/tty/serial/8250/8250_pci.c +++ b/drivers/tty/serial/8250/8250_pci.c @@ -66,7 +66,7 @@ static void moan_device(const char *str, struct pci_dev *dev) "Please send the output of lspci -vv, this\n" "message (0x%04x,0x%04x,0x%04x,0x%04x), the\n" "manufacturer and name of serial board or\n" - "modem board to rmk+serial@arm.linux.org.uk.\n", + "modem board to .\n", pci_name(dev), str, dev->vendor, dev->device, dev->subsystem_vendor, dev->subsystem_device); } From 3cc97c83e2051584120819e40e7b0ce4dadf0925 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 23 Feb 2015 20:37:54 +0100 Subject: [PATCH 034/236] can: add missing initialisations in CAN related skbuffs commit 969439016d2cf61fef53a973d7e6d2061c3793b1 upstream. When accessing CAN network interfaces with AF_PACKET sockets e.g. by dhclient this can lead to a skb_under_panic due to missing skb initialisations. Add the missing initialisations at the CAN skbuff creation times on driver level (rx path) and in the network layer (tx path). Reported-by: Austin Schuh Reported-by: Daniel Steer Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- drivers/net/can/dev.c | 8 ++++++++ net/can/af_can.c | 3 +++ 2 files changed, 11 insertions(+) diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c index 1468c4658804..84ad2b44377c 100644 --- a/drivers/net/can/dev.c +++ b/drivers/net/can/dev.c @@ -502,6 +502,14 @@ struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf) skb->pkt_type = PACKET_BROADCAST; skb->ip_summed = CHECKSUM_UNNECESSARY; + skb_reset_mac_header(skb); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + + skb_reset_mac_header(skb); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; diff --git a/net/can/af_can.c b/net/can/af_can.c index a27f8aad9e99..5e9a2272b7a7 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -262,6 +262,9 @@ int can_send(struct sk_buff *skb, int loop) goto inval_skb; } + skb->ip_summed = CHECKSUM_UNNECESSARY; + + skb_reset_mac_header(skb); skb_reset_network_header(skb); skb_reset_transport_header(skb); From 28a4e9536385dcb35c242c464c19ae0dca76ef78 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Mar 2015 08:04:13 -0500 Subject: [PATCH 035/236] workqueue: fix hang involving racing cancel[_delayed]_work_sync()'s for PREEMPT_NONE commit 8603e1b30027f943cc9c1eef2b291d42c3347af1 upstream. cancel[_delayed]_work_sync() are implemented using __cancel_work_timer() which grabs the PENDING bit using try_to_grab_pending() and then flushes the work item with PENDING set to prevent the on-going execution of the work item from requeueing itself. try_to_grab_pending() can always grab PENDING bit without blocking except when someone else is doing the above flushing during cancelation. In that case, try_to_grab_pending() returns -ENOENT. In this case, __cancel_work_timer() currently invokes flush_work(). The assumption is that the completion of the work item is what the other canceling task would be waiting for too and thus waiting for the same condition and retrying should allow forward progress without excessive busy looping Unfortunately, this doesn't work if preemption is disabled or the latter task has real time priority. Let's say task A just got woken up from flush_work() by the completion of the target work item. If, before task A starts executing, task B gets scheduled and invokes __cancel_work_timer() on the same work item, its try_to_grab_pending() will return -ENOENT as the work item is still being canceled by task A and flush_work() will also immediately return false as the work item is no longer executing. This puts task B in a busy loop possibly preventing task A from executing and clearing the canceling state on the work item leading to a hang. task A task B worker executing work __cancel_work_timer() try_to_grab_pending() set work CANCELING flush_work() block for work completion completion, wakes up A __cancel_work_timer() while (forever) { try_to_grab_pending() -ENOENT as work is being canceled flush_work() false as work is no longer executing } This patch removes the possible hang by updating __cancel_work_timer() to explicitly wait for clearing of CANCELING rather than invoking flush_work() after try_to_grab_pending() fails with -ENOENT. Link: http://lkml.kernel.org/g/20150206171156.GA8942@axis.com v3: bit_waitqueue() can't be used for work items defined in vmalloc area. Switched to custom wake function which matches the target work item and exclusive wait and wakeup. v2: v1 used wake_up() on bit_waitqueue() which leads to NULL deref if the target bit waitqueue has wait_bit_queue's on it. Use DEFINE_WAIT_BIT() and __wake_up_bit() instead. Reported by Tomeu Vizoso. Signed-off-by: Tejun Heo Reported-by: Rabin Vincent Cc: Tomeu Vizoso Tested-by: Jesper Nilsson Tested-by: Rabin Vincent Signed-off-by: Greg Kroah-Hartman --- include/linux/workqueue.h | 3 ++- kernel/workqueue.c | 56 ++++++++++++++++++++++++++++++++++++--- 2 files changed, 54 insertions(+), 5 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 6c62cfa25f1a..bc9d2c2ddf87 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -71,7 +71,8 @@ enum { /* data contains off-queue information when !WORK_STRUCT_PWQ */ WORK_OFFQ_FLAG_BASE = WORK_STRUCT_COLOR_SHIFT, - WORK_OFFQ_CANCELING = (1 << WORK_OFFQ_FLAG_BASE), + __WORK_OFFQ_CANCELING = WORK_OFFQ_FLAG_BASE, + WORK_OFFQ_CANCELING = (1 << __WORK_OFFQ_CANCELING), /* * When a work item is off queue, its high bits point to the last diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f6f31d823e8e..423c9e37a9e7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2893,19 +2893,57 @@ bool flush_work(struct work_struct *work) } EXPORT_SYMBOL_GPL(flush_work); +struct cwt_wait { + wait_queue_t wait; + struct work_struct *work; +}; + +static int cwt_wakefn(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait); + + if (cwait->work != key) + return 0; + return autoremove_wake_function(wait, mode, sync, key); +} + static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) { + static DECLARE_WAIT_QUEUE_HEAD(cancel_waitq); unsigned long flags; int ret; do { ret = try_to_grab_pending(work, is_dwork, &flags); /* - * If someone else is canceling, wait for the same event it - * would be waiting for before retrying. + * If someone else is already canceling, wait for it to + * finish. flush_work() doesn't work for PREEMPT_NONE + * because we may get scheduled between @work's completion + * and the other canceling task resuming and clearing + * CANCELING - flush_work() will return false immediately + * as @work is no longer busy, try_to_grab_pending() will + * return -ENOENT as @work is still being canceled and the + * other canceling task won't be able to clear CANCELING as + * we're hogging the CPU. + * + * Let's wait for completion using a waitqueue. As this + * may lead to the thundering herd problem, use a custom + * wake function which matches @work along with exclusive + * wait and wakeup. */ - if (unlikely(ret == -ENOENT)) - flush_work(work); + if (unlikely(ret == -ENOENT)) { + struct cwt_wait cwait; + + init_wait(&cwait.wait); + cwait.wait.func = cwt_wakefn; + cwait.work = work; + + prepare_to_wait_exclusive(&cancel_waitq, &cwait.wait, + TASK_UNINTERRUPTIBLE); + if (work_is_canceling(work)) + schedule(); + finish_wait(&cancel_waitq, &cwait.wait); + } } while (unlikely(ret < 0)); /* tell other tasks trying to grab @work to back off */ @@ -2914,6 +2952,16 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) flush_work(work); clear_work_data(work); + + /* + * Paired with prepare_to_wait() above so that either + * waitqueue_active() is visible here or !work_is_canceling() is + * visible there. + */ + smp_mb(); + if (waitqueue_active(&cancel_waitq)) + __wake_up(&cancel_waitq, TASK_NORMAL, 1, work); + return ret; } From 0588215aed7f4a280ef87f87eca0862222cc0b7b Mon Sep 17 00:00:00 2001 From: Jason Low Date: Fri, 13 Feb 2015 11:58:07 +0800 Subject: [PATCH 036/236] cpuset: Fix cpuset sched_relax_domain_level commit 283cb41f426b723a0255702b761b0fc5d1b53a81 upstream. The cpuset.sched_relax_domain_level can control how far we do immediate load balancing on a system. However, it was found on recent kernels that echo'ing a value into cpuset.sched_relax_domain_level did not reduce any immediate load balancing. The reason this occurred was because the update_domain_attr_tree() traversal did not update for the "top_cpuset". This resulted in nothing being changed when modifying the sched_relax_domain_level parameter. This patch is able to address that problem by having update_domain_attr_tree() allow updates for the root in the cpuset traversal. Fixes: fc560a26acce ("cpuset: replace cpuset->stack_list with cpuset_for_each_descendant_pre()") Signed-off-by: Jason Low Signed-off-by: Zefan Li Signed-off-by: Tejun Heo Tested-by: Serge Hallyn Signed-off-by: Greg Kroah-Hartman --- kernel/cpuset.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 2fb2877e6961..7b4530b0b16b 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -503,9 +503,6 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { - if (cp == root_cs) - continue; - /* skip the whole subtree if @cp doesn't have any CPU */ if (cpumask_empty(cp->cpus_allowed)) { pos_css = css_rightmost_descendant(pos_css); From f49920b9a288d5ddfc7e50d44d380ac4f60862bf Mon Sep 17 00:00:00 2001 From: "jmlatten@linux.vnet.ibm.com" Date: Fri, 20 Feb 2015 18:11:24 -0600 Subject: [PATCH 037/236] tpm/ibmvtpm: Additional LE support for tpm_ibmvtpm_send commit 62dfd912ab3b5405b6fe72d0135c37e9648071f1 upstream. Problem: When IMA and VTPM are both enabled in kernel config, kernel hangs during bootup on LE OS. Why?: IMA calls tpm_pcr_read() which results in tpm_ibmvtpm_send and tpm_ibmtpm_recv getting called. A trace showed that tpm_ibmtpm_recv was hanging. Resolution: tpm_ibmtpm_recv was hanging because tpm_ibmvtpm_send was sending CRQ message that probably did not make much sense to phype because of Endianness. The fix below sends correctly converted CRQ for LE. This was not caught before because it seems IMA is not enabled by default in kernel config and IMA exercises this particular code path in vtpm. Tested with IMA and VTPM enabled in kernel config and VTPM enabled on both a BE OS and a LE OS ppc64 lpar. This exercised CRQ and TPM command code paths in vtpm. Patch is against Peter's tpmdd tree on github which included Vicky's previous vtpm le patches. Signed-off-by: Joy Latten Reviewed-by: Ashley Lai Signed-off-by: Peter Huewe Signed-off-by: Greg Kroah-Hartman --- drivers/char/tpm/tpm_ibmvtpm.c | 10 +++++----- drivers/char/tpm/tpm_ibmvtpm.h | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/char/tpm/tpm_ibmvtpm.c b/drivers/char/tpm/tpm_ibmvtpm.c index eff9d5870034..102463ba745d 100644 --- a/drivers/char/tpm/tpm_ibmvtpm.c +++ b/drivers/char/tpm/tpm_ibmvtpm.c @@ -124,7 +124,7 @@ static int tpm_ibmvtpm_send(struct tpm_chip *chip, u8 *buf, size_t count) { struct ibmvtpm_dev *ibmvtpm; struct ibmvtpm_crq crq; - u64 *word = (u64 *) &crq; + __be64 *word = (__be64 *)&crq; int rc; ibmvtpm = (struct ibmvtpm_dev *)TPM_VPRIV(chip); @@ -145,11 +145,11 @@ static int tpm_ibmvtpm_send(struct tpm_chip *chip, u8 *buf, size_t count) memcpy((void *)ibmvtpm->rtce_buf, (void *)buf, count); crq.valid = (u8)IBMVTPM_VALID_CMD; crq.msg = (u8)VTPM_TPM_COMMAND; - crq.len = (u16)count; - crq.data = ibmvtpm->rtce_dma_handle; + crq.len = cpu_to_be16(count); + crq.data = cpu_to_be32(ibmvtpm->rtce_dma_handle); - rc = ibmvtpm_send_crq(ibmvtpm->vdev, cpu_to_be64(word[0]), - cpu_to_be64(word[1])); + rc = ibmvtpm_send_crq(ibmvtpm->vdev, be64_to_cpu(word[0]), + be64_to_cpu(word[1])); if (rc != H_SUCCESS) { dev_err(ibmvtpm->dev, "tpm_ibmvtpm_send failed rc=%d\n", rc); rc = 0; diff --git a/drivers/char/tpm/tpm_ibmvtpm.h b/drivers/char/tpm/tpm_ibmvtpm.h index bd82a791f995..b2c231b1beec 100644 --- a/drivers/char/tpm/tpm_ibmvtpm.h +++ b/drivers/char/tpm/tpm_ibmvtpm.h @@ -22,9 +22,9 @@ struct ibmvtpm_crq { u8 valid; u8 msg; - u16 len; - u32 data; - u64 reserved; + __be16 len; + __be32 data; + __be64 reserved; } __attribute__((packed, aligned(8))); struct ibmvtpm_crq_queue { From ddae55303da59a56c05ec39c1a4789775f591363 Mon Sep 17 00:00:00 2001 From: Torsten Fleischer Date: Tue, 24 Feb 2015 16:32:57 +0100 Subject: [PATCH 038/236] spi: atmel: Fix interrupt setup for PDC transfers commit 76e1d14b316d6f501ebc001e7a5d86b24ce5b615 upstream. Additionally to the current DMA transfer the PDC allows to set up a next DMA transfer. This is useful for larger SPI transfers. The driver currently waits for ENDRX as end of the transfer. But ENDRX is set when the current DMA transfer is done (RCR = 0), i.e. it doesn't include the next DMA transfer. Thus a subsequent SPI transfer could be started although there is currently a transfer in progress. This can cause invalid accesses to the SPI slave devices and to SPI transfer errors. This issue has been observed on a hardware with a M25P128 SPI NOR flash. So instead of ENDRX we should wait for RXBUFF. This flag is set if there is no more DMA transfer in progress (RCR = RNCR = 0). Signed-off-by: Torsten Fleischer Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- drivers/spi/spi-atmel.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/spi/spi-atmel.c b/drivers/spi/spi-atmel.c index 5d7b07f08326..5f8c6d2f4df7 100644 --- a/drivers/spi/spi-atmel.c +++ b/drivers/spi/spi-atmel.c @@ -781,17 +781,17 @@ static void atmel_spi_pdc_next_xfer(struct spi_master *master, (unsigned long long)xfer->rx_dma); } - /* REVISIT: We're waiting for ENDRX before we start the next + /* REVISIT: We're waiting for RXBUFF before we start the next * transfer because we need to handle some difficult timing - * issues otherwise. If we wait for ENDTX in one transfer and - * then starts waiting for ENDRX in the next, it's difficult - * to tell the difference between the ENDRX interrupt we're - * actually waiting for and the ENDRX interrupt of the + * issues otherwise. If we wait for TXBUFE in one transfer and + * then starts waiting for RXBUFF in the next, it's difficult + * to tell the difference between the RXBUFF interrupt we're + * actually waiting for and the RXBUFF interrupt of the * previous transfer. * * It should be doable, though. Just not now... */ - spi_writel(as, IER, SPI_BIT(ENDRX) | SPI_BIT(OVRES)); + spi_writel(as, IER, SPI_BIT(RXBUFF) | SPI_BIT(OVRES)); spi_writel(as, PTCR, SPI_BIT(TXTEN) | SPI_BIT(RXTEN)); } From e62c360b26b8f8bb2e88db4cfc95fa10d3503cd6 Mon Sep 17 00:00:00 2001 From: Alexander Sverdlin Date: Fri, 27 Feb 2015 16:30:21 +0100 Subject: [PATCH 039/236] spi: pl022: Fix race in giveback() leading to driver lock-up commit cd6fa8d2ca53cac3226fdcffcf763be390abae32 upstream. Commit fd316941c ("spi/pl022: disable port when unused") introduced a race, which leads to possible driver lock up (easily reproducible on SMP). The problem happens in giveback() function where the completion of the transfer is signalled to SPI subsystem and then the HW SPI controller is disabled. Another transfer might be setup in between, which brings driver in locked-up state. Exact event sequence on SMP: core0 core1 => pump_transfers() /* message->state == STATE_DONE */ => giveback() => spi_finalize_current_message() => pl022_unprepare_transfer_hardware() => pl022_transfer_one_message => flush() => do_interrupt_dma_transfer() => set_up_next_transfer() /* Enable SSP, turn on interrupts */ writew((readw(SSP_CR1(pl022->virtbase)) | SSP_CR1_MASK_SSE), SSP_CR1(pl022->virtbase)); ... => pl022_interrupt_handler() => readwriter() /* disable the SPI/SSP operation */ => writew((readw(SSP_CR1(pl022->virtbase)) & (~SSP_CR1_MASK_SSE)), SSP_CR1(pl022->virtbase)); Lockup! SPI controller is disabled and the data will never be received. Whole SPI subsystem is waiting for transfer ACK and blocked. So, only signal transfer completion after disabling the controller. Fixes: fd316941c (spi/pl022: disable port when unused) Signed-off-by: Alexander Sverdlin Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- drivers/spi/spi-pl022.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-pl022.c b/drivers/spi/spi-pl022.c index 971855e859c7..fe091a87fd6d 100644 --- a/drivers/spi/spi-pl022.c +++ b/drivers/spi/spi-pl022.c @@ -503,12 +503,12 @@ static void giveback(struct pl022 *pl022) pl022->cur_msg = NULL; pl022->cur_transfer = NULL; pl022->cur_chip = NULL; - spi_finalize_current_message(pl022->master); /* disable the SPI/SSP operation */ writew((readw(SSP_CR1(pl022->virtbase)) & (~SSP_CR1_MASK_SSE)), SSP_CR1(pl022->virtbase)); + spi_finalize_current_message(pl022->master); } /** From eb3deba1b8edfcf985e6c30b082c502c75b894e7 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Thu, 12 Mar 2015 09:41:32 +0100 Subject: [PATCH 040/236] ALSA: snd-usb: add quirks for Roland UA-22 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit fcdcd1dec6d2c7b718385ec743ae5a9a233edad4 upstream. The device complies to the UAC1 standard but hides that fact with proprietary descriptors. The autodetect quirk for Roland devices catches the audio interface but misses the MIDI part, so a specific quirk is needed. Signed-off-by: Daniel Mack Reported-by: Rafa Lafuente Tested-by: Raphaël Doursenaud Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/usb/quirks-table.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/sound/usb/quirks-table.h b/sound/usb/quirks-table.h index 83bddbdb90e9..5293b5ac8b9d 100644 --- a/sound/usb/quirks-table.h +++ b/sound/usb/quirks-table.h @@ -1773,6 +1773,36 @@ YAMAHA_DEVICE(0x7010, "UB99"), } } }, +{ + USB_DEVICE(0x0582, 0x0159), + .driver_info = (unsigned long) & (const struct snd_usb_audio_quirk) { + /* .vendor_name = "Roland", */ + /* .product_name = "UA-22", */ + .ifnum = QUIRK_ANY_INTERFACE, + .type = QUIRK_COMPOSITE, + .data = (const struct snd_usb_audio_quirk[]) { + { + .ifnum = 0, + .type = QUIRK_AUDIO_STANDARD_INTERFACE + }, + { + .ifnum = 1, + .type = QUIRK_AUDIO_STANDARD_INTERFACE + }, + { + .ifnum = 2, + .type = QUIRK_MIDI_FIXED_ENDPOINT, + .data = & (const struct snd_usb_midi_endpoint_info) { + .out_cables = 0x0001, + .in_cables = 0x0001 + } + }, + { + .ifnum = -1 + } + } + } +}, /* this catches most recent vendor-specific Roland devices */ { .match_flags = USB_DEVICE_ID_MATCH_VENDOR | From 321f094062e1d3e9f8215078d565f3e3d019d042 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 11 Mar 2015 18:12:49 +0100 Subject: [PATCH 041/236] ALSA: control: Add sanity checks for user ctl id name string commit be3bb8236db2d0fcd705062ae2e2a9d75131222f upstream. There was no check about the id string of user control elements, so we accepted even a control element with an empty string, which is obviously bogus. This patch adds more sanity checks of id strings. Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/control.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/core/control.c b/sound/core/control.c index 98a29b26c5f4..f2082a35b890 100644 --- a/sound/core/control.c +++ b/sound/core/control.c @@ -1168,6 +1168,10 @@ static int snd_ctl_elem_add(struct snd_ctl_file *file, if (info->count < 1) return -EINVAL; + if (!*info->id.name) + return -EINVAL; + if (strnlen(info->id.name, sizeof(info->id.name)) >= sizeof(info->id.name)) + return -EINVAL; access = info->access == 0 ? SNDRV_CTL_ELEM_ACCESS_READWRITE : (info->access & (SNDRV_CTL_ELEM_ACCESS_READWRITE| SNDRV_CTL_ELEM_ACCESS_INACTIVE| From bdd7878b488078678dc7a04ca3b784f6992c6490 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 11 Mar 2015 16:05:19 +0100 Subject: [PATCH 042/236] ALSA: hda - Fix built-in mic on Compaq Presario CQ60 commit ddb6ca75b5671b8fbf1909bc588c449ee74b34f9 upstream. Compaq Presario CQ60 laptop with CX20561 gives a wrong pin for the built-in mic NID 0x17 instead of NID 0x1d, and it results in the non-working mic. This patch just remaps the pin correctly via fixup. Bugzilla: https://bugzilla.opensuse.org/show_bug.cgi?id=920604 Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_conexant.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c index ffc19464b978..976493c4a695 100644 --- a/sound/pci/hda/patch_conexant.c +++ b/sound/pci/hda/patch_conexant.c @@ -3232,6 +3232,7 @@ enum { CXT_PINCFG_LENOVO_TP410, CXT_PINCFG_LEMOTE_A1004, CXT_PINCFG_LEMOTE_A1205, + CXT_PINCFG_COMPAQ_CQ60, CXT_FIXUP_STEREO_DMIC, CXT_FIXUP_INC_MIC_BOOST, CXT_FIXUP_HEADPHONE_MIC_PIN, @@ -3368,6 +3369,15 @@ static const struct hda_fixup cxt_fixups[] = { .type = HDA_FIXUP_PINS, .v.pins = cxt_pincfg_lemote, }, + [CXT_PINCFG_COMPAQ_CQ60] = { + .type = HDA_FIXUP_PINS, + .v.pins = (const struct hda_pintbl[]) { + /* 0x17 was falsely set up as a mic, it should 0x1d */ + { 0x17, 0x400001f0 }, + { 0x1d, 0x97a70120 }, + { } + } + }, [CXT_FIXUP_STEREO_DMIC] = { .type = HDA_FIXUP_FUNC, .v.func = cxt_fixup_stereo_dmic, @@ -3411,6 +3421,7 @@ static const struct hda_fixup cxt_fixups[] = { }; static const struct snd_pci_quirk cxt5051_fixups[] = { + SND_PCI_QUIRK(0x103c, 0x360b, "Compaq CQ60", CXT_PINCFG_COMPAQ_CQ60), SND_PCI_QUIRK(0x17aa, 0x20f2, "Lenovo X200", CXT_PINCFG_LENOVO_X200), {} }; From 69699bb5779e3c2a0dc0ad36466eb86f277afdfc Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 12 Mar 2015 08:30:11 +0100 Subject: [PATCH 043/236] ALSA: hda - Don't access stereo amps for mono channel widgets commit ef403edb75580a3ec5d155f5de82155f0419c621 upstream. The current HDA generic parser initializes / modifies the amp values always in stereo, but this seems causing the problem on ALC3229 codec that has a few mono channel widgets: namely, these mono widgets react to actions for both channels equally. In the driver code, we do care the mono channel and create a control only for the left channel (as defined in HD-audio spec) for such a node. When the control is updated, only the left channel value is changed. However, in the resume, the right channel value is also restored from the initial value we took as stereo, and this overwrites the left channel value. This ends up being the silent output as the right channel has been never touched and remains muted. This patch covers the places where unconditional stereo amp accesses are done and converts to the conditional accesses. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=94581 Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/hda_generic.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/sound/pci/hda/hda_generic.c b/sound/pci/hda/hda_generic.c index d9a09bdd09db..97a3b4214914 100644 --- a/sound/pci/hda/hda_generic.c +++ b/sound/pci/hda/hda_generic.c @@ -658,7 +658,23 @@ static void init_amp(struct hda_codec *codec, hda_nid_t nid, int dir, int idx) { unsigned int caps = query_amp_caps(codec, nid, dir); int val = get_amp_val_to_activate(codec, nid, dir, caps, false); - snd_hda_codec_amp_init_stereo(codec, nid, dir, idx, 0xff, val); + + if (get_wcaps(codec, nid) & AC_WCAP_STEREO) + snd_hda_codec_amp_init_stereo(codec, nid, dir, idx, 0xff, val); + else + snd_hda_codec_amp_init(codec, nid, 0, dir, idx, 0xff, val); +} + +/* update the amp, doing in stereo or mono depending on NID */ +static int update_amp(struct hda_codec *codec, hda_nid_t nid, int dir, int idx, + unsigned int mask, unsigned int val) +{ + if (get_wcaps(codec, nid) & AC_WCAP_STEREO) + return snd_hda_codec_amp_stereo(codec, nid, dir, idx, + mask, val); + else + return snd_hda_codec_amp_update(codec, nid, 0, dir, idx, + mask, val); } /* calculate amp value mask we can modify; @@ -698,7 +714,7 @@ static void activate_amp(struct hda_codec *codec, hda_nid_t nid, int dir, return; val &= mask; - snd_hda_codec_amp_stereo(codec, nid, dir, idx, mask, val); + update_amp(codec, nid, dir, idx, mask, val); } static void activate_amp_out(struct hda_codec *codec, struct nid_path *path, @@ -4337,13 +4353,11 @@ static void mute_all_mixer_nid(struct hda_codec *codec, hda_nid_t mix) has_amp = nid_has_mute(codec, mix, HDA_INPUT); for (i = 0; i < nums; i++) { if (has_amp) - snd_hda_codec_amp_stereo(codec, mix, - HDA_INPUT, i, - 0xff, HDA_AMP_MUTE); + update_amp(codec, mix, HDA_INPUT, i, + 0xff, HDA_AMP_MUTE); else if (nid_has_volume(codec, conn[i], HDA_OUTPUT)) - snd_hda_codec_amp_stereo(codec, conn[i], - HDA_OUTPUT, 0, - 0xff, HDA_AMP_MUTE); + update_amp(codec, conn[i], HDA_OUTPUT, 0, + 0xff, HDA_AMP_MUTE); } } From ff5af001ff1539e762f0d5d054ca476376f3fa69 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 12 Mar 2015 20:28:04 +0100 Subject: [PATCH 044/236] ALSA: hda - Set single_adc_amp flag for CS420x codecs commit bad994f5b4ab57eec8d56c180edca00505c3eeb2 upstream. CS420x codecs seem to deal only the single amps of ADC nodes even though the nodes receive multiple inputs. This leads to the inconsistent amp value after S3/S4 resume, for example. The fix is just to set codec->single_adc_amp flag. Then the driver handles these ADC amps as if single connections. Reported-and-tested-by: Vasil Zlatanov Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_cirrus.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_cirrus.c b/sound/pci/hda/patch_cirrus.c index fc492ac24caa..00031e98cbd7 100644 --- a/sound/pci/hda/patch_cirrus.c +++ b/sound/pci/hda/patch_cirrus.c @@ -587,6 +587,7 @@ static int patch_cs420x(struct hda_codec *codec) return -ENOMEM; spec->gen.automute_hook = cs_automute; + codec->single_adc_amp = 1; snd_hda_pick_fixup(codec, cs420x_models, cs420x_fixup_tbl, cs420x_fixups); From dc83efb092af95df9b4cdc460fc63f3144bfb0f6 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 12 Mar 2015 20:47:15 +0100 Subject: [PATCH 045/236] ALSA: hda - Add workaround for MacBook Air 5,2 built-in mic commit 2ddee91abe9cc34ddb6294ee14702b46ae07d460 upstream. MacBook Air 5,2 has the same problem as MacBook Pro 8,1 where the built-in mic records only the right channel. Apply the same workaround as MBP8,1 to spread the mono channel via a Cirrus codec vendor-specific COEF setup. Reported-and-tested-by: Vasil Zlatanov Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_cirrus.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_cirrus.c b/sound/pci/hda/patch_cirrus.c index 00031e98cbd7..51e208022cc8 100644 --- a/sound/pci/hda/patch_cirrus.c +++ b/sound/pci/hda/patch_cirrus.c @@ -396,6 +396,7 @@ static const struct snd_pci_quirk cs420x_fixup_tbl[] = { SND_PCI_QUIRK(0x106b, 0x1c00, "MacBookPro 8,1", CS420X_MBP81), SND_PCI_QUIRK(0x106b, 0x2000, "iMac 12,2", CS420X_IMAC27_122), SND_PCI_QUIRK(0x106b, 0x2800, "MacBookPro 10,1", CS420X_MBP101), + SND_PCI_QUIRK(0x106b, 0x5600, "MacBookAir 5,2", CS420X_MBP81), SND_PCI_QUIRK(0x106b, 0x5b00, "MacBookAir 4,2", CS420X_MBA42), SND_PCI_QUIRK_VENDOR(0x106b, "Apple", CS420X_APPLE), {} /* terminator */ From b62e3b444198b20d3b0eef7a06779a48231fe12b Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Sun, 8 Mar 2015 18:29:50 +0100 Subject: [PATCH 046/236] ALSA: hda - Fix regression of HD-audio controller fallback modes commit a1f3f1ca66bd12c339b17a0c2ef93a093f90a277 upstream. The commit [63e51fd708f5: ALSA: hda - Don't take unresponsive D3 transition too serious] introduced a conditional fallback behavior to the HD-audio controller depending on the flag set. However, it introduced a silly bug, too, that the flag was evaluated in a reverse way. This resulted in a regression of HD-audio controller driver where it can't go to the fallback mode at communication errors. Unfortunately (or fortunately?) this didn't come up until recently because the affected code path is an error handling that happens only on an unstable hardware chip. Most of recent chips work stably, thus they didn't hit this problem. Now, we've got a regression report with a VIA chip, and this seems indeed requiring the fallback to the polling mode, and finally the bug was revealed. The fix is a oneliner to remove the wrong logical NOT in the check. (Lesson learned - be careful about double negation.) The bug should be backported to stable, but the patch won't be applicable to 3.13 or earlier because of the code splits. The stable fix patches for earlier kernels will be posted later manually. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=94021 Fixes: 63e51fd708f5 ('ALSA: hda - Don't take unresponsive D3 transition too serious') Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/hda_intel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index 2f3059b50ffa..84e8879cc372 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -959,7 +959,7 @@ static unsigned int azx_rirb_get_response(struct hda_bus *bus, } } - if (!bus->no_response_fallback) + if (bus->no_response_fallback) return -1; if (!chip->polling_mode && chip->poll_count < 2) { From 9e580aea6979631799cdaa94ff1d67856ccfce90 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 16 Mar 2015 10:18:08 +0100 Subject: [PATCH 047/236] ALSA: hda - Treat stereo-to-mono mix properly commit cc261738add93947d138d2fabad9f4dbed4e5c00 upstream. The commit [ef403edb7558: ALSA: hda - Don't access stereo amps for mono channel widgets] fixed the handling of mono widgets in general, but it still misses an exceptional case: namely, a mono mixer widget taking a single stereo input. In this case, it has stereo volumes although it's a mono widget, and thus we have to take care of both left and right input channels, as stated in HD-audio spec ("7.1.3 Widget Interconnection Rules"). This patch covers this missing piece by adding proper checks of stereo amps in both the generic parser and the proc output codes. Reported-by: Raymond Yau Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/hda_generic.c | 21 ++++++++++++++++++-- sound/pci/hda/hda_proc.c | 38 +++++++++++++++++++++++++++++-------- 2 files changed, 49 insertions(+), 10 deletions(-) diff --git a/sound/pci/hda/hda_generic.c b/sound/pci/hda/hda_generic.c index 97a3b4214914..9a23bdea97d8 100644 --- a/sound/pci/hda/hda_generic.c +++ b/sound/pci/hda/hda_generic.c @@ -653,13 +653,30 @@ static int get_amp_val_to_activate(struct hda_codec *codec, hda_nid_t nid, return val; } +/* is this a stereo widget or a stereo-to-mono mix? */ +static bool is_stereo_amps(struct hda_codec *codec, hda_nid_t nid, int dir) +{ + unsigned int wcaps = get_wcaps(codec, nid); + hda_nid_t conn; + + if (wcaps & AC_WCAP_STEREO) + return true; + if (dir != HDA_INPUT || get_wcaps_type(wcaps) != AC_WID_AUD_MIX) + return false; + if (snd_hda_get_num_conns(codec, nid) != 1) + return false; + if (snd_hda_get_connections(codec, nid, &conn, 1) < 0) + return false; + return !!(get_wcaps(codec, conn) & AC_WCAP_STEREO); +} + /* initialize the amp value (only at the first time) */ static void init_amp(struct hda_codec *codec, hda_nid_t nid, int dir, int idx) { unsigned int caps = query_amp_caps(codec, nid, dir); int val = get_amp_val_to_activate(codec, nid, dir, caps, false); - if (get_wcaps(codec, nid) & AC_WCAP_STEREO) + if (is_stereo_amps(codec, nid, dir)) snd_hda_codec_amp_init_stereo(codec, nid, dir, idx, 0xff, val); else snd_hda_codec_amp_init(codec, nid, 0, dir, idx, 0xff, val); @@ -669,7 +686,7 @@ static void init_amp(struct hda_codec *codec, hda_nid_t nid, int dir, int idx) static int update_amp(struct hda_codec *codec, hda_nid_t nid, int dir, int idx, unsigned int mask, unsigned int val) { - if (get_wcaps(codec, nid) & AC_WCAP_STEREO) + if (is_stereo_amps(codec, nid, dir)) return snd_hda_codec_amp_stereo(codec, nid, dir, idx, mask, val); else diff --git a/sound/pci/hda/hda_proc.c b/sound/pci/hda/hda_proc.c index ce5a6da83419..05e19f78b4cb 100644 --- a/sound/pci/hda/hda_proc.c +++ b/sound/pci/hda/hda_proc.c @@ -134,13 +134,38 @@ static void print_amp_caps(struct snd_info_buffer *buffer, (caps & AC_AMPCAP_MUTE) >> AC_AMPCAP_MUTE_SHIFT); } +/* is this a stereo widget or a stereo-to-mono mix? */ +static bool is_stereo_amps(struct hda_codec *codec, hda_nid_t nid, + int dir, unsigned int wcaps, int indices) +{ + hda_nid_t conn; + + if (wcaps & AC_WCAP_STEREO) + return true; + /* check for a stereo-to-mono mix; it must be: + * only a single connection, only for input, and only a mixer widget + */ + if (indices != 1 || dir != HDA_INPUT || + get_wcaps_type(wcaps) != AC_WID_AUD_MIX) + return false; + + if (snd_hda_get_raw_connections(codec, nid, &conn, 1) < 0) + return false; + /* the connection source is a stereo? */ + wcaps = snd_hda_param_read(codec, conn, AC_PAR_AUDIO_WIDGET_CAP); + return !!(wcaps & AC_WCAP_STEREO); +} + static void print_amp_vals(struct snd_info_buffer *buffer, struct hda_codec *codec, hda_nid_t nid, - int dir, int stereo, int indices) + int dir, unsigned int wcaps, int indices) { unsigned int val; + bool stereo; int i; + stereo = is_stereo_amps(codec, nid, dir, wcaps, indices); + dir = dir == HDA_OUTPUT ? AC_AMP_GET_OUTPUT : AC_AMP_GET_INPUT; for (i = 0; i < indices; i++) { snd_iprintf(buffer, " ["); @@ -757,12 +782,10 @@ static void print_codec_info(struct snd_info_entry *entry, (codec->single_adc_amp && wid_type == AC_WID_AUD_IN)) print_amp_vals(buffer, codec, nid, HDA_INPUT, - wid_caps & AC_WCAP_STEREO, - 1); + wid_caps, 1); else print_amp_vals(buffer, codec, nid, HDA_INPUT, - wid_caps & AC_WCAP_STEREO, - conn_len); + wid_caps, conn_len); } if (wid_caps & AC_WCAP_OUT_AMP) { snd_iprintf(buffer, " Amp-Out caps: "); @@ -771,11 +794,10 @@ static void print_codec_info(struct snd_info_entry *entry, if (wid_type == AC_WID_PIN && codec->pin_amp_workaround) print_amp_vals(buffer, codec, nid, HDA_OUTPUT, - wid_caps & AC_WCAP_STEREO, - conn_len); + wid_caps, conn_len); else print_amp_vals(buffer, codec, nid, HDA_OUTPUT, - wid_caps & AC_WCAP_STEREO, 1); + wid_caps, 1); } switch (wid_type) { From a9395cba088ceb0328892bca1fd18b45ebc3bb22 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Wed, 18 Feb 2015 11:32:07 +0100 Subject: [PATCH 048/236] mtd: nand: pxa3xx: Fix PIO FIFO draining commit 8dad0386b97c4bd6edd56752ca7f2e735fe5beb4 upstream. The NDDB register holds the data that are needed by the read and write commands. However, during a read PIO access, the datasheet specifies that after each 32 bytes read in that register, when BCH is enabled, we have to make sure that the RDDREQ bit is set in the NDSR register. This fixes an issue that was seen on the Armada 385, and presumably other mvebu SoCs, when a read on a newly erased page would end up in the driver reporting a timeout from the NAND. Signed-off-by: Maxime Ripard Reviewed-by: Boris Brezillon Acked-by: Ezequiel Garcia Signed-off-by: Brian Norris Signed-off-by: Greg Kroah-Hartman --- drivers/mtd/nand/pxa3xx_nand.c | 48 +++++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/drivers/mtd/nand/pxa3xx_nand.c b/drivers/mtd/nand/pxa3xx_nand.c index 51e15fd53108..d058b00ba218 100644 --- a/drivers/mtd/nand/pxa3xx_nand.c +++ b/drivers/mtd/nand/pxa3xx_nand.c @@ -481,6 +481,42 @@ static void disable_int(struct pxa3xx_nand_info *info, uint32_t int_mask) nand_writel(info, NDCR, ndcr | int_mask); } +static void drain_fifo(struct pxa3xx_nand_info *info, void *data, int len) +{ + if (info->ecc_bch) { + int timeout; + + /* + * According to the datasheet, when reading from NDDB + * with BCH enabled, after each 32 bytes reads, we + * have to make sure that the NDSR.RDDREQ bit is set. + * + * Drain the FIFO 8 32 bits reads at a time, and skip + * the polling on the last read. + */ + while (len > 8) { + __raw_readsl(info->mmio_base + NDDB, data, 8); + + for (timeout = 0; + !(nand_readl(info, NDSR) & NDSR_RDDREQ); + timeout++) { + if (timeout >= 5) { + dev_err(&info->pdev->dev, + "Timeout on RDDREQ while draining the FIFO\n"); + return; + } + + mdelay(1); + } + + data += 32; + len -= 8; + } + } + + __raw_readsl(info->mmio_base + NDDB, data, len); +} + static void handle_data_pio(struct pxa3xx_nand_info *info) { unsigned int do_bytes = min(info->data_size, info->chunk_size); @@ -497,14 +533,14 @@ static void handle_data_pio(struct pxa3xx_nand_info *info) DIV_ROUND_UP(info->oob_size, 4)); break; case STATE_PIO_READING: - __raw_readsl(info->mmio_base + NDDB, - info->data_buff + info->data_buff_pos, - DIV_ROUND_UP(do_bytes, 4)); + drain_fifo(info, + info->data_buff + info->data_buff_pos, + DIV_ROUND_UP(do_bytes, 4)); if (info->oob_size > 0) - __raw_readsl(info->mmio_base + NDDB, - info->oob_buff + info->oob_buff_pos, - DIV_ROUND_UP(info->oob_size, 4)); + drain_fifo(info, + info->oob_buff + info->oob_buff_pos, + DIV_ROUND_UP(info->oob_size, 4)); break; default: dev_err(&info->pdev->dev, "%s: invalid state %d\n", __func__, From 5c7a4c4def1e14ff083966ef7ffec6d2ee3bace1 Mon Sep 17 00:00:00 2001 From: Brian King Date: Wed, 4 Mar 2015 08:09:44 -0600 Subject: [PATCH 049/236] bnx2x: Force fundamental reset for EEH recovery commit da293700568ed3d96fcf062ac15d7d7c41377f11 upstream. EEH recovery for bnx2x based adapters is not reliable on all Power systems using the default hot reset, which can result in an unrecoverable EEH error. Forcing the use of fundamental reset during EEH recovery fixes this. Signed-off-by: Brian King Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c index 7d4382286457..242874041ba4 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c @@ -12395,6 +12395,9 @@ static int bnx2x_init_dev(struct bnx2x *bp, struct pci_dev *pdev, pci_write_config_dword(bp->pdev, PCICFG_GRC_ADDRESS, PCICFG_VENDOR_ID_OFFSET); + /* Set PCIe reset type to fundamental for EEH recovery */ + pdev->needs_freset = 1; + /* AER (Advanced Error reporting) configuration */ rc = pci_enable_pcie_error_reporting(pdev); if (!rc) From 6939dad78726fbe2f6dd5f66d08e0ab1d2002e03 Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Mon, 2 Mar 2015 21:40:39 +0100 Subject: [PATCH 050/236] regulator: Only enable disabled regulators on resume commit 0548bf4f5ad6fc3bd93c4940fa48078b34609682 upstream. The _regulator_do_enable() call ought to be a no-op when called on an already-enabled regulator. However, as an optimization _regulator_enable() doesn't call _regulator_do_enable() on an already enabled regulator. That means we never test the case of calling _regulator_do_enable() during normal usage and there may be hidden bugs or warnings. We have seen warnings issued by the tps65090 driver and bugs when using the GPIO enable pin. Let's match the same optimization that _regulator_enable() in regulator_suspend_finish(). That may speed up suspend/resume and also avoids exposing hidden bugs. [Use much clearer commit message from Doug Anderson] Signed-off-by: Javier Martinez Canillas Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- drivers/regulator/core.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index b7984044232d..2dfffca10dec 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -3631,9 +3631,11 @@ int regulator_suspend_finish(void) list_for_each_entry(rdev, ®ulator_list, list) { mutex_lock(&rdev->mutex); if (rdev->use_count > 0 || rdev->constraints->always_on) { - error = _regulator_do_enable(rdev); - if (error) - ret = error; + if (!_regulator_is_enabled(rdev)) { + error = _regulator_do_enable(rdev); + if (error) + ret = error; + } } else { if (!have_full_constraints()) goto unlock; From 32b783339fd88aaef35840d2695ce430680db807 Mon Sep 17 00:00:00 2001 From: Doug Anderson Date: Tue, 3 Mar 2015 15:20:47 -0800 Subject: [PATCH 051/236] regulator: core: Fix enable GPIO reference counting commit 29d62ec5f87fbeec8413e2215ddad12e7f972e4c upstream. Normally _regulator_do_enable() isn't called on an already-enabled rdev. That's because the main caller, _regulator_enable() always calls _regulator_is_enabled() and only calls _regulator_do_enable() if the rdev was not already enabled. However, there is one caller of _regulator_do_enable() that doesn't check: regulator_suspend_finish(). While we might want to make regulator_suspend_finish() behave more like _regulator_enable(), it's probably also a good idea to make _regulator_do_enable() robust if it is called on an already enabled rdev. At the moment, _regulator_do_enable() is _not_ robust for already enabled rdevs if we're using an ena_pin. Each time _regulator_do_enable() is called for an rdev using an ena_pin the reference count of the ena_pin is incremented even if the rdev was already enabled. This is not as intended because the ena_pin is for something else: for keeping track of how many active rdevs there are sharing the same ena_pin. Here's how the reference counting works here: * Each time _regulator_enable() is called we increment rdev->use_count, so _regulator_enable() calls need to be balanced with _regulator_disable() calls. * There is no explicit reference counting in _regulator_do_enable() which is normally just a warapper around rdev->desc->ops->enable() with code for supporting delays. It's not expected that the "ops->enable()" call do reference counting. * Since regulator_ena_gpio_ctrl() does have reference counting (handling the sharing of the pin amongst multiple rdevs), we shouldn't call it if the current rdev is already enabled. Note that as part of this we cleanup (remove) the initting of ena_gpio_state in regulator_register(). In _regulator_do_enable(), _regulator_do_disable() and _regulator_is_enabled() is is clear that ena_gpio_state should be the state of whether this particular rdev has requested the GPIO be enabled. regulator_register() was initting it as the actual state of the pin. Fixes: 967cfb18c0e3 ("regulator: core: manage enable GPIO list") Signed-off-by: Doug Anderson Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- drivers/regulator/core.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 2dfffca10dec..5d8d2dcd975e 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -1764,10 +1764,12 @@ static int _regulator_do_enable(struct regulator_dev *rdev) trace_regulator_enable(rdev_get_name(rdev)); if (rdev->ena_pin) { - ret = regulator_ena_gpio_ctrl(rdev, true); - if (ret < 0) - return ret; - rdev->ena_gpio_state = 1; + if (!rdev->ena_gpio_state) { + ret = regulator_ena_gpio_ctrl(rdev, true); + if (ret < 0) + return ret; + rdev->ena_gpio_state = 1; + } } else if (rdev->desc->ops->enable) { ret = rdev->desc->ops->enable(rdev); if (ret < 0) @@ -1897,10 +1899,12 @@ static int _regulator_do_disable(struct regulator_dev *rdev) trace_regulator_disable(rdev_get_name(rdev)); if (rdev->ena_pin) { - ret = regulator_ena_gpio_ctrl(rdev, false); - if (ret < 0) - return ret; - rdev->ena_gpio_state = 0; + if (rdev->ena_gpio_state) { + ret = regulator_ena_gpio_ctrl(rdev, false); + if (ret < 0) + return ret; + rdev->ena_gpio_state = 0; + } } else if (rdev->desc->ops->disable) { ret = rdev->desc->ops->disable(rdev); @@ -3454,12 +3458,6 @@ regulator_register(const struct regulator_desc *regulator_desc, config->ena_gpio, ret); goto wash; } - - if (config->ena_gpio_flags & GPIOF_OUT_INIT_HIGH) - rdev->ena_gpio_state = 1; - - if (config->ena_gpio_invert) - rdev->ena_gpio_state = !rdev->ena_gpio_state; } /* set regulator constraints */ From 64580cdff828d554d78010a8b19fef0408fdd2bd Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Thu, 12 Mar 2015 16:26:00 -0700 Subject: [PATCH 052/236] nilfs2: fix deadlock of segment constructor during recovery commit 283ee1482f349d6c0c09dfb725db5880afc56813 upstream. According to a report from Yuxuan Shui, nilfs2 in kernel 3.19 got stuck during recovery at mount time. The code path that caused the deadlock was as follows: nilfs_fill_super() load_nilfs() nilfs_salvage_orphan_logs() * Do roll-forwarding, attach segment constructor for recovery, and kick it. nilfs_segctor_thread() nilfs_segctor_thread_construct() * A lock is held with nilfs_transaction_lock() nilfs_segctor_do_construct() nilfs_segctor_drop_written_files() iput() iput_final() write_inode_now() writeback_single_inode() __writeback_single_inode() do_writepages() nilfs_writepage() nilfs_construct_dsync_segment() nilfs_transaction_lock() --> deadlock This can happen if commit 7ef3ff2fea8b ("nilfs2: fix deadlock of segment constructor over I_SYNC flag") is applied and roll-forward recovery was performed at mount time. The roll-forward recovery can happen if datasync write is done and the file system crashes immediately after that. For instance, we can reproduce the issue with the following steps: < nilfs2 is mounted on /nilfs (device: /dev/sdb1) > # dd if=/dev/zero of=/nilfs/test bs=4k count=1 && sync # dd if=/dev/zero of=/nilfs/test conv=notrunc oflag=dsync bs=4k count=1 && reboot -nfh < the system will immediately reboot > # mount -t nilfs2 /dev/sdb1 /nilfs The deadlock occurs because iput() can run segment constructor through writeback_single_inode() if MS_ACTIVE flag is not set on sb->s_flags. The above commit changed segment constructor so that it calls iput() asynchronously for inodes with i_nlink == 0, but that change was imperfect. This fixes the another deadlock by deferring iput() in segment constructor even for the case that mount is not finished, that is, for the case that MS_ACTIVE flag is not set. Signed-off-by: Ryusuke Konishi Reported-by: Yuxuan Shui Tested-by: Ryusuke Konishi Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/nilfs2/segment.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 5bee81674d53..14538a865102 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1906,6 +1906,7 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci, struct the_nilfs *nilfs) { struct nilfs_inode_info *ii, *n; + int during_mount = !(sci->sc_super->s_flags & MS_ACTIVE); int defer_iput = false; spin_lock(&nilfs->ns_inode_lock); @@ -1918,10 +1919,10 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci, brelse(ii->i_bh); ii->i_bh = NULL; list_del_init(&ii->i_dirty); - if (!ii->vfs_inode.i_nlink) { + if (!ii->vfs_inode.i_nlink || during_mount) { /* - * Defer calling iput() to avoid a deadlock - * over I_SYNC flag for inodes with i_nlink == 0 + * Defer calling iput() to avoid deadlocks if + * i_nlink == 0 or mount is not yet finished. */ list_add_tail(&ii->i_dirty, &sci->sc_iput_queue); defer_iput = true; From 472cd1f6c9bfb48d5bab283dbf123837dd4e2cbb Mon Sep 17 00:00:00 2001 From: Thomas Hellstrom Date: Thu, 5 Mar 2015 02:33:24 -0800 Subject: [PATCH 053/236] drm/vmwgfx: Reorder device takedown somewhat commit 3458390b9f0ba784481d23134798faee27b5f16f upstream. To take down the MOB and GMR memory types, the driver may have to issue fence objects and thus make sure that the fence manager is taken down after those memory types. Reorder device init accordingly. Signed-off-by: Thomas Hellstrom Reviewed-by: Sinclair Yeh Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/vmwgfx/vmwgfx_drv.c | 77 +++++++++++++++-------------- 1 file changed, 40 insertions(+), 37 deletions(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c index fb7c36e93fd4..0771dcbf9ed0 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c @@ -733,32 +733,6 @@ static int vmw_driver_load(struct drm_device *dev, unsigned long chipset) goto out_err1; } - ret = ttm_bo_init_mm(&dev_priv->bdev, TTM_PL_VRAM, - (dev_priv->vram_size >> PAGE_SHIFT)); - if (unlikely(ret != 0)) { - DRM_ERROR("Failed initializing memory manager for VRAM.\n"); - goto out_err2; - } - - dev_priv->has_gmr = true; - if (((dev_priv->capabilities & (SVGA_CAP_GMR | SVGA_CAP_GMR2)) == 0) || - refuse_dma || ttm_bo_init_mm(&dev_priv->bdev, VMW_PL_GMR, - VMW_PL_GMR) != 0) { - DRM_INFO("No GMR memory available. " - "Graphics memory resources are very limited.\n"); - dev_priv->has_gmr = false; - } - - if (dev_priv->capabilities & SVGA_CAP_GBOBJECTS) { - dev_priv->has_mob = true; - if (ttm_bo_init_mm(&dev_priv->bdev, VMW_PL_MOB, - VMW_PL_MOB) != 0) { - DRM_INFO("No MOB memory available. " - "3D will be disabled.\n"); - dev_priv->has_mob = false; - } - } - dev_priv->mmio_mtrr = arch_phys_wc_add(dev_priv->mmio_start, dev_priv->mmio_size); @@ -821,6 +795,33 @@ static int vmw_driver_load(struct drm_device *dev, unsigned long chipset) goto out_no_fman; } + + ret = ttm_bo_init_mm(&dev_priv->bdev, TTM_PL_VRAM, + (dev_priv->vram_size >> PAGE_SHIFT)); + if (unlikely(ret != 0)) { + DRM_ERROR("Failed initializing memory manager for VRAM.\n"); + goto out_no_vram; + } + + dev_priv->has_gmr = true; + if (((dev_priv->capabilities & (SVGA_CAP_GMR | SVGA_CAP_GMR2)) == 0) || + refuse_dma || ttm_bo_init_mm(&dev_priv->bdev, VMW_PL_GMR, + VMW_PL_GMR) != 0) { + DRM_INFO("No GMR memory available. " + "Graphics memory resources are very limited.\n"); + dev_priv->has_gmr = false; + } + + if (dev_priv->capabilities & SVGA_CAP_GBOBJECTS) { + dev_priv->has_mob = true; + if (ttm_bo_init_mm(&dev_priv->bdev, VMW_PL_MOB, + VMW_PL_MOB) != 0) { + DRM_INFO("No MOB memory available. " + "3D will be disabled.\n"); + dev_priv->has_mob = false; + } + } + vmw_kms_save_vga(dev_priv); /* Start kms and overlay systems, needs fifo. */ @@ -846,6 +847,12 @@ out_no_fifo: vmw_kms_close(dev_priv); out_no_kms: vmw_kms_restore_vga(dev_priv); + if (dev_priv->has_mob) + (void) ttm_bo_clean_mm(&dev_priv->bdev, VMW_PL_MOB); + if (dev_priv->has_gmr) + (void) ttm_bo_clean_mm(&dev_priv->bdev, VMW_PL_GMR); + (void)ttm_bo_clean_mm(&dev_priv->bdev, TTM_PL_VRAM); +out_no_vram: vmw_fence_manager_takedown(dev_priv->fman); out_no_fman: if (dev_priv->capabilities & SVGA_CAP_IRQMASK) @@ -861,12 +868,6 @@ out_err4: iounmap(dev_priv->mmio_virt); out_err3: arch_phys_wc_del(dev_priv->mmio_mtrr); - if (dev_priv->has_mob) - (void) ttm_bo_clean_mm(&dev_priv->bdev, VMW_PL_MOB); - if (dev_priv->has_gmr) - (void) ttm_bo_clean_mm(&dev_priv->bdev, VMW_PL_GMR); - (void)ttm_bo_clean_mm(&dev_priv->bdev, TTM_PL_VRAM); -out_err2: (void)ttm_bo_device_release(&dev_priv->bdev); out_err1: vmw_ttm_global_release(dev_priv); @@ -896,6 +897,13 @@ static int vmw_driver_unload(struct drm_device *dev) } vmw_kms_close(dev_priv); vmw_overlay_close(dev_priv); + + if (dev_priv->has_mob) + (void) ttm_bo_clean_mm(&dev_priv->bdev, VMW_PL_MOB); + if (dev_priv->has_gmr) + (void)ttm_bo_clean_mm(&dev_priv->bdev, VMW_PL_GMR); + (void)ttm_bo_clean_mm(&dev_priv->bdev, TTM_PL_VRAM); + vmw_fence_manager_takedown(dev_priv->fman); if (dev_priv->capabilities & SVGA_CAP_IRQMASK) drm_irq_uninstall(dev_priv->dev); @@ -907,11 +915,6 @@ static int vmw_driver_unload(struct drm_device *dev) ttm_object_device_release(&dev_priv->tdev); iounmap(dev_priv->mmio_virt); arch_phys_wc_del(dev_priv->mmio_mtrr); - if (dev_priv->has_mob) - (void) ttm_bo_clean_mm(&dev_priv->bdev, VMW_PL_MOB); - if (dev_priv->has_gmr) - (void)ttm_bo_clean_mm(&dev_priv->bdev, VMW_PL_GMR); - (void)ttm_bo_clean_mm(&dev_priv->bdev, TTM_PL_VRAM); (void)ttm_bo_device_release(&dev_priv->bdev); vmw_ttm_global_release(dev_priv); From a5b8311fe0adc3d83b353c8e13899ea71bc51723 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 26 Feb 2015 06:52:05 +0100 Subject: [PATCH 054/236] xen/events: avoid NULL pointer dereference in dom0 on large machines commit 85e40b0539b24518c8bdf63e2605c8522377d00f upstream. Using the pvops kernel a NULL pointer dereference was detected on a large machine (144 processors) when booting as dom0 in evtchn_fifo_unmask() during assignment of a pirq. The event channel in question was the first to need a new entry in event_array[] in events_fifo.c. Unfortunately xen_irq_info_pirq_setup() is called with evtchn being 0 for a new pirq and the real event channel number is assigned to the pirq only during __startup_pirq(). It is mandatory to call xen_evtchn_port_setup() after assigning the event channel number to the pirq to make sure all memory needed for the event channel is allocated. Signed-off-by: Juergen Gross Signed-off-by: David Vrabel Signed-off-by: Greg Kroah-Hartman --- drivers/xen/events/events_base.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index f4a9e3311297..c8860a8757ac 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -547,20 +547,26 @@ static unsigned int __startup_pirq(unsigned int irq) pirq_query_unmask(irq); rc = set_evtchn_to_irq(evtchn, irq); - if (rc != 0) { - pr_err("irq%d: Failed to set port to irq mapping (%d)\n", - irq, rc); - xen_evtchn_close(evtchn); - return 0; - } + if (rc) + goto err; + bind_evtchn_to_cpu(evtchn, 0); info->evtchn = evtchn; + rc = xen_evtchn_port_setup(info); + if (rc) + goto err; + out: unmask_evtchn(evtchn); eoi_pirq(irq_get_irq_data(irq)); return 0; + +err: + pr_err("irq%d: Failed to set port to irq mapping (%d)\n", irq, rc); + xen_evtchn_close(evtchn); + return 0; } static unsigned int startup_pirq(struct irq_data *data) From 9cb8c24e612f1724fa85872205a1dfae7603d874 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 11 Mar 2015 13:51:17 +0000 Subject: [PATCH 055/236] xen-pciback: limit guest control of command register commit af6fc858a35b90e89ea7a7ee58e66628c55c776b upstream. Otherwise the guest can abuse that control to cause e.g. PCIe Unsupported Request responses by disabling memory and/or I/O decoding and subsequently causing (CPU side) accesses to the respective address ranges, which (depending on system configuration) may be fatal to the host. Note that to alter any of the bits collected together as PCI_COMMAND_GUEST permissive mode is now required to be enabled globally or on the specific device. This is CVE-2015-2150 / XSA-120. Signed-off-by: Jan Beulich Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: David Vrabel Signed-off-by: Greg Kroah-Hartman --- drivers/xen/xen-pciback/conf_space.c | 2 +- drivers/xen/xen-pciback/conf_space.h | 2 + drivers/xen/xen-pciback/conf_space_header.c | 59 ++++++++++++++++----- 3 files changed, 50 insertions(+), 13 deletions(-) diff --git a/drivers/xen/xen-pciback/conf_space.c b/drivers/xen/xen-pciback/conf_space.c index 46ae0f9f02ad..75fe3d466515 100644 --- a/drivers/xen/xen-pciback/conf_space.c +++ b/drivers/xen/xen-pciback/conf_space.c @@ -16,7 +16,7 @@ #include "conf_space.h" #include "conf_space_quirks.h" -static bool permissive; +bool permissive; module_param(permissive, bool, 0644); /* This is where xen_pcibk_read_config_byte, xen_pcibk_read_config_word, diff --git a/drivers/xen/xen-pciback/conf_space.h b/drivers/xen/xen-pciback/conf_space.h index e56c934ad137..2e1d73d1d5d0 100644 --- a/drivers/xen/xen-pciback/conf_space.h +++ b/drivers/xen/xen-pciback/conf_space.h @@ -64,6 +64,8 @@ struct config_field_entry { void *data; }; +extern bool permissive; + #define OFFSET(cfg_entry) ((cfg_entry)->base_offset+(cfg_entry)->field->offset) /* Add fields to a device - the add_fields macro expects to get a pointer to diff --git a/drivers/xen/xen-pciback/conf_space_header.c b/drivers/xen/xen-pciback/conf_space_header.c index c5ee82587e8c..2d7369391472 100644 --- a/drivers/xen/xen-pciback/conf_space_header.c +++ b/drivers/xen/xen-pciback/conf_space_header.c @@ -11,6 +11,10 @@ #include "pciback.h" #include "conf_space.h" +struct pci_cmd_info { + u16 val; +}; + struct pci_bar_info { u32 val; u32 len_val; @@ -20,21 +24,35 @@ struct pci_bar_info { #define is_enable_cmd(value) ((value)&(PCI_COMMAND_MEMORY|PCI_COMMAND_IO)) #define is_master_cmd(value) ((value)&PCI_COMMAND_MASTER) +/* Bits guests are allowed to control in permissive mode. */ +#define PCI_COMMAND_GUEST (PCI_COMMAND_MASTER|PCI_COMMAND_SPECIAL| \ + PCI_COMMAND_INVALIDATE|PCI_COMMAND_VGA_PALETTE| \ + PCI_COMMAND_WAIT|PCI_COMMAND_FAST_BACK) + +static void *command_init(struct pci_dev *dev, int offset) +{ + struct pci_cmd_info *cmd = kmalloc(sizeof(*cmd), GFP_KERNEL); + int err; + + if (!cmd) + return ERR_PTR(-ENOMEM); + + err = pci_read_config_word(dev, PCI_COMMAND, &cmd->val); + if (err) { + kfree(cmd); + return ERR_PTR(err); + } + + return cmd; +} + static int command_read(struct pci_dev *dev, int offset, u16 *value, void *data) { - int i; - int ret; + int ret = pci_read_config_word(dev, offset, value); + const struct pci_cmd_info *cmd = data; - ret = xen_pcibk_read_config_word(dev, offset, value, data); - if (!pci_is_enabled(dev)) - return ret; - - for (i = 0; i < PCI_ROM_RESOURCE; i++) { - if (dev->resource[i].flags & IORESOURCE_IO) - *value |= PCI_COMMAND_IO; - if (dev->resource[i].flags & IORESOURCE_MEM) - *value |= PCI_COMMAND_MEMORY; - } + *value &= PCI_COMMAND_GUEST; + *value |= cmd->val & ~PCI_COMMAND_GUEST; return ret; } @@ -43,6 +61,8 @@ static int command_write(struct pci_dev *dev, int offset, u16 value, void *data) { struct xen_pcibk_dev_data *dev_data; int err; + u16 val; + struct pci_cmd_info *cmd = data; dev_data = pci_get_drvdata(dev); if (!pci_is_enabled(dev) && is_enable_cmd(value)) { @@ -83,6 +103,19 @@ static int command_write(struct pci_dev *dev, int offset, u16 value, void *data) } } + cmd->val = value; + + if (!permissive && (!dev_data || !dev_data->permissive)) + return 0; + + /* Only allow the guest to control certain bits. */ + err = pci_read_config_word(dev, offset, &val); + if (err || val == value) + return err; + + value &= PCI_COMMAND_GUEST; + value |= val & ~PCI_COMMAND_GUEST; + return pci_write_config_word(dev, offset, value); } @@ -282,6 +315,8 @@ static const struct config_field header_common[] = { { .offset = PCI_COMMAND, .size = 2, + .init = command_init, + .release = bar_release, .u.w.read = command_read, .u.w.write = command_write, }, From bb4ed9fd150772a0eb19c7db0ab2952127f8cab0 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Wed, 4 Mar 2015 16:18:33 -0800 Subject: [PATCH 056/236] libsas: Fix Kernel Crash in smp_execute_task commit 6302ce4d80aa82b3fdb5c5cd68e7268037091b47 upstream. This crash was reported: [ 366.947370] sd 3:0:1:0: [sdb] Spinning up disk.... [ 368.804046] BUG: unable to handle kernel NULL pointer dereference at (null) [ 368.804072] IP: [] __mutex_lock_common.isra.7+0x9c/0x15b [ 368.804098] PGD 0 [ 368.804114] Oops: 0002 [#1] SMP [ 368.804143] CPU 1 [ 368.804151] Modules linked in: sg netconsole s3g(PO) uinput joydev hid_multitouch usbhid hid snd_hda_codec_via cpufreq_userspace cpufreq_powersave cpufreq_stats uhci_hcd cpufreq_conservative snd_hda_intel snd_hda_codec snd_hwdep snd_pcm sdhci_pci snd_page_alloc sdhci snd_timer snd psmouse evdev serio_raw pcspkr soundcore xhci_hcd shpchp s3g_drm(O) mvsas mmc_core ahci libahci drm i2c_core acpi_cpufreq mperf video processor button thermal_sys dm_dmirror exfat_fs exfat_core dm_zcache dm_mod padlock_aes aes_generic padlock_sha iscsi_target_mod target_core_mod configfs sswipe libsas libata scsi_transport_sas picdev via_cputemp hwmon_vid fuse parport_pc ppdev lp parport autofs4 ext4 crc16 mbcache jbd2 sd_mod crc_t10dif usb_storage scsi_mod ehci_hcd usbcore usb_common [ 368.804749] [ 368.804764] Pid: 392, comm: kworker/u:3 Tainted: P W O 3.4.87-logicube-ng.22 #1 To be filled by O.E.M. To be filled by O.E.M./EPIA-M920 [ 368.804802] RIP: 0010:[] [] __mutex_lock_common.isra.7+0x9c/0x15b [ 368.804827] RSP: 0018:ffff880117001cc0 EFLAGS: 00010246 [ 368.804842] RAX: 0000000000000000 RBX: ffff8801185030d0 RCX: ffff88008edcb420 [ 368.804857] RDX: 0000000000000000 RSI: 0000000000000002 RDI: ffff8801185030d4 [ 368.804873] RBP: ffff8801181531c0 R08: 0000000000000020 R09: 00000000fffffffe [ 368.804885] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8801185030d4 [ 368.804899] R13: 0000000000000002 R14: ffff880117001fd8 R15: ffff8801185030d8 [ 368.804916] FS: 0000000000000000(0000) GS:ffff88011fc80000(0000) knlGS:0000000000000000 [ 368.804931] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 368.804946] CR2: 0000000000000000 CR3: 000000000160b000 CR4: 00000000000006e0 [ 368.804962] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 368.804978] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 368.804995] Process kworker/u:3 (pid: 392, threadinfo ffff880117000000, task ffff8801181531c0) [ 368.805009] Stack: [ 368.805017] ffff8801185030d8 0000000000000000 ffffffff8161ddf0 ffffffff81056f7c [ 368.805062] 000000000000b503 ffff8801185030d0 ffff880118503000 0000000000000000 [ 368.805100] ffff8801185030d0 ffff8801188b8000 ffff88008edcb420 ffffffff813583ac [ 368.805135] Call Trace: [ 368.805153] [] ? up+0xb/0x33 [ 368.805168] [] ? mutex_lock+0x16/0x25 [ 368.805194] [] ? smp_execute_task+0x4e/0x222 [libsas] [ 368.805217] [] ? sas_find_bcast_dev+0x3c/0x15d [libsas] [ 368.805240] [] ? sas_find_bcast_dev+0x6f/0x15d [libsas] [ 368.805264] [] ? sas_ex_revalidate_domain+0x37/0x2ec [libsas] [ 368.805280] [] ? printk+0x43/0x48 [ 368.805296] [] ? _raw_spin_unlock_irqrestore+0xc/0xd [ 368.805318] [] ? sas_revalidate_domain+0x85/0xb6 [libsas] [ 368.805336] [] ? process_one_work+0x151/0x27c [ 368.805351] [] ? worker_thread+0xbb/0x152 [ 368.805366] [] ? manage_workers.isra.29+0x163/0x163 [ 368.805382] [] ? kthread+0x79/0x81 [ 368.805399] [] ? kernel_thread_helper+0x4/0x10 [ 368.805416] [] ? kthread_flush_work_fn+0x9/0x9 [ 368.805431] [] ? gs_change+0x13/0x13 [ 368.805442] Code: 83 7d 30 63 7e 04 f3 90 eb ab 4c 8d 63 04 4c 8d 7b 08 4c 89 e7 e8 fa 15 00 00 48 8b 43 10 4c 89 3c 24 48 89 63 10 48 89 44 24 08 <48> 89 20 83 c8 ff 48 89 6c 24 10 87 03 ff c8 74 35 4d 89 ee 41 [ 368.805851] RIP [] __mutex_lock_common.isra.7+0x9c/0x15b [ 368.805877] RSP [ 368.805886] CR2: 0000000000000000 [ 368.805899] ---[ end trace b720682065d8f4cc ]--- It's directly caused by 89d3cf6 [SCSI] libsas: add mutex for SMP task execution, but shows a deeper cause: expander functions expect to be able to cast to and treat domain devices as expanders. The correct fix is to only do expander discover when we know we've got an expander device to avoid wrongly casting a non-expander device. Reported-by: Praveen Murali Tested-by: Praveen Murali Signed-off-by: James Bottomley Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/libsas/sas_discover.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/libsas/sas_discover.c b/drivers/scsi/libsas/sas_discover.c index 62b58d38ce2e..60de66252fa2 100644 --- a/drivers/scsi/libsas/sas_discover.c +++ b/drivers/scsi/libsas/sas_discover.c @@ -500,6 +500,7 @@ static void sas_revalidate_domain(struct work_struct *work) struct sas_discovery_event *ev = to_sas_discovery_event(work); struct asd_sas_port *port = ev->port; struct sas_ha_struct *ha = port->ha; + struct domain_device *ddev = port->port_dev; /* prevent revalidation from finding sata links in recovery */ mutex_lock(&ha->disco_mutex); @@ -514,8 +515,9 @@ static void sas_revalidate_domain(struct work_struct *work) SAS_DPRINTK("REVALIDATING DOMAIN on port %d, pid:%d\n", port->id, task_pid_nr(current)); - if (port->port_dev) - res = sas_ex_revalidate_domain(port->port_dev); + if (ddev && (ddev->dev_type == SAS_FANOUT_EXPANDER_DEVICE || + ddev->dev_type == SAS_EDGE_EXPANDER_DEVICE)) + res = sas_ex_revalidate_domain(ddev); SAS_DPRINTK("done REVALIDATING DOMAIN on port %d, pid:%d, res 0x%x\n", port->id, task_pid_nr(current), res); From 26f7f4d46a2cbfa6fbb633d228ec34cf969589d5 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 9 Mar 2015 23:11:12 +0200 Subject: [PATCH 057/236] pagemap: do not leak physical addresses to non-privileged userspace commit ab676b7d6fbf4b294bf198fb27ade5b0e865c7ce upstream. As pointed by recent post[1] on exploiting DRAM physical imperfection, /proc/PID/pagemap exposes sensitive information which can be used to do attacks. This disallows anybody without CAP_SYS_ADMIN to read the pagemap. [1] http://googleprojectzero.blogspot.com/2015/03/exploiting-dram-rowhammer-bug-to-gain.html [ Eventually we might want to do anything more finegrained, but for now this is the simple model. - Linus ] Signed-off-by: Kirill A. Shutemov Acked-by: Konstantin Khlebnikov Acked-by: Andy Lutomirski Cc: Pavel Emelyanov Cc: Andrew Morton Cc: Mark Seaborn Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/proc/task_mmu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index c4b2646b6d7c..c2546717fc2b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1227,6 +1227,9 @@ out: static int pagemap_open(struct inode *inode, struct file *file) { + /* do not disclose physical addresses: attack vector */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about " "to stop being page-shift some time soon. See the " "linux/Documentation/vm/pagemap.txt for details.\n"); From aa4d0590720f8ddbdff8946ae526b052fecb3f0a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 26 Feb 2015 07:22:05 +0000 Subject: [PATCH 058/236] crypto: arm/aes update NEON AES module to latest OpenSSL version commit 001eabfd54c0cbf9d7d16264ddc8cc0bee67e3ed upstream. This updates the bit sliced AES module to the latest version in the upstream OpenSSL repository (e620e5ae37bc). This is needed to fix a bug in the XTS decryption path, where data chunked in a certain way could trigger the ciphertext stealing code, which is not supposed to be active in the kernel build (The kernel implementation of XTS only supports round multiples of the AES block size of 16 bytes, whereas the conformant OpenSSL implementation of XTS supports inputs of arbitrary size by applying ciphertext stealing). This is fixed in the upstream version by adding the missing #ifndef XTS_CHAIN_TWEAK around the offending instructions. The upstream code also contains the change applied by Russell to build the code unconditionally, i.e., even if __LINUX_ARM_ARCH__ < 7, but implemented slightly differently. Fixes: e4e7f10bfc40 ("ARM: add support for bit sliced AES using NEON instructions") Reported-by: Adrian Kotelba Signed-off-by: Ard Biesheuvel Tested-by: Milan Broz Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- arch/arm/crypto/aesbs-core.S_shipped | 12 ++++++++---- arch/arm/crypto/bsaes-armv7.pl | 12 ++++++++---- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/arch/arm/crypto/aesbs-core.S_shipped b/arch/arm/crypto/aesbs-core.S_shipped index 71e5fc7cfb18..1d1800f71c5b 100644 --- a/arch/arm/crypto/aesbs-core.S_shipped +++ b/arch/arm/crypto/aesbs-core.S_shipped @@ -58,14 +58,18 @@ # define VFP_ABI_FRAME 0 # define BSAES_ASM_EXTENDED_KEY # define XTS_CHAIN_TWEAK -# define __ARM_ARCH__ 7 +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ 7 #endif #ifdef __thumb__ # define adrl adr #endif -#if __ARM_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + .text .syntax unified @ ARMv7-capable assembler is expected to handle this #ifdef __thumb2__ @@ -74,8 +78,6 @@ .code 32 #endif -.fpu neon - .type _bsaes_decrypt8,%function .align 4 _bsaes_decrypt8: @@ -2095,9 +2097,11 @@ bsaes_xts_decrypt: vld1.8 {q8}, [r0] @ initial tweak adr r2, .Lxts_magic +#ifndef XTS_CHAIN_TWEAK tst r9, #0xf @ if not multiple of 16 it ne @ Thumb2 thing, sanity check in ARM subne r9, #0x10 @ subtract another 16 bytes +#endif subs r9, #0x80 blo .Lxts_dec_short diff --git a/arch/arm/crypto/bsaes-armv7.pl b/arch/arm/crypto/bsaes-armv7.pl index be068db960ee..a4d3856e7d24 100644 --- a/arch/arm/crypto/bsaes-armv7.pl +++ b/arch/arm/crypto/bsaes-armv7.pl @@ -701,14 +701,18 @@ $code.=<<___; # define VFP_ABI_FRAME 0 # define BSAES_ASM_EXTENDED_KEY # define XTS_CHAIN_TWEAK -# define __ARM_ARCH__ 7 +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ 7 #endif #ifdef __thumb__ # define adrl adr #endif -#if __ARM_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + .text .syntax unified @ ARMv7-capable assembler is expected to handle this #ifdef __thumb2__ @@ -717,8 +721,6 @@ $code.=<<___; .code 32 #endif -.fpu neon - .type _bsaes_decrypt8,%function .align 4 _bsaes_decrypt8: @@ -2076,9 +2078,11 @@ bsaes_xts_decrypt: vld1.8 {@XMM[8]}, [r0] @ initial tweak adr $magic, .Lxts_magic +#ifndef XTS_CHAIN_TWEAK tst $len, #0xf @ if not multiple of 16 it ne @ Thumb2 thing, sanity check in ARM subne $len, #0x10 @ subtract another 16 bytes +#endif subs $len, #0x80 blo .Lxts_dec_short From e9b15363c10104a0823e5e2d360eab188f2b122f Mon Sep 17 00:00:00 2001 From: Stephan Mueller Date: Thu, 12 Mar 2015 09:17:51 +0100 Subject: [PATCH 059/236] crypto: aesni - fix memory usage in GCM decryption commit ccfe8c3f7e52ae83155cb038753f4c75b774ca8a upstream. The kernel crypto API logic requires the caller to provide the length of (ciphertext || authentication tag) as cryptlen for the AEAD decryption operation. Thus, the cipher implementation must calculate the size of the plaintext output itself and cannot simply use cryptlen. The RFC4106 GCM decryption operation tries to overwrite cryptlen memory in req->dst. As the destination buffer for decryption only needs to hold the plaintext memory but cryptlen references the input buffer holding (ciphertext || authentication tag), the assumption of the destination buffer length in RFC4106 GCM operation leads to a too large size. This patch simply uses the already calculated plaintext size. In addition, this patch fixes the offset calculation of the AAD buffer pointer: as mentioned before, cryptlen already includes the size of the tag. Thus, the tag does not need to be added. With the addition, the AAD will be written beyond the already allocated buffer. Note, this fixes a kernel crash that can be triggered from user space via AF_ALG(aead) -- simply use the libkcapi test application from [1] and update it to use rfc4106-gcm-aes. Using [1], the changes were tested using CAVS vectors to demonstrate that the crypto operation still delivers the right results. [1] http://www.chronox.de/libkcapi.html CC: Tadeusz Struk Signed-off-by: Stephan Mueller Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- arch/x86/crypto/aesni-intel_glue.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 6dfb7d0b139a..6d4fabac779c 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -1109,7 +1109,7 @@ static int __driver_rfc4106_decrypt(struct aead_request *req) src = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC); if (!src) return -ENOMEM; - assoc = (src + req->cryptlen + auth_tag_len); + assoc = (src + req->cryptlen); scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0); scatterwalk_map_and_copy(assoc, req->assoc, 0, req->assoclen, 0); @@ -1134,7 +1134,7 @@ static int __driver_rfc4106_decrypt(struct aead_request *req) scatterwalk_done(&src_sg_walk, 0, 0); scatterwalk_done(&assoc_sg_walk, 0, 0); } else { - scatterwalk_map_and_copy(dst, req->dst, 0, req->cryptlen, 1); + scatterwalk_map_and_copy(dst, req->dst, 0, tempCipherLen, 1); kfree(src); } return retval; From f8ba940346cbd61047effdbe89cd0e13aec18b68 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 13 Mar 2015 09:53:09 +0100 Subject: [PATCH 060/236] x86/fpu: Avoid math_state_restore() without used_math() in __restore_xstate_sig() commit a7c80ebcac3068b1c3cb27d538d29558c30010c8 upstream. math_state_restore() assumes it is called with irqs disabled, but this is not true if the caller is __restore_xstate_sig(). This means that if ia32_fxstate == T and __copy_from_user() fails, __restore_xstate_sig() returns with irqs disabled too. This triggers: BUG: sleeping function called from invalid context at kernel/locking/rwsem.c:41 dump_stack ___might_sleep ? _raw_spin_unlock_irqrestore __might_sleep down_read ? _raw_spin_unlock_irqrestore print_vma_addr signal_fault sys32_rt_sigreturn Change __restore_xstate_sig() to call set_used_math() unconditionally. This avoids enabling and disabling interrupts in math_state_restore(). If copy_from_user() fails, we can simply do fpu_finit() by hand. [ Note: this is only the first step. math_state_restore() should not check used_math(), it should set this flag. While init_fpu() should simply die. ] Signed-off-by: Oleg Nesterov Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Pekka Riikonen Cc: Quentin Casasnovas Cc: Rik van Riel Cc: Suresh Siddha Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150307153844.GB25954@redhat.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/xsave.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index dd50e26c58f6..7a09aca4b33a 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -375,7 +375,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) * thread's fpu state, reconstruct fxstate from the fsave * header. Sanitize the copied state etc. */ - struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave; + struct fpu *fpu = &tsk->thread.fpu; struct user_i387_ia32_struct env; int err = 0; @@ -389,14 +389,15 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) */ drop_fpu(tsk); - if (__copy_from_user(xsave, buf_fx, state_size) || + if (__copy_from_user(&fpu->state->xsave, buf_fx, state_size) || __copy_from_user(&env, buf, sizeof(env))) { + fpu_finit(fpu); err = -1; } else { sanitize_restored_xstate(tsk, &env, xstate_bv, fx_only); - set_used_math(); } + set_used_math(); if (use_eager_fpu()) { preempt_disable(); math_state_restore(); From c16957b5ab6c6bc9767f4843cd4729b63d566703 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 13 Mar 2015 09:53:10 +0100 Subject: [PATCH 061/236] x86/fpu: Drop_fpu() should not assume that tsk equals current commit f4c3686386393c120710dd34df2a74183ab805fd upstream. drop_fpu() does clear_used_math() and usually this is correct because tsk == current. However switch_fpu_finish()->restore_fpu_checking() is called before __switch_to() updates the "current_task" variable. If it fails, we will wrongly clear the PF_USED_MATH flag of the previous task. So use clear_stopped_child_used_math() instead. Signed-off-by: Oleg Nesterov Signed-off-by: Borislav Petkov Reviewed-by: Rik van Riel Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Pekka Riikonen Cc: Quentin Casasnovas Cc: Suresh Siddha Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150309171041.GB11388@redhat.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/fpu-internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index cea1c76d49bf..1ac1c009090d 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -368,7 +368,7 @@ static inline void drop_fpu(struct task_struct *tsk) preempt_disable(); tsk->thread.fpu_counter = 0; __drop_fpu(tsk); - clear_used_math(); + clear_stopped_child_used_math(tsk); preempt_enable(); } From df33f2aeb8963f303f72e891344fde5a7608c3dc Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 5 Mar 2015 09:13:31 +0100 Subject: [PATCH 062/236] x86/vdso: Fix the build on GCC5 commit e893286918d2cde3a94850d8f7101cd1039e0c62 upstream. On gcc5 the kernel does not link: ld: .eh_frame_hdr table[4] FDE at 0000000000000648 overlaps table[5] FDE at 0000000000000670. Because prior GCC versions always emitted NOPs on ALIGN directives, but gcc5 started omitting them. .LSTARTFDEDLSI1 says: /* HACK: The dwarf2 unwind routines will subtract 1 from the return address to get an address in the middle of the presumed call instruction. Since we didn't get here via a call, we need to include the nop before the real start to make up for it. */ .long .LSTART_sigreturn-1-. /* PC-relative start address */ But commit 69d0627a7f6e ("x86 vDSO: reorder vdso32 code") from 2.6.25 replaced .org __kernel_vsyscall+32,0x90 by ALIGN right before __kernel_sigreturn. Of course, ALIGN need not generate any NOP in there. Esp. gcc5 collapses vclock_gettime.o and int80.o together with no generated NOPs as "ALIGN". So fix this by adding to that point at least a single NOP and make the function ALIGN possibly with more NOPs then. Kudos for reporting and diagnosing should go to Richard. Reported-by: Richard Biener Signed-off-by: Jiri Slaby Acked-by: Andy Lutomirski Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1425543211-12542-1-git-send-email-jslaby@suse.cz Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/vdso/vdso32/sigreturn.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/vdso/vdso32/sigreturn.S b/arch/x86/vdso/vdso32/sigreturn.S index 31776d0efc8c..d7ec4e251c0a 100644 --- a/arch/x86/vdso/vdso32/sigreturn.S +++ b/arch/x86/vdso/vdso32/sigreturn.S @@ -17,6 +17,7 @@ .text .globl __kernel_sigreturn .type __kernel_sigreturn,@function + nop /* this guy is needed for .LSTARTFDEDLSI1 below (watch for HACK) */ ALIGN __kernel_sigreturn: .LSTART_sigreturn: From 26cef841410c707ad5f30398fa423524478ac99c Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sat, 21 Feb 2015 21:03:10 +0200 Subject: [PATCH 063/236] ipvs: add missing ip_vs_pe_put in sync code commit 528c943f3bb919aef75ab2fff4f00176f09a4019 upstream. ip_vs_conn_fill_param_sync() gets in param.pe a module reference for persistence engine from __ip_vs_pe_getbyname() but forgets to put it. Problem occurs in backup for sync protocol v1 (2.6.39). Also, pe_data usually comes in sync messages for connection templates and ip_vs_conn_new() copies the pointer only in this case. Make sure pe_data is not leaked if it comes unexpectedly for normal connections. Leak can happen only if bogus messages are sent to backup server. Fixes: fe5e7a1efb66 ("IPVS: Backup, Adding Version 1 receive capability") Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman Signed-off-by: Greg Kroah-Hartman --- net/netfilter/ipvs/ip_vs_sync.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index db801263ee9f..a8027e73b6a2 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -891,6 +891,8 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, IP_VS_DBG(2, "BACKUP, add new conn. failed\n"); return; } + if (!(flags & IP_VS_CONN_F_TEMPLATE)) + kfree(param->pe_data); } if (opt) @@ -1164,6 +1166,7 @@ static inline int ip_vs_proc_sync_conn(struct net *net, __u8 *p, __u8 *msg_end) (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) ); #endif + ip_vs_pe_put(param.pe); return 0; /* Error exit */ out: From 9104f6c68d151932d1fef0cd3cba7280671082d6 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Thu, 18 Dec 2014 22:41:23 +0200 Subject: [PATCH 064/236] ipvs: rerouting to local clients is not needed anymore commit 579eb62ac35845686a7c4286c0a820b4eb1f96aa upstream. commit f5a41847acc5 ("ipvs: move ip_route_me_harder for ICMP") from 2.6.37 introduced ip_route_me_harder() call for responses to local clients, so that we can provide valid rt_src after SNAT. It was used by TCP to provide valid daddr for ip_send_reply(). After commit 0a5ebb8000c5 ("ipv4: Pass explicit daddr arg to ip_send_reply()." from 3.0 this rerouting is not needed anymore and should be avoided, especially in LOCAL_IN. Fixes 3.12.33 crash in xfrm reported by Florian Wiessner: "3.12.33 - BUG xfrm_selector_match+0x25/0x2f6" Reported-by: Smart Weblications GmbH - Florian Wiessner Tested-by: Smart Weblications GmbH - Florian Wiessner Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman Signed-off-by: Greg Kroah-Hartman --- net/netfilter/ipvs/ip_vs_core.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 27d3f40de3cd..847d2a2c5d05 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -658,16 +658,24 @@ static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) return err; } -static int ip_vs_route_me_harder(int af, struct sk_buff *skb) +static int ip_vs_route_me_harder(int af, struct sk_buff *skb, + unsigned int hooknum) { + if (!sysctl_snat_reroute(skb)) + return 0; + /* Reroute replies only to remote clients (FORWARD and LOCAL_OUT) */ + if (NF_INET_LOCAL_IN == hooknum) + return 0; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { - if (sysctl_snat_reroute(skb) && ip6_route_me_harder(skb) != 0) + struct dst_entry *dst = skb_dst(skb); + + if (dst->dev && !(dst->dev->flags & IFF_LOOPBACK) && + ip6_route_me_harder(skb) != 0) return 1; } else #endif - if ((sysctl_snat_reroute(skb) || - skb_rtable(skb)->rt_flags & RTCF_LOCAL) && + if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL) && ip_route_me_harder(skb, RTN_LOCAL) != 0) return 1; @@ -790,7 +798,8 @@ static int handle_response_icmp(int af, struct sk_buff *skb, union nf_inet_addr *snet, __u8 protocol, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, - unsigned int offset, unsigned int ihl) + unsigned int offset, unsigned int ihl, + unsigned int hooknum) { unsigned int verdict = NF_DROP; @@ -820,7 +829,7 @@ static int handle_response_icmp(int af, struct sk_buff *skb, #endif ip_vs_nat_icmp(skb, pp, cp, 1); - if (ip_vs_route_me_harder(af, skb)) + if (ip_vs_route_me_harder(af, skb, hooknum)) goto out; /* do the statistics and put it back */ @@ -915,7 +924,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related, snet.ip = iph->saddr; return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp, - pp, ciph.len, ihl); + pp, ciph.len, ihl, hooknum); } #ifdef CONFIG_IP_VS_IPV6 @@ -980,7 +989,8 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related, snet.in6 = ciph.saddr.in6; writable = ciph.len; return handle_response_icmp(AF_INET6, skb, &snet, ciph.protocol, cp, - pp, writable, sizeof(struct ipv6hdr)); + pp, writable, sizeof(struct ipv6hdr), + hooknum); } #endif @@ -1039,7 +1049,8 @@ static inline bool is_new_conn(const struct sk_buff *skb, */ static unsigned int handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, - struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph, + unsigned int hooknum) { struct ip_vs_protocol *pp = pd->pp; @@ -1077,7 +1088,7 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, * if it came from this machine itself. So re-compute * the routing information. */ - if (ip_vs_route_me_harder(af, skb)) + if (ip_vs_route_me_harder(af, skb, hooknum)) goto drop; IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT"); @@ -1180,7 +1191,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) cp = pp->conn_out_get(af, skb, &iph, 0); if (likely(cp)) - return handle_response(af, skb, pd, cp, &iph); + return handle_response(af, skb, pd, cp, &iph, hooknum); if (sysctl_nat_icmp_send(net) && (pp->protocol == IPPROTO_TCP || pp->protocol == IPPROTO_UDP || From 17ad69a74cea38b94148cb7449d9a6c682d04444 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 12 Feb 2015 22:15:31 +0100 Subject: [PATCH 065/236] netfilter: nft_compat: fix module refcount underflow commit 520aa7414bb590f39d0d1591b06018e60cbc7cf4 upstream. Feb 12 18:20:42 nfdev kernel: ------------[ cut here ]------------ Feb 12 18:20:42 nfdev kernel: WARNING: CPU: 4 PID: 4359 at kernel/module.c:963 module_put+0x9b/0xba() Feb 12 18:20:42 nfdev kernel: CPU: 4 PID: 4359 Comm: ebtables-compat Tainted: G W 3.19.0-rc6+ #43 [...] Feb 12 18:20:42 nfdev kernel: Call Trace: Feb 12 18:20:42 nfdev kernel: [] dump_stack+0x4c/0x65 Feb 12 18:20:42 nfdev kernel: [] warn_slowpath_common+0x9c/0xb6 Feb 12 18:20:42 nfdev kernel: [] ? module_put+0x9b/0xba Feb 12 18:20:42 nfdev kernel: [] warn_slowpath_null+0x15/0x17 Feb 12 18:20:42 nfdev kernel: [] module_put+0x9b/0xba Feb 12 18:20:42 nfdev kernel: [] nft_match_destroy+0x45/0x4c Feb 12 18:20:42 nfdev kernel: [] nf_tables_rule_destroy+0x28/0x70 Reported-by: Arturo Borrero Gonzalez Signed-off-by: Pablo Neira Ayuso Tested-by: Arturo Borrero Gonzalez Signed-off-by: Greg Kroah-Hartman --- net/netfilter/nft_compat.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index ad979612238a..7350723aeb15 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -611,8 +611,12 @@ nft_match_select_ops(const struct nft_ctx *ctx, struct xt_match *match = nft_match->ops.data; if (strcmp(match->name, mt_name) == 0 && - match->revision == rev && match->family == family) + match->revision == rev && match->family == family) { + if (!try_module_get(match->me)) + return ERR_PTR(-ENOENT); + return &nft_match->ops; + } } match = xt_request_find_match(family, mt_name, rev); @@ -682,8 +686,12 @@ nft_target_select_ops(const struct nft_ctx *ctx, struct xt_target *target = nft_target->ops.data; if (strcmp(target->name, tg_name) == 0 && - target->revision == rev && target->family == family) + target->revision == rev && target->family == family) { + if (!try_module_get(target->me)) + return ERR_PTR(-ENOENT); + return &nft_target->ops; + } } target = xt_request_find_target(family, tg_name, rev); From 2e1a691e057140b6b559e885b2876581dc249489 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 15 Feb 2015 19:03:45 -0800 Subject: [PATCH 066/236] netfilter: xt_socket: fix a stack corruption bug commit 78296c97ca1fd3b104f12e1f1fbc06c46635990b upstream. As soon as extract_icmp6_fields() returns, its local storage (automatic variables) is deallocated and can be overwritten. Lets add an additional parameter to make sure storage is valid long enough. While we are at it, adds some const qualifiers. Signed-off-by: Eric Dumazet Fixes: b64c9256a9b76 ("tproxy: added IPv6 support to the socket match") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Greg Kroah-Hartman --- net/netfilter/xt_socket.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 1ba67931eb1b..13332dbf291d 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -243,12 +243,13 @@ static int extract_icmp6_fields(const struct sk_buff *skb, unsigned int outside_hdrlen, int *protocol, - struct in6_addr **raddr, - struct in6_addr **laddr, + const struct in6_addr **raddr, + const struct in6_addr **laddr, __be16 *rport, - __be16 *lport) + __be16 *lport, + struct ipv6hdr *ipv6_var) { - struct ipv6hdr *inside_iph, _inside_iph; + const struct ipv6hdr *inside_iph; struct icmp6hdr *icmph, _icmph; __be16 *ports, _ports[2]; u8 inside_nexthdr; @@ -263,12 +264,14 @@ extract_icmp6_fields(const struct sk_buff *skb, if (icmph->icmp6_type & ICMPV6_INFOMSG_MASK) return 1; - inside_iph = skb_header_pointer(skb, outside_hdrlen + sizeof(_icmph), sizeof(_inside_iph), &_inside_iph); + inside_iph = skb_header_pointer(skb, outside_hdrlen + sizeof(_icmph), + sizeof(*ipv6_var), ipv6_var); if (inside_iph == NULL) return 1; inside_nexthdr = inside_iph->nexthdr; - inside_hdrlen = ipv6_skip_exthdr(skb, outside_hdrlen + sizeof(_icmph) + sizeof(_inside_iph), + inside_hdrlen = ipv6_skip_exthdr(skb, outside_hdrlen + sizeof(_icmph) + + sizeof(*ipv6_var), &inside_nexthdr, &inside_fragoff); if (inside_hdrlen < 0) return 1; /* hjm: Packet has no/incomplete transport layer headers. */ @@ -315,10 +318,10 @@ xt_socket_get_sock_v6(struct net *net, const u8 protocol, static bool socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par) { - struct ipv6hdr *iph = ipv6_hdr(skb); + struct ipv6hdr ipv6_var, *iph = ipv6_hdr(skb); struct udphdr _hdr, *hp = NULL; struct sock *sk = skb->sk; - struct in6_addr *daddr = NULL, *saddr = NULL; + const struct in6_addr *daddr = NULL, *saddr = NULL; __be16 uninitialized_var(dport), uninitialized_var(sport); int thoff = 0, uninitialized_var(tproto); const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo; @@ -342,7 +345,7 @@ socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par) } else if (tproto == IPPROTO_ICMPV6) { if (extract_icmp6_fields(skb, thoff, &tproto, &saddr, &daddr, - &sport, &dport)) + &sport, &dport, &ipv6_var)) return false; } else { return false; From 78b226ff98dcf4f5bd6bb696f3305df19eefdc8f Mon Sep 17 00:00:00 2001 From: "Suzuki K. Poulose" Date: Thu, 19 Mar 2015 18:17:09 +0000 Subject: [PATCH 067/236] arm64: Honor __GFP_ZERO in dma allocations commit 7132813c384515c9dede1ae20e56f3895feb7f1e upstream. Current implementation doesn't zero out the pages allocated. Honor the __GFP_ZERO flag and zero out if set. Cc: # v3.14+ Acked-by: Will Deacon Signed-off-by: Suzuki K. Poulose Signed-off-by: Catalin Marinas Signed-off-by: Greg Kroah-Hartman --- arch/arm64/mm/dma-mapping.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index fbd76785c5db..3974881388bb 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -44,6 +44,7 @@ static void *arm64_swiotlb_alloc_coherent(struct device *dev, size_t size, flags |= GFP_DMA32; if (IS_ENABLED(CONFIG_DMA_CMA)) { struct page *page; + void *addr; size = PAGE_ALIGN(size); page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT, @@ -52,7 +53,10 @@ static void *arm64_swiotlb_alloc_coherent(struct device *dev, size_t size, return NULL; *dma_handle = phys_to_dma(dev, page_to_phys(page)); - return page_address(page); + addr = page_address(page); + if (flags & __GFP_ZERO) + memset(addr, 0, size); + return addr; } else { return swiotlb_alloc_coherent(dev, size, dma_handle, flags); } From 7176ffe49358668b0db56b2994d072e3329693bc Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Tue, 3 Mar 2015 19:58:22 +0100 Subject: [PATCH 068/236] ARM: at91: pm: fix at91rm9200 standby commit 84e871660bebfddb9a62ebd6f19d02536e782f0a upstream. at91rm9200 standby and suspend to ram has been broken since 00482a4078f4. It is wrongly using AT91_BASE_SYS which is a physical address and actually doesn't correspond to any register on at91rm9200. Use the correct at91_ramc_base[0] instead. Fixes: 00482a4078f4 (ARM: at91: implement the standby function for pm/cpuidle) Signed-off-by: Alexandre Belloni Signed-off-by: Nicolas Ferre Signed-off-by: Greg Kroah-Hartman --- arch/arm/mach-at91/pm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mach-at91/pm.h b/arch/arm/mach-at91/pm.h index c5101dcb4fb0..1d4df3b70ebc 100644 --- a/arch/arm/mach-at91/pm.h +++ b/arch/arm/mach-at91/pm.h @@ -45,7 +45,7 @@ static inline void at91rm9200_standby(void) " mcr p15, 0, %0, c7, c0, 4\n\t" " str %5, [%1, %2]" : - : "r" (0), "r" (AT91_BASE_SYS), "r" (AT91RM9200_SDRAMC_LPR), + : "r" (0), "r" (at91_ramc_base[0]), "r" (AT91RM9200_SDRAMC_LPR), "r" (1), "r" (AT91RM9200_SDRAMC_SRR), "r" (lpr)); } From 774df9928fd150d9d7cc0c1924ece04591cb4652 Mon Sep 17 00:00:00 2001 From: Ravikumar Kattekola Date: Sat, 31 Jan 2015 22:36:44 +0530 Subject: [PATCH 069/236] ARM: dts: DRA7x: Fix the bypass clock source for dpll_iva and others commit d2192ea09858a8535b056fcede1a41d824e0b3d8 upstream. Fixes: ee6c750761 (ARM: dts: dra7 clock data) On DRA7x, For DPLL_IVA, the ref clock(CLKINP) is connected to sys_clk1 and the bypass input(CLKINPULOW) is connected to iva_dpll_hs_clk_div clock. But the bypass input is not directly routed to bypass clkout instead both CLKINP and CLKINPULOW are connected to bypass clkout via a mux. This mux is controlled by the bit - CM_CLKSEL_DPLL_IVA[23]:DPLL_BYP_CLKSEL and it's POR value is zero which selects the CLKINP as bypass clkout. which means iva_dpll_hs_clk_div is not the bypass clock for dpll_iva_ck Fix this by adding another mux clock as parent in bypass mode. This design is common to most of the PLLs and the rest have only one bypass clock. Below is a list of the DPLLs that need this fix: DPLL_IVA, DPLL_DDR, DPLL_DSP, DPLL_EVE, DPLL_GMAC, DPLL_PER, DPLL_USB and DPLL_CORE Signed-off-by: Ravikumar Kattekola Acked-by: Tero Kristo Signed-off-by: Tony Lindgren Signed-off-by: Greg Kroah-Hartman --- arch/arm/boot/dts/dra7xx-clocks.dtsi | 90 +++++++++++++++++++++++++--- 1 file changed, 81 insertions(+), 9 deletions(-) diff --git a/arch/arm/boot/dts/dra7xx-clocks.dtsi b/arch/arm/boot/dts/dra7xx-clocks.dtsi index e96da9a898ad..f2512e1d28c7 100644 --- a/arch/arm/boot/dts/dra7xx-clocks.dtsi +++ b/arch/arm/boot/dts/dra7xx-clocks.dtsi @@ -243,10 +243,18 @@ ti,invert-autoidle-bit; }; + dpll_core_byp_mux: dpll_core_byp_mux { + #clock-cells = <0>; + compatible = "ti,mux-clock"; + clocks = <&sys_clkin1>, <&dpll_abe_m3x2_ck>; + ti,bit-shift = <23>; + reg = <0x012c>; + }; + dpll_core_ck: dpll_core_ck { #clock-cells = <0>; compatible = "ti,omap4-dpll-core-clock"; - clocks = <&sys_clkin1>, <&dpll_abe_m3x2_ck>; + clocks = <&sys_clkin1>, <&dpll_core_byp_mux>; reg = <0x0120>, <0x0124>, <0x012c>, <0x0128>; }; @@ -309,10 +317,18 @@ clock-div = <1>; }; + dpll_dsp_byp_mux: dpll_dsp_byp_mux { + #clock-cells = <0>; + compatible = "ti,mux-clock"; + clocks = <&sys_clkin1>, <&dsp_dpll_hs_clk_div>; + ti,bit-shift = <23>; + reg = <0x0240>; + }; + dpll_dsp_ck: dpll_dsp_ck { #clock-cells = <0>; compatible = "ti,omap4-dpll-clock"; - clocks = <&sys_clkin1>, <&dsp_dpll_hs_clk_div>; + clocks = <&sys_clkin1>, <&dpll_dsp_byp_mux>; reg = <0x0234>, <0x0238>, <0x0240>, <0x023c>; }; @@ -335,10 +351,18 @@ clock-div = <1>; }; + dpll_iva_byp_mux: dpll_iva_byp_mux { + #clock-cells = <0>; + compatible = "ti,mux-clock"; + clocks = <&sys_clkin1>, <&iva_dpll_hs_clk_div>; + ti,bit-shift = <23>; + reg = <0x01ac>; + }; + dpll_iva_ck: dpll_iva_ck { #clock-cells = <0>; compatible = "ti,omap4-dpll-clock"; - clocks = <&sys_clkin1>, <&iva_dpll_hs_clk_div>; + clocks = <&sys_clkin1>, <&dpll_iva_byp_mux>; reg = <0x01a0>, <0x01a4>, <0x01ac>, <0x01a8>; }; @@ -361,10 +385,18 @@ clock-div = <1>; }; + dpll_gpu_byp_mux: dpll_gpu_byp_mux { + #clock-cells = <0>; + compatible = "ti,mux-clock"; + clocks = <&sys_clkin1>, <&dpll_abe_m3x2_ck>; + ti,bit-shift = <23>; + reg = <0x02e4>; + }; + dpll_gpu_ck: dpll_gpu_ck { #clock-cells = <0>; compatible = "ti,omap4-dpll-clock"; - clocks = <&sys_clkin1>, <&dpll_abe_m3x2_ck>; + clocks = <&sys_clkin1>, <&dpll_gpu_byp_mux>; reg = <0x02d8>, <0x02dc>, <0x02e4>, <0x02e0>; }; @@ -398,10 +430,18 @@ clock-div = <1>; }; + dpll_ddr_byp_mux: dpll_ddr_byp_mux { + #clock-cells = <0>; + compatible = "ti,mux-clock"; + clocks = <&sys_clkin1>, <&dpll_abe_m3x2_ck>; + ti,bit-shift = <23>; + reg = <0x021c>; + }; + dpll_ddr_ck: dpll_ddr_ck { #clock-cells = <0>; compatible = "ti,omap4-dpll-clock"; - clocks = <&sys_clkin1>, <&dpll_abe_m3x2_ck>; + clocks = <&sys_clkin1>, <&dpll_ddr_byp_mux>; reg = <0x0210>, <0x0214>, <0x021c>, <0x0218>; }; @@ -416,10 +456,18 @@ ti,invert-autoidle-bit; }; + dpll_gmac_byp_mux: dpll_gmac_byp_mux { + #clock-cells = <0>; + compatible = "ti,mux-clock"; + clocks = <&sys_clkin1>, <&dpll_abe_m3x2_ck>; + ti,bit-shift = <23>; + reg = <0x02b4>; + }; + dpll_gmac_ck: dpll_gmac_ck { #clock-cells = <0>; compatible = "ti,omap4-dpll-clock"; - clocks = <&sys_clkin1>, <&dpll_abe_m3x2_ck>; + clocks = <&sys_clkin1>, <&dpll_gmac_byp_mux>; reg = <0x02a8>, <0x02ac>, <0x02b4>, <0x02b0>; }; @@ -482,10 +530,18 @@ clock-div = <1>; }; + dpll_eve_byp_mux: dpll_eve_byp_mux { + #clock-cells = <0>; + compatible = "ti,mux-clock"; + clocks = <&sys_clkin1>, <&eve_dpll_hs_clk_div>; + ti,bit-shift = <23>; + reg = <0x0290>; + }; + dpll_eve_ck: dpll_eve_ck { #clock-cells = <0>; compatible = "ti,omap4-dpll-clock"; - clocks = <&sys_clkin1>, <&eve_dpll_hs_clk_div>; + clocks = <&sys_clkin1>, <&dpll_eve_byp_mux>; reg = <0x0284>, <0x0288>, <0x0290>, <0x028c>; }; @@ -1214,10 +1270,18 @@ clock-div = <1>; }; + dpll_per_byp_mux: dpll_per_byp_mux { + #clock-cells = <0>; + compatible = "ti,mux-clock"; + clocks = <&sys_clkin1>, <&per_dpll_hs_clk_div>; + ti,bit-shift = <23>; + reg = <0x014c>; + }; + dpll_per_ck: dpll_per_ck { #clock-cells = <0>; compatible = "ti,omap4-dpll-clock"; - clocks = <&sys_clkin1>, <&per_dpll_hs_clk_div>; + clocks = <&sys_clkin1>, <&dpll_per_byp_mux>; reg = <0x0140>, <0x0144>, <0x014c>, <0x0148>; }; @@ -1240,10 +1304,18 @@ clock-div = <1>; }; + dpll_usb_byp_mux: dpll_usb_byp_mux { + #clock-cells = <0>; + compatible = "ti,mux-clock"; + clocks = <&sys_clkin1>, <&usb_dpll_hs_clk_div>; + ti,bit-shift = <23>; + reg = <0x018c>; + }; + dpll_usb_ck: dpll_usb_ck { #clock-cells = <0>; compatible = "ti,omap4-dpll-j-type-clock"; - clocks = <&sys_clkin1>, <&usb_dpll_hs_clk_div>; + clocks = <&sys_clkin1>, <&dpll_usb_byp_mux>; reg = <0x0180>, <0x0184>, <0x018c>, <0x0188>; }; From e63b72ed8696d24da05d4f9fd9605030ae6597a7 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 18 Feb 2015 15:33:58 +0100 Subject: [PATCH 070/236] target: Fix reference leak in target_get_sess_cmd() error path commit 7544e597343e2166daba3f32e4708533aa53c233 upstream. This patch fixes a se_cmd->cmd_kref leak buf when se_sess->sess_tearing_down is true within target_get_sess_cmd() submission path code. This se_cmd reference leak can occur during active session shutdown when ack_kref=1 is passed by target_submit_cmd_[map_sgls,tmr]() callers. Signed-off-by: Bart Van Assche Signed-off-by: Nicholas Bellinger Signed-off-by: Greg Kroah-Hartman --- drivers/target/target_core_transport.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index e6463ef33cd2..9e54c0fe718d 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -2327,6 +2327,10 @@ int target_get_sess_cmd(struct se_session *se_sess, struct se_cmd *se_cmd, list_add_tail(&se_cmd->se_cmd_list, &se_sess->sess_cmd_list); out: spin_unlock_irqrestore(&se_sess->sess_cmd_lock, flags); + + if (ret && ack_kref) + target_put_sess_cmd(se_sess, se_cmd); + return ret; } EXPORT_SYMBOL(target_get_sess_cmd); From 6276171b45e2150732a58a5707d9bc0d744927e5 Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Thu, 5 Mar 2015 03:28:24 +0000 Subject: [PATCH 071/236] target: Fix virtual LUN=0 target_configure_device failure OOPs commit 5f7da044f8bc1cfb21c962edf34bd5699a76e7ae upstream. This patch fixes a NULL pointer dereference triggered by a late target_configure_device() -> alloc_workqueue() failure that results in target_free_device() being called with DF_CONFIGURED already set, which subsequently OOPses in destroy_workqueue() code. Currently this only happens at modprobe target_core_mod time when core_dev_setup_virtual_lun0() -> target_configure_device() fails, and the explicit target_free_device() gets called. To address this bug originally introduced by commit 0fd97ccf45, go ahead and move DF_CONFIGURED to end of target_configure_device() code to handle this special failure case. Reported-by: Claudio Fleiner Cc: Claudio Fleiner Cc: Christoph Hellwig Signed-off-by: Nicholas Bellinger Signed-off-by: Greg Kroah-Hartman --- drivers/target/target_core_device.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c index 26ae6886ac59..093b8cb85de7 100644 --- a/drivers/target/target_core_device.c +++ b/drivers/target/target_core_device.c @@ -1591,8 +1591,6 @@ int target_configure_device(struct se_device *dev) ret = dev->transport->configure_device(dev); if (ret) goto out; - dev->dev_flags |= DF_CONFIGURED; - /* * XXX: there is not much point to have two different values here.. */ @@ -1654,6 +1652,8 @@ int target_configure_device(struct se_device *dev) list_add_tail(&dev->g_dev_node, &g_device_list); mutex_unlock(&g_device_mutex); + dev->dev_flags |= DF_CONFIGURED; + return 0; out_free_alua: From bcbd8cdef07b7f452bb795ed80d106609d634431 Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Mon, 23 Feb 2015 00:57:51 -0800 Subject: [PATCH 072/236] iscsi-target: Avoid early conn_logout_comp for iser connections commit f068fbc82e7696d67b1bb8189306865bedf368b6 upstream. This patch fixes a iser specific logout bug where early complete() of conn->conn_logout_comp in iscsit_close_connection() was causing isert_wait4logout() to complete too soon, triggering a use after free NULL pointer dereference of iscsi_conn memory. The complete() was originally added for traditional iscsi-target when a ISCSI_LOGOUT_OP failed in iscsi_target_rx_opcode(), but given iser-target does not wait in logout failure, this special case needs to be avoided. Reported-by: Sagi Grimberg Cc: Sagi Grimberg Cc: Slava Shwartsman Signed-off-by: Nicholas Bellinger Signed-off-by: Greg Kroah-Hartman --- drivers/target/iscsi/iscsi_target.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c index 104f29e6b290..e168a63e77ea 100644 --- a/drivers/target/iscsi/iscsi_target.c +++ b/drivers/target/iscsi/iscsi_target.c @@ -4196,11 +4196,17 @@ int iscsit_close_connection( pr_debug("Closing iSCSI connection CID %hu on SID:" " %u\n", conn->cid, sess->sid); /* - * Always up conn_logout_comp just in case the RX Thread is sleeping - * and the logout response never got sent because the connection - * failed. + * Always up conn_logout_comp for the traditional TCP case just in case + * the RX Thread in iscsi_target_rx_opcode() is sleeping and the logout + * response never got sent because the connection failed. + * + * However for iser-target, isert_wait4logout() is using conn_logout_comp + * to signal logout response TX interrupt completion. Go ahead and skip + * this for iser since isert_rx_opcode() does not wait on logout failure, + * and to avoid iscsi_conn pointer dereference in iser-target code. */ - complete(&conn->conn_logout_comp); + if (conn->conn_transport->transport_type == ISCSI_TCP) + complete(&conn->conn_logout_comp); iscsi_release_thread_set(conn); From d5b825cd6eca3e370760d2d7abf34d531552f31d Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Fri, 27 Feb 2015 03:54:13 -0800 Subject: [PATCH 073/236] target/pscsi: Fix NULL pointer dereference in get_device_type commit 215a8fe4198f607f34ecdbc9969dae783d8b5a61 upstream. This patch fixes a NULL pointer dereference OOPs with pSCSI backends within target_core_stat.c code. The bug is caused by a configfs attr read if no pscsi_dev_virt->pdv_sd has been configured. Reported-by: Olaf Hering Signed-off-by: Nicholas Bellinger Signed-off-by: Greg Kroah-Hartman --- drivers/target/target_core_pscsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 0f199f6a0738..29f28808fc03 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -1111,7 +1111,7 @@ static u32 pscsi_get_device_type(struct se_device *dev) struct pscsi_dev_virt *pdv = PSCSI_DEV(dev); struct scsi_device *sd = pdv->pdv_sd; - return sd->type; + return (sd) ? sd->type : TYPE_NO_LUN; } static sector_t pscsi_get_blocks(struct se_device *dev) From 3b8bcf0a4541ebb0dbac921af866ea6b26b5d753 Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Sun, 14 Dec 2014 01:47:19 -0800 Subject: [PATCH 074/236] target: Fix R_HOLDER bit usage for AllRegistrants commit d16ca7c5198fd668db10d2c7b048ed3359c12c54 upstream. This patch fixes the usage of R_HOLDER bit for an All Registrants reservation in READ_FULL_STATUS, where only the registration who issued RESERVE was being reported as having an active reservation. It changes core_scsi3_pri_read_full_status() to check ahead of the list walk of active registrations to see if All Registrants is active, and if so set R_HOLDER bit and scope/type fields for all active registrations. Reported-by: Ilias Tsitsimpis Cc: James Bottomley Signed-off-by: Nicholas Bellinger Signed-off-by: Greg Kroah-Hartman --- drivers/target/target_core_pr.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/drivers/target/target_core_pr.c b/drivers/target/target_core_pr.c index 0fccdcfd1015..6bec5d1dfbb1 100644 --- a/drivers/target/target_core_pr.c +++ b/drivers/target/target_core_pr.c @@ -3855,7 +3855,8 @@ core_scsi3_pri_read_full_status(struct se_cmd *cmd) unsigned char *buf; u32 add_desc_len = 0, add_len = 0, desc_len, exp_desc_len; u32 off = 8; /* off into first Full Status descriptor */ - int format_code = 0; + int format_code = 0, pr_res_type = 0, pr_res_scope = 0; + bool all_reg = false; if (cmd->data_length < 8) { pr_err("PRIN SA READ_FULL_STATUS SCSI Data Length: %u" @@ -3872,6 +3873,19 @@ core_scsi3_pri_read_full_status(struct se_cmd *cmd) buf[2] = ((dev->t10_pr.pr_generation >> 8) & 0xff); buf[3] = (dev->t10_pr.pr_generation & 0xff); + spin_lock(&dev->dev_reservation_lock); + if (dev->dev_pr_res_holder) { + struct t10_pr_registration *pr_holder = dev->dev_pr_res_holder; + + if (pr_holder->pr_res_type == PR_TYPE_WRITE_EXCLUSIVE_ALLREG || + pr_holder->pr_res_type == PR_TYPE_EXCLUSIVE_ACCESS_ALLREG) { + all_reg = true; + pr_res_type = pr_holder->pr_res_type; + pr_res_scope = pr_holder->pr_res_scope; + } + } + spin_unlock(&dev->dev_reservation_lock); + spin_lock(&pr_tmpl->registration_lock); list_for_each_entry_safe(pr_reg, pr_reg_tmp, &pr_tmpl->registration_list, pr_reg_list) { @@ -3921,14 +3935,20 @@ core_scsi3_pri_read_full_status(struct se_cmd *cmd) * reservation holder for PR_HOLDER bit. * * Also, if this registration is the reservation - * holder, fill in SCOPE and TYPE in the next byte. + * holder or there is an All Registrants reservation + * active, fill in SCOPE and TYPE in the next byte. */ if (pr_reg->pr_res_holder) { buf[off++] |= 0x01; buf[off++] = (pr_reg->pr_res_scope & 0xf0) | (pr_reg->pr_res_type & 0x0f); - } else + } else if (all_reg) { + buf[off++] |= 0x01; + buf[off++] = (pr_res_scope & 0xf0) | + (pr_res_type & 0x0f); + } else { off += 2; + } off += 4; /* Skip over reserved area */ /* From dc366beadc4ef90c86082eb80b6cd772da1f2113 Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Mon, 15 Dec 2014 11:50:26 -0800 Subject: [PATCH 075/236] target: Avoid dropping AllRegistrants reservation during unregister commit 6c3c9baa0debeb4bcc52a78c4463a0a97518de10 upstream. This patch fixes an issue with AllRegistrants reservations where an unregister operation by the I_T nexus reservation holder would incorrectly drop the reservation, instead of waiting until the last active I_T nexus is unregistered as per SPC-4. This includes updating __core_scsi3_complete_pro_release() to reset dev->dev_pr_res_holder with another pr_reg for this special case, as well as a new 'unreg' parameter to determine when the release is occuring from an implicit unregister, vs. explicit RELEASE. It also adds special handling in core_scsi3_free_pr_reg_from_nacl() to release the left-over pr_res_holder, now that pr_reg is deleted from pr_reg_list within __core_scsi3_complete_pro_release(). Reported-by: Ilias Tsitsimpis Cc: James Bottomley Signed-off-by: Nicholas Bellinger Signed-off-by: Greg Kroah-Hartman --- drivers/target/target_core_pr.c | 87 ++++++++++++++++++++++++--------- 1 file changed, 65 insertions(+), 22 deletions(-) diff --git a/drivers/target/target_core_pr.c b/drivers/target/target_core_pr.c index 6bec5d1dfbb1..2aab21baedc1 100644 --- a/drivers/target/target_core_pr.c +++ b/drivers/target/target_core_pr.c @@ -76,7 +76,7 @@ enum preempt_type { }; static void __core_scsi3_complete_pro_release(struct se_device *, struct se_node_acl *, - struct t10_pr_registration *, int); + struct t10_pr_registration *, int, int); static sense_reason_t target_scsi2_reservation_check(struct se_cmd *cmd) @@ -1186,7 +1186,7 @@ static int core_scsi3_check_implicit_release( * service action with the SERVICE ACTION RESERVATION KEY * field set to zero (see 5.7.11.3). */ - __core_scsi3_complete_pro_release(dev, nacl, pr_reg, 0); + __core_scsi3_complete_pro_release(dev, nacl, pr_reg, 0, 1); ret = 1; /* * For 'All Registrants' reservation types, all existing @@ -1228,7 +1228,8 @@ static void __core_scsi3_free_registration( pr_reg->pr_reg_deve->def_pr_registered = 0; pr_reg->pr_reg_deve->pr_res_key = 0; - list_del(&pr_reg->pr_reg_list); + if (!list_empty(&pr_reg->pr_reg_list)) + list_del(&pr_reg->pr_reg_list); /* * Caller accessing *pr_reg using core_scsi3_locate_pr_reg(), * so call core_scsi3_put_pr_reg() to decrement our reference. @@ -1280,6 +1281,7 @@ void core_scsi3_free_pr_reg_from_nacl( { struct t10_reservation *pr_tmpl = &dev->t10_pr; struct t10_pr_registration *pr_reg, *pr_reg_tmp, *pr_res_holder; + bool free_reg = false; /* * If the passed se_node_acl matches the reservation holder, * release the reservation. @@ -1287,13 +1289,18 @@ void core_scsi3_free_pr_reg_from_nacl( spin_lock(&dev->dev_reservation_lock); pr_res_holder = dev->dev_pr_res_holder; if ((pr_res_holder != NULL) && - (pr_res_holder->pr_reg_nacl == nacl)) - __core_scsi3_complete_pro_release(dev, nacl, pr_res_holder, 0); + (pr_res_holder->pr_reg_nacl == nacl)) { + __core_scsi3_complete_pro_release(dev, nacl, pr_res_holder, 0, 1); + free_reg = true; + } spin_unlock(&dev->dev_reservation_lock); /* * Release any registration associated with the struct se_node_acl. */ spin_lock(&pr_tmpl->registration_lock); + if (pr_res_holder && free_reg) + __core_scsi3_free_registration(dev, pr_res_holder, NULL, 0); + list_for_each_entry_safe(pr_reg, pr_reg_tmp, &pr_tmpl->registration_list, pr_reg_list) { @@ -1316,7 +1323,7 @@ void core_scsi3_free_all_registrations( if (pr_res_holder != NULL) { struct se_node_acl *pr_res_nacl = pr_res_holder->pr_reg_nacl; __core_scsi3_complete_pro_release(dev, pr_res_nacl, - pr_res_holder, 0); + pr_res_holder, 0, 0); } spin_unlock(&dev->dev_reservation_lock); @@ -2126,13 +2133,13 @@ core_scsi3_emulate_pro_register(struct se_cmd *cmd, u64 res_key, u64 sa_res_key, /* * sa_res_key=0 Unregister Reservation Key for registered I_T Nexus. */ - pr_holder = core_scsi3_check_implicit_release( - cmd->se_dev, pr_reg); + type = pr_reg->pr_res_type; + pr_holder = core_scsi3_check_implicit_release(cmd->se_dev, + pr_reg); if (pr_holder < 0) { ret = TCM_RESERVATION_CONFLICT; goto out; } - type = pr_reg->pr_res_type; spin_lock(&pr_tmpl->registration_lock); /* @@ -2406,23 +2413,59 @@ static void __core_scsi3_complete_pro_release( struct se_device *dev, struct se_node_acl *se_nacl, struct t10_pr_registration *pr_reg, - int explicit) + int explicit, + int unreg) { struct target_core_fabric_ops *tfo = se_nacl->se_tpg->se_tpg_tfo; char i_buf[PR_REG_ISID_ID_LEN]; + int pr_res_type = 0, pr_res_scope = 0; memset(i_buf, 0, PR_REG_ISID_ID_LEN); core_pr_dump_initiator_port(pr_reg, i_buf, PR_REG_ISID_ID_LEN); /* * Go ahead and release the current PR reservation holder. + * If an All Registrants reservation is currently active and + * a unregister operation is requested, replace the current + * dev_pr_res_holder with another active registration. */ - dev->dev_pr_res_holder = NULL; + if (dev->dev_pr_res_holder) { + pr_res_type = dev->dev_pr_res_holder->pr_res_type; + pr_res_scope = dev->dev_pr_res_holder->pr_res_scope; + dev->dev_pr_res_holder->pr_res_type = 0; + dev->dev_pr_res_holder->pr_res_scope = 0; + dev->dev_pr_res_holder->pr_res_holder = 0; + dev->dev_pr_res_holder = NULL; + } + if (!unreg) + goto out; - pr_debug("SPC-3 PR [%s] Service Action: %s RELEASE cleared" - " reservation holder TYPE: %s ALL_TG_PT: %d\n", - tfo->get_fabric_name(), (explicit) ? "explicit" : "implicit", - core_scsi3_pr_dump_type(pr_reg->pr_res_type), - (pr_reg->pr_reg_all_tg_pt) ? 1 : 0); + spin_lock(&dev->t10_pr.registration_lock); + list_del_init(&pr_reg->pr_reg_list); + /* + * If the I_T nexus is a reservation holder, the persistent reservation + * is of an all registrants type, and the I_T nexus is the last remaining + * registered I_T nexus, then the device server shall also release the + * persistent reservation. + */ + if (!list_empty(&dev->t10_pr.registration_list) && + ((pr_res_type == PR_TYPE_WRITE_EXCLUSIVE_ALLREG) || + (pr_res_type == PR_TYPE_EXCLUSIVE_ACCESS_ALLREG))) { + dev->dev_pr_res_holder = + list_entry(dev->t10_pr.registration_list.next, + struct t10_pr_registration, pr_reg_list); + dev->dev_pr_res_holder->pr_res_type = pr_res_type; + dev->dev_pr_res_holder->pr_res_scope = pr_res_scope; + dev->dev_pr_res_holder->pr_res_holder = 1; + } + spin_unlock(&dev->t10_pr.registration_lock); +out: + if (!dev->dev_pr_res_holder) { + pr_debug("SPC-3 PR [%s] Service Action: %s RELEASE cleared" + " reservation holder TYPE: %s ALL_TG_PT: %d\n", + tfo->get_fabric_name(), (explicit) ? "explicit" : + "implicit", core_scsi3_pr_dump_type(pr_res_type), + (pr_reg->pr_reg_all_tg_pt) ? 1 : 0); + } pr_debug("SPC-3 PR [%s] RELEASE Node: %s%s\n", tfo->get_fabric_name(), se_nacl->initiatorname, i_buf); @@ -2553,7 +2596,7 @@ core_scsi3_emulate_pro_release(struct se_cmd *cmd, int type, int scope, * server shall not establish a unit attention condition. */ __core_scsi3_complete_pro_release(dev, se_sess->se_node_acl, - pr_reg, 1); + pr_reg, 1, 0); spin_unlock(&dev->dev_reservation_lock); @@ -2641,7 +2684,7 @@ core_scsi3_emulate_pro_clear(struct se_cmd *cmd, u64 res_key) if (pr_res_holder) { struct se_node_acl *pr_res_nacl = pr_res_holder->pr_reg_nacl; __core_scsi3_complete_pro_release(dev, pr_res_nacl, - pr_res_holder, 0); + pr_res_holder, 0, 0); } spin_unlock(&dev->dev_reservation_lock); /* @@ -2700,7 +2743,7 @@ static void __core_scsi3_complete_pro_preempt( */ if (dev->dev_pr_res_holder) __core_scsi3_complete_pro_release(dev, nacl, - dev->dev_pr_res_holder, 0); + dev->dev_pr_res_holder, 0, 0); dev->dev_pr_res_holder = pr_reg; pr_reg->pr_res_holder = 1; @@ -2944,8 +2987,8 @@ core_scsi3_pro_preempt(struct se_cmd *cmd, int type, int scope, u64 res_key, */ if (pr_reg_n != pr_res_holder) __core_scsi3_complete_pro_release(dev, - pr_res_holder->pr_reg_nacl, - dev->dev_pr_res_holder, 0); + pr_res_holder->pr_reg_nacl, + dev->dev_pr_res_holder, 0, 0); /* * b) Remove the registrations for all I_T nexuses identified * by the SERVICE ACTION RESERVATION KEY field, except the @@ -3415,7 +3458,7 @@ after_iport_check: * holder (i.e., the I_T nexus on which the */ __core_scsi3_complete_pro_release(dev, pr_res_nacl, - dev->dev_pr_res_holder, 0); + dev->dev_pr_res_holder, 0, 0); /* * g) Move the persistent reservation to the specified I_T nexus using * the same scope and type as the persistent reservation released in From c3d6b5796b22abf78007bd6bb196d1f95a29ac19 Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Fri, 19 Dec 2014 00:49:23 +0000 Subject: [PATCH 076/236] target: Allow AllRegistrants to re-RESERVE existing reservation commit ae450e246e8540300699480a3780a420a028b73f upstream. This patch changes core_scsi3_pro_release() logic to allow an existing AllRegistrants type reservation to be re-reserved by any registered I_T nexus. This addresses a issue where AllRegistrants type RESERVE was receiving RESERVATION_CONFLICT status if dev_pr_res_holder did not match the same I_T nexus, instead of just returning GOOD status following spc4r34 Section 5.9.9: "If the device server receives a PERSISTENT RESERVE OUT command with RESERVE service action where the TYPE field and the SCOPE field contain the same values as the existing type and scope from a persistent reservation holder, it shall not make any change to the existing persistent reservation and shall complete the command with GOOD status." Reported-by: Ilias Tsitsimpis Cc: Ilias Tsitsimpis Cc: Lee Duncan Cc: James Bottomley Signed-off-by: Nicholas Bellinger Signed-off-by: Greg Kroah-Hartman --- drivers/target/target_core_pr.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/target/target_core_pr.c b/drivers/target/target_core_pr.c index 2aab21baedc1..41a6bfaa22d8 100644 --- a/drivers/target/target_core_pr.c +++ b/drivers/target/target_core_pr.c @@ -2297,6 +2297,7 @@ core_scsi3_pro_reserve(struct se_cmd *cmd, int type, int scope, u64 res_key) spin_lock(&dev->dev_reservation_lock); pr_res_holder = dev->dev_pr_res_holder; if (pr_res_holder) { + int pr_res_type = pr_res_holder->pr_res_type; /* * From spc4r17 Section 5.7.9: Reserving: * @@ -2307,7 +2308,9 @@ core_scsi3_pro_reserve(struct se_cmd *cmd, int type, int scope, u64 res_key) * the logical unit, then the command shall be completed with * RESERVATION CONFLICT status. */ - if (pr_res_holder != pr_reg) { + if ((pr_res_holder != pr_reg) && + (pr_res_type != PR_TYPE_WRITE_EXCLUSIVE_ALLREG) && + (pr_res_type != PR_TYPE_EXCLUSIVE_ACCESS_ALLREG)) { struct se_node_acl *pr_res_nacl = pr_res_holder->pr_reg_nacl; pr_err("SPC-3 PR: Attempted RESERVE from" " [%s]: %s while reservation already held by" From 4f1d1adb44d160ba7c14353689b43ce1a107fba0 Mon Sep 17 00:00:00 2001 From: Lee Duncan Date: Mon, 5 Jan 2015 10:49:44 -0800 Subject: [PATCH 077/236] target: Allow Write Exclusive non-reservation holders to READ commit 1ecc7586922662e3ca2f3f0c3f17fec8749fc621 upstream. For PGR reservation of type Write Exclusive Access, allow all non reservation holding I_T nexuses with active registrations to READ from the device. This addresses a bug where active registrations that attempted to READ would result in an reservation conflict. Signed-off-by: Lee Duncan Signed-off-by: Nicholas Bellinger Signed-off-by: Greg Kroah-Hartman --- drivers/target/target_core_pr.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/target/target_core_pr.c b/drivers/target/target_core_pr.c index 41a6bfaa22d8..70cb37516d40 100644 --- a/drivers/target/target_core_pr.c +++ b/drivers/target/target_core_pr.c @@ -528,6 +528,18 @@ static int core_scsi3_pr_seq_non_holder( return 0; } + } else if (we && registered_nexus) { + /* + * Reads are allowed for Write Exclusive locks + * from all registrants. + */ + if (cmd->data_direction == DMA_FROM_DEVICE) { + pr_debug("Allowing READ CDB: 0x%02x for %s" + " reservation\n", cdb[0], + core_scsi3_pr_dump_type(pr_reg_type)); + + return 0; + } } pr_debug("%s Conflict for %sregistered nexus %s CDB: 0x%2x" " for %s reservation\n", transport_dump_cmd_direction(cmd), From bdcec2cf3acd0d9ef24c653b722596f49ef6a040 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 26 Mar 2015 15:07:23 +0100 Subject: [PATCH 078/236] Linux 3.14.37 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 4e6537bd8aa0..c24acc0d34a1 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 3 PATCHLEVEL = 14 -SUBLEVEL = 36 +SUBLEVEL = 37 EXTRAVERSION = NAME = Remembering Coco From d472402d0d7b0d0ca9b4c1e23173669aa4a268cf Mon Sep 17 00:00:00 2001 From: Eric Nelson Date: Fri, 27 Feb 2015 08:06:45 -0700 Subject: [PATCH 079/236] ASoC: sgtl5000: remove useless register write clearing CHRGPUMP_POWERUP commit c7d910b87d3c8e9fcf4077089ca4327c12eee099 upstream. The SGTL5000_CHIP_ANA_POWER register is cached. Update the cached value instead of writing it directly. Patch inspired by Russell King's more colorful remarks in this patch: https://github.com/SolidRun/linux-imx6-3.14/commit/dd4bf6a Signed-off-by: Eric Nelson Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- sound/soc/codecs/sgtl5000.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/sound/soc/codecs/sgtl5000.c b/sound/soc/codecs/sgtl5000.c index 715589ff0eda..e93c36fd3073 100644 --- a/sound/soc/codecs/sgtl5000.c +++ b/sound/soc/codecs/sgtl5000.c @@ -1198,13 +1198,7 @@ static int sgtl5000_set_power_regs(struct snd_soc_codec *codec) /* Enable VDDC charge pump */ ana_pwr |= SGTL5000_VDDC_CHRGPMP_POWERUP; } else if (vddio >= 3100 && vdda >= 3100) { - /* - * if vddio and vddd > 3.1v, - * charge pump should be clean before set ana_pwr - */ - snd_soc_update_bits(codec, SGTL5000_CHIP_ANA_POWER, - SGTL5000_VDDC_CHRGPMP_POWERUP, 0); - + ana_pwr &= ~SGTL5000_VDDC_CHRGPMP_POWERUP; /* VDDC use VDDIO rail */ lreg_ctrl |= SGTL5000_VDDC_ASSN_OVRD; lreg_ctrl |= SGTL5000_VDDC_MAN_ASSN_VDDIO << From 78ba5e59faddd1de7bdae70dcb74c96b6e9a3f3a Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 10 Mar 2015 12:39:07 +0100 Subject: [PATCH 080/236] ASoC: pcm1681: Fix wrong value references for boolean kctl commit d7f58db49d9ad92bdb12d21fdc2308b76bc2ed38 upstream. The correct values referred by a boolean control are value.integer.value[], not value.enumerated.item[]. The former is long while the latter is int, so it's even incompatible on 64bit architectures. Signed-off-by: Takashi Iwai Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- sound/soc/codecs/pcm1681.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/pcm1681.c b/sound/soc/codecs/pcm1681.c index 73f9c3630e2c..651e2fe2c31f 100644 --- a/sound/soc/codecs/pcm1681.c +++ b/sound/soc/codecs/pcm1681.c @@ -118,7 +118,7 @@ static int pcm1681_get_deemph(struct snd_kcontrol *kcontrol, struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); struct pcm1681_private *priv = snd_soc_codec_get_drvdata(codec); - ucontrol->value.enumerated.item[0] = priv->deemph; + ucontrol->value.integer.value[0] = priv->deemph; return 0; } @@ -129,7 +129,7 @@ static int pcm1681_put_deemph(struct snd_kcontrol *kcontrol, struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); struct pcm1681_private *priv = snd_soc_codec_get_drvdata(codec); - priv->deemph = ucontrol->value.enumerated.item[0]; + priv->deemph = ucontrol->value.integer.value[0]; return pcm1681_set_deemph(codec); } From ba2d44c674408ee29a39b4b1705d13bc181a3219 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 10 Mar 2015 12:39:05 +0100 Subject: [PATCH 081/236] ASoC: cs4271: Fix wrong value references for boolean kctl commit e8371aa0fecb73fb8a4b2e0296b025b11e7d6229 upstream. The correct values referred by a boolean control are value.integer.value[], not value.enumerated.item[]. The former is long while the latter is int, so it's even incompatible on 64bit architectures. Signed-off-by: Takashi Iwai Acked-by: Paul Handrigan Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- sound/soc/codecs/cs4271.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/cs4271.c b/sound/soc/codecs/cs4271.c index ce05fd93dc74..a0ad41ac5574 100644 --- a/sound/soc/codecs/cs4271.c +++ b/sound/soc/codecs/cs4271.c @@ -288,7 +288,7 @@ static int cs4271_get_deemph(struct snd_kcontrol *kcontrol, struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); struct cs4271_private *cs4271 = snd_soc_codec_get_drvdata(codec); - ucontrol->value.enumerated.item[0] = cs4271->deemph; + ucontrol->value.integer.value[0] = cs4271->deemph; return 0; } @@ -298,7 +298,7 @@ static int cs4271_put_deemph(struct snd_kcontrol *kcontrol, struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); struct cs4271_private *cs4271 = snd_soc_codec_get_drvdata(codec); - cs4271->deemph = ucontrol->value.enumerated.item[0]; + cs4271->deemph = ucontrol->value.integer.value[0]; return cs4271_set_deemph(codec); } From ece25ec5eea1c0d45689d6b9b814b58b47311549 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 10 Mar 2015 12:39:14 +0100 Subject: [PATCH 082/236] ASoC: wm8960: Fix wrong value references for boolean kctl commit b4a18c8b1af15ebfa9054a3d2aef7b0a7e6f2a05 upstream. The correct values referred by a boolean control are value.integer.value[], not value.enumerated.item[]. The former is long while the latter is int, so it's even incompatible on 64bit architectures. Signed-off-by: Takashi Iwai Acked-by: Charles Keepax Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- sound/soc/codecs/wm8960.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/wm8960.c b/sound/soc/codecs/wm8960.c index 942ef8427347..2a0bfb848512 100644 --- a/sound/soc/codecs/wm8960.c +++ b/sound/soc/codecs/wm8960.c @@ -181,7 +181,7 @@ static int wm8960_get_deemph(struct snd_kcontrol *kcontrol, struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); struct wm8960_priv *wm8960 = snd_soc_codec_get_drvdata(codec); - ucontrol->value.enumerated.item[0] = wm8960->deemph; + ucontrol->value.integer.value[0] = wm8960->deemph; return 0; } @@ -190,7 +190,7 @@ static int wm8960_put_deemph(struct snd_kcontrol *kcontrol, { struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); struct wm8960_priv *wm8960 = snd_soc_codec_get_drvdata(codec); - int deemph = ucontrol->value.enumerated.item[0]; + int deemph = ucontrol->value.integer.value[0]; if (deemph > 1) return -EINVAL; From 3c13625ee6582701422d887952bf410ff5d35191 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 10 Mar 2015 12:39:08 +0100 Subject: [PATCH 083/236] ASoC: tas5086: Fix wrong value references for boolean kctl commit 4c523ef61160b7d478371ddc9f48c8ce0a00d675 upstream. The correct values referred by a boolean control are value.integer.value[], not value.enumerated.item[]. The former is long while the latter is int, so it's even incompatible on 64bit architectures. Signed-off-by: Takashi Iwai Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- sound/soc/codecs/tas5086.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/tas5086.c b/sound/soc/codecs/tas5086.c index a895a5e4bdf2..c6c65001457d 100644 --- a/sound/soc/codecs/tas5086.c +++ b/sound/soc/codecs/tas5086.c @@ -275,7 +275,7 @@ static int tas5086_get_deemph(struct snd_kcontrol *kcontrol, struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); struct tas5086_private *priv = snd_soc_codec_get_drvdata(codec); - ucontrol->value.enumerated.item[0] = priv->deemph; + ucontrol->value.integer.value[0] = priv->deemph; return 0; } @@ -286,7 +286,7 @@ static int tas5086_put_deemph(struct snd_kcontrol *kcontrol, struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); struct tas5086_private *priv = snd_soc_codec_get_drvdata(codec); - priv->deemph = ucontrol->value.enumerated.item[0]; + priv->deemph = ucontrol->value.integer.value[0]; return tas5086_set_deemph(codec); } From 84ae98beacc8085b7015e4597e859de1d744db4d Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 10 Mar 2015 12:39:10 +0100 Subject: [PATCH 084/236] ASoC: wm8731: Fix wrong value references for boolean kctl commit bd14016fbf31aa199026f1e2358eab695f374eb1 upstream. The correct values referred by a boolean control are value.integer.value[], not value.enumerated.item[]. The former is long while the latter is int, so it's even incompatible on 64bit architectures. Signed-off-by: Takashi Iwai Acked-by: Charles Keepax Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- sound/soc/codecs/wm8731.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/wm8731.c b/sound/soc/codecs/wm8731.c index 029720366ff8..e593722574de 100644 --- a/sound/soc/codecs/wm8731.c +++ b/sound/soc/codecs/wm8731.c @@ -122,7 +122,7 @@ static int wm8731_get_deemph(struct snd_kcontrol *kcontrol, struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); struct wm8731_priv *wm8731 = snd_soc_codec_get_drvdata(codec); - ucontrol->value.enumerated.item[0] = wm8731->deemph; + ucontrol->value.integer.value[0] = wm8731->deemph; return 0; } @@ -132,7 +132,7 @@ static int wm8731_put_deemph(struct snd_kcontrol *kcontrol, { struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); struct wm8731_priv *wm8731 = snd_soc_codec_get_drvdata(codec); - int deemph = ucontrol->value.enumerated.item[0]; + int deemph = ucontrol->value.integer.value[0]; int ret = 0; if (deemph > 1) From fa555607cffe1bc30901faa29d38a99ca008a65b Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 10 Mar 2015 12:39:09 +0100 Subject: [PATCH 085/236] ASoC: wm2000: Fix wrong value references for boolean kctl commit 00a14c2968e3d55817e0fa35c78106ca840537bf upstream. The correct values referred by a boolean control are value.integer.value[], not value.enumerated.item[]. The former is long while the latter is int, so it's even incompatible on 64bit architectures. Signed-off-by: Takashi Iwai Acked-by: Charles Keepax Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- sound/soc/codecs/wm2000.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sound/soc/codecs/wm2000.c b/sound/soc/codecs/wm2000.c index 8ae50274ea8f..1a9f4574b65b 100644 --- a/sound/soc/codecs/wm2000.c +++ b/sound/soc/codecs/wm2000.c @@ -610,7 +610,7 @@ static int wm2000_anc_mode_get(struct snd_kcontrol *kcontrol, struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); struct wm2000_priv *wm2000 = dev_get_drvdata(codec->dev); - ucontrol->value.enumerated.item[0] = wm2000->anc_active; + ucontrol->value.integer.value[0] = wm2000->anc_active; return 0; } @@ -620,7 +620,7 @@ static int wm2000_anc_mode_put(struct snd_kcontrol *kcontrol, { struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); struct wm2000_priv *wm2000 = dev_get_drvdata(codec->dev); - int anc_active = ucontrol->value.enumerated.item[0]; + int anc_active = ucontrol->value.integer.value[0]; int ret; if (anc_active > 1) @@ -643,7 +643,7 @@ static int wm2000_speaker_get(struct snd_kcontrol *kcontrol, struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); struct wm2000_priv *wm2000 = dev_get_drvdata(codec->dev); - ucontrol->value.enumerated.item[0] = wm2000->spk_ena; + ucontrol->value.integer.value[0] = wm2000->spk_ena; return 0; } @@ -653,7 +653,7 @@ static int wm2000_speaker_put(struct snd_kcontrol *kcontrol, { struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); struct wm2000_priv *wm2000 = dev_get_drvdata(codec->dev); - int val = ucontrol->value.enumerated.item[0]; + int val = ucontrol->value.integer.value[0]; int ret; if (val > 1) From 25796c801327b7d5ae411fe891ed2e0cc647ff4c Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 10 Mar 2015 12:39:11 +0100 Subject: [PATCH 086/236] ASoC: wm8903: Fix wrong value references for boolean kctl commit 24cc883c1fd16df34211ae41624aa6d3cd906693 upstream. The correct values referred by a boolean control are value.integer.value[], not value.enumerated.item[]. The former is long while the latter is int, so it's even incompatible on 64bit architectures. Signed-off-by: Takashi Iwai Acked-by: Charles Keepax Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- sound/soc/codecs/wm8903.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/wm8903.c b/sound/soc/codecs/wm8903.c index eebcb1da3b7b..ae7d76efe063 100644 --- a/sound/soc/codecs/wm8903.c +++ b/sound/soc/codecs/wm8903.c @@ -442,7 +442,7 @@ static int wm8903_get_deemph(struct snd_kcontrol *kcontrol, struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); struct wm8903_priv *wm8903 = snd_soc_codec_get_drvdata(codec); - ucontrol->value.enumerated.item[0] = wm8903->deemph; + ucontrol->value.integer.value[0] = wm8903->deemph; return 0; } @@ -452,7 +452,7 @@ static int wm8903_put_deemph(struct snd_kcontrol *kcontrol, { struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); struct wm8903_priv *wm8903 = snd_soc_codec_get_drvdata(codec); - int deemph = ucontrol->value.enumerated.item[0]; + int deemph = ucontrol->value.integer.value[0]; int ret = 0; if (deemph > 1) From b1598206a04269555795c6cb3377ccd73574edb3 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 10 Mar 2015 12:39:12 +0100 Subject: [PATCH 087/236] ASoC: wm8904: Fix wrong value references for boolean kctl commit eaddf6fd959074f6a6e71deffe079c71eef35da6 upstream. The correct values referred by a boolean control are value.integer.value[], not value.enumerated.item[]. The former is long while the latter is int, so it's even incompatible on 64bit architectures. Signed-off-by: Takashi Iwai Acked-by: Charles Keepax Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- sound/soc/codecs/wm8904.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/wm8904.c b/sound/soc/codecs/wm8904.c index 53bbfac6a83a..66cb9e95b5eb 100644 --- a/sound/soc/codecs/wm8904.c +++ b/sound/soc/codecs/wm8904.c @@ -523,7 +523,7 @@ static int wm8904_get_deemph(struct snd_kcontrol *kcontrol, struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); struct wm8904_priv *wm8904 = snd_soc_codec_get_drvdata(codec); - ucontrol->value.enumerated.item[0] = wm8904->deemph; + ucontrol->value.integer.value[0] = wm8904->deemph; return 0; } @@ -532,7 +532,7 @@ static int wm8904_put_deemph(struct snd_kcontrol *kcontrol, { struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); struct wm8904_priv *wm8904 = snd_soc_codec_get_drvdata(codec); - int deemph = ucontrol->value.enumerated.item[0]; + int deemph = ucontrol->value.integer.value[0]; if (deemph > 1) return -EINVAL; From d0c4ddb12e4ffb87fa98a0d5eb910784daf60238 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 10 Mar 2015 12:39:04 +0100 Subject: [PATCH 088/236] ASoC: ak4641: Fix wrong value references for boolean kctl commit 08641d9b7bf915144a57a736b42642e13eb1167f upstream. The correct values referred by a boolean control are value.integer.value[], not value.enumerated.item[]. The former is long while the latter is int, so it's even incompatible on 64bit architectures. Signed-off-by: Takashi Iwai Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- sound/soc/codecs/ak4641.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/ak4641.c b/sound/soc/codecs/ak4641.c index 94cbe508dd37..d7184726f8a0 100644 --- a/sound/soc/codecs/ak4641.c +++ b/sound/soc/codecs/ak4641.c @@ -76,7 +76,7 @@ static int ak4641_put_deemph(struct snd_kcontrol *kcontrol, { struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); struct ak4641_priv *ak4641 = snd_soc_codec_get_drvdata(codec); - int deemph = ucontrol->value.enumerated.item[0]; + int deemph = ucontrol->value.integer.value[0]; if (deemph > 1) return -EINVAL; @@ -92,7 +92,7 @@ static int ak4641_get_deemph(struct snd_kcontrol *kcontrol, struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); struct ak4641_priv *ak4641 = snd_soc_codec_get_drvdata(codec); - ucontrol->value.enumerated.item[0] = ak4641->deemph; + ucontrol->value.integer.value[0] = ak4641->deemph; return 0; }; From 2040d501863d449b9f7e5b961a592bc760af81cc Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 10 Mar 2015 12:39:03 +0100 Subject: [PATCH 089/236] ASoC: adav80x: Fix wrong value references for boolean kctl commit 2bf4c1d483d911cda5dd385527194d23e5cea73d upstream. The correct values referred by a boolean control are value.integer.value[], not value.enumerated.item[]. The former is long while the latter is int, so it's even incompatible on 64bit architectures. Signed-off-by: Takashi Iwai Acked-by: Lars-Peter Clausen Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- sound/soc/codecs/adav80x.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/adav80x.c b/sound/soc/codecs/adav80x.c index f78b27a7c461..23454e9a5d3a 100644 --- a/sound/soc/codecs/adav80x.c +++ b/sound/soc/codecs/adav80x.c @@ -319,7 +319,7 @@ static int adav80x_put_deemph(struct snd_kcontrol *kcontrol, { struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); struct adav80x *adav80x = snd_soc_codec_get_drvdata(codec); - unsigned int deemph = ucontrol->value.enumerated.item[0]; + unsigned int deemph = ucontrol->value.integer.value[0]; if (deemph > 1) return -EINVAL; @@ -335,7 +335,7 @@ static int adav80x_get_deemph(struct snd_kcontrol *kcontrol, struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); struct adav80x *adav80x = snd_soc_codec_get_drvdata(codec); - ucontrol->value.enumerated.item[0] = adav80x->deemph; + ucontrol->value.integer.value[0] = adav80x->deemph; return 0; }; From c97bb778a0bd68966df1ce96dccae37722a65beb Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 10 Mar 2015 12:39:13 +0100 Subject: [PATCH 090/236] ASoC: wm8955: Fix wrong value references for boolean kctl commit 07892b10356f17717abdc578acbef72db86c880e upstream. The correct values referred by a boolean control are value.integer.value[], not value.enumerated.item[]. The former is long while the latter is int, so it's even incompatible on 64bit architectures. Signed-off-by: Takashi Iwai Acked-by: Charles Keepax Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- sound/soc/codecs/wm8955.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/wm8955.c b/sound/soc/codecs/wm8955.c index 82c8ba975720..1c1fc6119758 100644 --- a/sound/soc/codecs/wm8955.c +++ b/sound/soc/codecs/wm8955.c @@ -393,7 +393,7 @@ static int wm8955_get_deemph(struct snd_kcontrol *kcontrol, struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); struct wm8955_priv *wm8955 = snd_soc_codec_get_drvdata(codec); - ucontrol->value.enumerated.item[0] = wm8955->deemph; + ucontrol->value.integer.value[0] = wm8955->deemph; return 0; } @@ -402,7 +402,7 @@ static int wm8955_put_deemph(struct snd_kcontrol *kcontrol, { struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); struct wm8955_priv *wm8955 = snd_soc_codec_get_drvdata(codec); - int deemph = ucontrol->value.enumerated.item[0]; + int deemph = ucontrol->value.integer.value[0]; if (deemph > 1) return -EINVAL; From 9bb4c4d2e61649c1590470fb0ac80b448f389405 Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Tue, 22 Apr 2014 22:46:31 +0200 Subject: [PATCH 091/236] ASoC: jz4740: Remove Makefile entry for removed file commit eebdec044e82a4c694b7c1f4996e3ab82d64482c upstream. Commit 0406a40a0 ("ASoC: jz4740: Use the generic dmaengine PCM driver") jz4740-pcm.c file, but neglected to remove the Makefile entries. Fixes: 0406a40a0 ("ASoC: jz4740: Use the generic dmaengine PCM driver") Reported-by: kbuild test robot Reported-by: Ralf Baechle Signed-off-by: Lars-Peter Clausen Signed-off-by: Mark Brown Cc: Markos Chandras Signed-off-by: Greg Kroah-Hartman --- sound/soc/jz4740/Makefile | 2 -- 1 file changed, 2 deletions(-) diff --git a/sound/soc/jz4740/Makefile b/sound/soc/jz4740/Makefile index be873c1b0c20..d32c540555c4 100644 --- a/sound/soc/jz4740/Makefile +++ b/sound/soc/jz4740/Makefile @@ -1,10 +1,8 @@ # # Jz4740 Platform Support # -snd-soc-jz4740-objs := jz4740-pcm.o snd-soc-jz4740-i2s-objs := jz4740-i2s.o -obj-$(CONFIG_SND_JZ4740_SOC) += snd-soc-jz4740.o obj-$(CONFIG_SND_JZ4740_SOC_I2S) += snd-soc-jz4740-i2s.o # Jz4740 Machine Support From 1e529ecd360d27e5fb15782df14d9c5a43ecda13 Mon Sep 17 00:00:00 2001 From: Yongbae Park Date: Tue, 3 Mar 2015 13:05:48 +0900 Subject: [PATCH 092/236] clockevents: sun5i: Fix setup_irq init sequence commit 1096be084ac59927158ce80ff1d31c33eed0e565 upstream. The interrupt is enabled before the handler is set. Even this bug did not appear, it is potentially dangerous as it can lead to a NULL pointer dereference. Fix the error by enabling the interrupt after clockevents_config_and_register() is called. Signed-off-by: Yongbae Park Signed-off-by: Daniel Lezcano Signed-off-by: Greg Kroah-Hartman --- drivers/clocksource/timer-sun5i.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/clocksource/timer-sun5i.c b/drivers/clocksource/timer-sun5i.c index deebcd6469fc..4f0f3610371a 100644 --- a/drivers/clocksource/timer-sun5i.c +++ b/drivers/clocksource/timer-sun5i.c @@ -172,10 +172,6 @@ static void __init sun5i_timer_init(struct device_node *node) ticks_per_jiffy = DIV_ROUND_UP(rate, HZ); - ret = setup_irq(irq, &sun5i_timer_irq); - if (ret) - pr_warn("failed to setup irq %d\n", irq); - /* Enable timer0 interrupt */ val = readl(timer_base + TIMER_IRQ_EN_REG); writel(val | TIMER_IRQ_EN(0), timer_base + TIMER_IRQ_EN_REG); @@ -185,6 +181,10 @@ static void __init sun5i_timer_init(struct device_node *node) clockevents_config_and_register(&sun5i_clockevent, rate, TIMER_SYNC_TICKS, 0xffffffff); + + ret = setup_irq(irq, &sun5i_timer_irq); + if (ret) + pr_warn("failed to setup irq %d\n", irq); } CLOCKSOURCE_OF_DECLARE(sun5i_a13, "allwinner,sun5i-a13-hstimer", sun5i_timer_init); From 0825465c936bcb8feba95d8b40f86dbd716b6623 Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Sat, 7 Mar 2015 17:10:01 +0100 Subject: [PATCH 093/236] regmap: regcache-rbtree: Fix present bitmap resize commit 328f494d95aac8bd4896aea2328bc281053bcb71 upstream. When inserting a new register into a block at the lower end the present bitmap is currently shifted into the wrong direction. The effect of this is that the bitmap becomes corrupted and registers which are present might be reported as not present and vice versa. Fix this by shifting left rather than right. Fixes: 472fdec7380c("regmap: rbtree: Reduce number of nodes, take 2") Reported-by: Daniel Baluta Signed-off-by: Lars-Peter Clausen Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- drivers/base/regmap/regcache-rbtree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/regmap/regcache-rbtree.c b/drivers/base/regmap/regcache-rbtree.c index 930cad4e5df8..2b946bc4212d 100644 --- a/drivers/base/regmap/regcache-rbtree.c +++ b/drivers/base/regmap/regcache-rbtree.c @@ -313,7 +313,7 @@ static int regcache_rbtree_insert_to_block(struct regmap *map, if (pos == 0) { memmove(blk + offset * map->cache_word_size, blk, rbnode->blklen * map->cache_word_size); - bitmap_shift_right(present, present, offset, blklen); + bitmap_shift_left(present, present, offset, blklen); } /* update the rbnode block, its size and the base register */ From c75a45eba48c9cde17c1041db76ba1f7596049f8 Mon Sep 17 00:00:00 2001 From: Yongbae Park Date: Tue, 3 Mar 2015 19:46:49 +0900 Subject: [PATCH 094/236] clocksource: efm32: Fix a NULL pointer dereference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 7b8f10da3bf1056546133c9f54f49ce389fd95ab upstream. The initialisation of the efm32 clocksource first sets up the irq and only after that initialises the data needed for irq handling. In case this initialisation is delayed the irq handler would dereference a NULL pointer. I'm not aware of anything that could delay the process in such a way, but it's better to be safe than sorry, so setup the irq only when the clock event device is ready. Acked-by: Uwe Kleine-König Signed-off-by: Yongbae Park Signed-off-by: Daniel Lezcano Signed-off-by: Greg Kroah-Hartman --- drivers/clocksource/time-efm32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/clocksource/time-efm32.c b/drivers/clocksource/time-efm32.c index 1a6205b7bed3..35b016100673 100644 --- a/drivers/clocksource/time-efm32.c +++ b/drivers/clocksource/time-efm32.c @@ -225,12 +225,12 @@ static int __init efm32_clockevent_init(struct device_node *np) clock_event_ddata.base = base; clock_event_ddata.periodic_top = DIV_ROUND_CLOSEST(rate, 1024 * HZ); - setup_irq(irq, &efm32_clock_event_irq); - clockevents_config_and_register(&clock_event_ddata.evtdev, DIV_ROUND_CLOSEST(rate, 1024), 0xf, 0xffff); + setup_irq(irq, &efm32_clock_event_irq); + return 0; err_get_irq: From 15df865ebb3e20909515287bf3f379db13d1786d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 25 Feb 2015 16:21:03 +0300 Subject: [PATCH 095/236] tcm_fc: missing curly braces in ft_invl_hw_context() commit d556546e7ecd9fca199df4698943024d40044f8e upstream. This patch adds a missing set of conditional check braces in ft_invl_hw_context() originally introduced by commit dcd998ccd when handling DDP failures in ft_recv_write_data() code. commit dcd998ccdbf74a7d8fe0f0a44e85da1ed5975946 Author: Kiran Patil Date: Wed Aug 3 09:20:01 2011 +0000 tcm_fc: Handle DDP/SW fc_frame_payload_get failures in ft_recv_write_data Signed-off-by: Dan Carpenter Cc: Kiran Patil Signed-off-by: Nicholas Bellinger Signed-off-by: Greg Kroah-Hartman --- drivers/target/tcm_fc/tfc_io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/target/tcm_fc/tfc_io.c b/drivers/target/tcm_fc/tfc_io.c index e415af32115a..c67d3795db4a 100644 --- a/drivers/target/tcm_fc/tfc_io.c +++ b/drivers/target/tcm_fc/tfc_io.c @@ -346,7 +346,7 @@ void ft_invl_hw_context(struct ft_cmd *cmd) ep = fc_seq_exch(seq); if (ep) { lport = ep->lp; - if (lport && (ep->xid <= lport->lro_xid)) + if (lport && (ep->xid <= lport->lro_xid)) { /* * "ddp_done" trigger invalidation of HW * specific DDP context @@ -361,6 +361,7 @@ void ft_invl_hw_context(struct ft_cmd *cmd) * identified using ep->xid) */ cmd->was_ddp_setup = 0; + } } } } From 5262307d9f044d0c50106a62348c3794b5c76662 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 19 Mar 2015 22:25:16 -0700 Subject: [PATCH 096/236] tcm_qla2xxx: Fix incorrect use of __transport_register_session commit 75c3d0bf9caebb502e96683b2bc37f9692437e68 upstream. This patch fixes the incorrect use of __transport_register_session() in tcm_qla2xxx_check_initiator_node_acl() code, that does not perform explicit se_tpg->session_lock when accessing se_tpg->tpg_sess_list to add new se_sess nodes. Given that tcm_qla2xxx_check_initiator_node_acl() is not called with qla_hw->hardware_lock held for all accesses of ->tpg_sess_list, the code should be using transport_register_session() instead. Signed-off-by: Bart Van Assche Cc: Giridhar Malavali Cc: Quinn Tran Signed-off-by: Nicholas Bellinger Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/qla2xxx/tcm_qla2xxx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/qla2xxx/tcm_qla2xxx.c b/drivers/scsi/qla2xxx/tcm_qla2xxx.c index 9d81f7693f99..1817f3f2b02d 100644 --- a/drivers/scsi/qla2xxx/tcm_qla2xxx.c +++ b/drivers/scsi/qla2xxx/tcm_qla2xxx.c @@ -1515,7 +1515,7 @@ static int tcm_qla2xxx_check_initiator_node_acl( /* * Finally register the new FC Nexus with TCM */ - __transport_register_session(se_nacl->se_tpg, se_nacl, se_sess, sess); + transport_register_session(se_nacl->se_tpg, se_nacl, se_sess, sess); return 0; } From 25a9b87706ff75ae1ddd5137635961232640db0c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 12 Mar 2015 08:53:27 +0200 Subject: [PATCH 097/236] nl80211: ignore HT/VHT capabilities without QoS/WMM commit 496fcc294daab18799e190c0264863d653588d1f upstream. As HT/VHT depend heavily on QoS/WMM, it's not a good idea to let userspace add clients that have HT/VHT but not QoS/WMM. Since it does so in certain cases we've observed (client is using HT IEs but not QoS/WMM) just ignore the HT/VHT info at this point and don't pass it down to the drivers which might unconditionally use it. Signed-off-by: Johannes Berg Signed-off-by: Greg Kroah-Hartman --- net/wireless/nl80211.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 18d73df72531..c260243dbe07 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4190,6 +4190,16 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) if (parse_station_flags(info, dev->ieee80211_ptr->iftype, ¶ms)) return -EINVAL; + /* HT/VHT requires QoS, but if we don't have that just ignore HT/VHT + * as userspace might just pass through the capabilities from the IEs + * directly, rather than enforcing this restriction and returning an + * error in this case. + */ + if (!(params.sta_flags_set & BIT(NL80211_STA_FLAG_WME))) { + params.ht_capa = NULL; + params.vht_capa = NULL; + } + /* When you run into this, adjust the code below for the new flag */ BUILD_BUG_ON(NL80211_STA_FLAG_MAX != 7); From 3ff96867c55f70d0625aa4fdbbae61d66f488a5c Mon Sep 17 00:00:00 2001 From: Michal Kazior Date: Tue, 10 Feb 2015 12:48:44 +0100 Subject: [PATCH 098/236] mac80211: disable u-APSD queues by default commit aa75ebc275b2a91b193654a177daf900ad6703f0 upstream. Some APs experience problems when working with U-APSD. Decreasing the probability of that happening by using legacy mode for all ACs but VO isn't enough. Cisco 4410N originally forced us to enable VO by default only because it treated non-VO ACs as legacy. However some APs (notably Netgear R7000) silently reclassify packets to different ACs. Since u-APSD ACs require trigger frames for frame retrieval clients would never see some frames (e.g. ARP responses) or would fetch them accidentally after a long time. It makes little sense to enable u-APSD queues by default because it needs userspace applications to be aware of it to actually take advantage of the possible additional powersavings. Implicitly depending on driver autotrigger frame support doesn't make much sense. Signed-off-by: Michal Kazior Signed-off-by: Johannes Berg Signed-off-by: Greg Kroah-Hartman --- net/mac80211/ieee80211_i.h | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index bf7a1bbb975f..e278c64572de 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -57,13 +57,24 @@ struct ieee80211_local; #define IEEE80211_UNSET_POWER_LEVEL INT_MIN /* - * Some APs experience problems when working with U-APSD. Decrease the - * probability of that happening by using legacy mode for all ACs but VO. - * The AP that caused us trouble was a Cisco 4410N. It ignores our - * setting, and always treats non-VO ACs as legacy. + * Some APs experience problems when working with U-APSD. Decreasing the + * probability of that happening by using legacy mode for all ACs but VO isn't + * enough. + * + * Cisco 4410N originally forced us to enable VO by default only because it + * treated non-VO ACs as legacy. + * + * However some APs (notably Netgear R7000) silently reclassify packets to + * different ACs. Since u-APSD ACs require trigger frames for frame retrieval + * clients would never see some frames (e.g. ARP responses) or would fetch them + * accidentally after a long time. + * + * It makes little sense to enable u-APSD queues by default because it needs + * userspace applications to be aware of it to actually take advantage of the + * possible additional powersavings. Implicitly depending on driver autotrigger + * frame support doesn't make much sense. */ -#define IEEE80211_DEFAULT_UAPSD_QUEUES \ - IEEE80211_WMM_IE_STA_QOSINFO_AC_VO +#define IEEE80211_DEFAULT_UAPSD_QUEUES 0 #define IEEE80211_DEFAULT_MAX_SP_LEN \ IEEE80211_WMM_IE_STA_QOSINFO_SP_ALL From 21fa7aacaeab9ddf6147b972ae52622424c57874 Mon Sep 17 00:00:00 2001 From: Bob Copeland Date: Mon, 2 Mar 2015 14:28:52 -0500 Subject: [PATCH 099/236] mac80211: drop unencrypted frames in mesh fwding commit d0c22119f574b851e63360c6b8660fe9593bbc3c upstream. The mesh forwarding path was not checking that data frames were protected when running an encrypted network; add the necessary check. Reported-by: Johannes Berg Signed-off-by: Bob Copeland Signed-off-by: Johannes Berg Signed-off-by: Greg Kroah-Hartman --- net/mac80211/rx.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 815ca56b39e7..9abb445ea261 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2107,6 +2107,9 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) hdr = (struct ieee80211_hdr *) skb->data; mesh_hdr = (struct ieee80211s_hdr *) (skb->data + hdrlen); + if (ieee80211_drop_unencrypted(rx, hdr->frame_control)) + return RX_DROP_MONITOR; + /* frame is in RMC, don't forward */ if (ieee80211_is_data(hdr->frame_control) && is_multicast_ether_addr(hdr->addr1) && From d3f483f0a6728e08625dc43a73b69fda30117571 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 6 Apr 2015 12:18:59 +0200 Subject: [PATCH 100/236] Revert "iwlwifi: mvm: fix failure path when power_update fails in add_interface" This reverts commit 5ac97a1b97e50ef197b64b72c1cd71779e4ecbbe It was incorrectly applied, as it merged with fuzz. Reported-by: Jiri Slaby Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org --- drivers/net/wireless/iwlwifi/mvm/mac80211.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/iwlwifi/mvm/mac80211.c index a04174607e97..4a3b8b72c8af 100644 --- a/drivers/net/wireless/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/iwlwifi/mvm/mac80211.c @@ -594,7 +594,7 @@ static int iwl_mvm_mac_add_interface(struct ieee80211_hw *hw, ret = iwl_mvm_mac_ctxt_add(mvm, vif); if (ret) - goto out_remove_mac; + goto out_release; iwl_mvm_power_disable(mvm, vif); From 9e8da08d94891046fe64fc4c8f4c091549975e74 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 25 Feb 2015 16:16:29 +0100 Subject: [PATCH 101/236] phy: Find the right match in devm_phy_destroy() commit 2f1bce487cd0a02623cff3d877940f9a2026341c upstream. devm_phy_create() stores the pointer to the new PHY at the address returned by devres_alloc(). The res parameter passed to devm_phy_match() is therefore the location where the pointer to the PHY is stored, hence it needs to be dereferenced before comparing to the match data in order to find the correct match. Signed-off-by: Thierry Reding Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Greg Kroah-Hartman --- drivers/phy/phy-core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/phy/phy-core.c b/drivers/phy/phy-core.c index 34d56f7864d6..86592dd24da5 100644 --- a/drivers/phy/phy-core.c +++ b/drivers/phy/phy-core.c @@ -50,7 +50,9 @@ static void devm_phy_consume(struct device *dev, void *res) static int devm_phy_match(struct device *dev, void *res, void *match_data) { - return res == match_data; + struct phy **phy = res; + + return *phy == match_data; } static struct phy *phy_lookup(struct device *device, const char *port) From 7dc9ca26d28f7aa4f58db7e0b824ea01a3009ff1 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Wed, 18 Mar 2015 00:21:32 +0200 Subject: [PATCH 102/236] of/irq: Fix of_irq_parse_one() returned error codes commit d7c146053dd195b90c79b9b8131431f44541d015 upstream. The error code paths that require cleanup use a goto to jump to the cleanup code and return an error code. However, the error code variable res, which is initialized to -EINVAL when declared, is then overwritten with the return value of of_parse_phandle_with_args(), and reused as the return code from of_irq_parse_one(). This leads to an undetermined error being returned instead of the expected -EINVAL value. Fix it. Signed-off-by: Laurent Pinchart Signed-off-by: Rob Herring Signed-off-by: Greg Kroah-Hartman --- drivers/of/irq.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/of/irq.c b/drivers/of/irq.c index 48f20ff1add9..bbff99dcbaea 100644 --- a/drivers/of/irq.c +++ b/drivers/of/irq.c @@ -290,7 +290,7 @@ int of_irq_parse_one(struct device_node *device, int index, struct of_phandle_ar struct device_node *p; const __be32 *intspec, *tmp, *addr; u32 intsize, intlen; - int i, res = -EINVAL; + int i, res; pr_debug("of_irq_parse_one: dev=%s, index=%d\n", of_node_full_name(device), index); @@ -323,15 +323,19 @@ int of_irq_parse_one(struct device_node *device, int index, struct of_phandle_ar /* Get size of interrupt specifier */ tmp = of_get_property(p, "#interrupt-cells", NULL); - if (tmp == NULL) + if (tmp == NULL) { + res = -EINVAL; goto out; + } intsize = be32_to_cpu(*tmp); pr_debug(" intsize=%d intlen=%d\n", intsize, intlen); /* Check index */ - if ((index + 1) * intsize > intlen) + if ((index + 1) * intsize > intlen) { + res = -EINVAL; goto out; + } /* Copy intspec into irq structure */ intspec += index * intsize; From 90fbc7e1b94f2a033220f1b3e7652762117707d5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Feb 2015 18:03:11 +0100 Subject: [PATCH 103/236] perf: Fix irq_work 'tail' recursion commit d525211f9d1be8b523ec7633f080f2116f5ea536 upstream. Vince reported a watchdog lockup like: [] perf_tp_event+0xc4/0x210 [] perf_trace_lock+0x12a/0x160 [] lock_release+0x130/0x260 [] _raw_spin_unlock_irqrestore+0x24/0x40 [] do_send_sig_info+0x5d/0x80 [] send_sigio_to_task+0x12f/0x1a0 [] send_sigio+0xae/0x100 [] kill_fasync+0x97/0xf0 [] perf_event_wakeup+0xd4/0xf0 [] perf_pending_event+0x33/0x60 [] irq_work_run_list+0x4c/0x80 [] irq_work_run+0x18/0x40 [] smp_trace_irq_work_interrupt+0x3f/0xc0 [] trace_irq_work_interrupt+0x6d/0x80 Which is caused by an irq_work generating new irq_work and therefore not allowing forward progress. This happens because processing the perf irq_work triggers another perf event (tracepoint stuff) which in turn generates an irq_work ad infinitum. Avoid this by raising the recursion counter in the irq_work -- which effectively disables all software events (including tracepoints) from actually triggering again. Reported-by: Vince Weaver Tested-by: Vince Weaver Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Paul Mackerras Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20150219170311.GH21418@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- kernel/events/core.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/events/core.c b/kernel/events/core.c index 69cffb46db17..60146febb9b3 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4232,6 +4232,13 @@ static void perf_pending_event(struct irq_work *entry) { struct perf_event *event = container_of(entry, struct perf_event, pending); + int rctx; + + rctx = perf_swevent_get_recursion_context(); + /* + * If we 'fail' here, that's OK, it means recursion is already disabled + * and we won't recurse 'further'. + */ if (event->pending_disable) { event->pending_disable = 0; @@ -4242,6 +4249,9 @@ static void perf_pending_event(struct irq_work *entry) event->pending_wakeup = 0; perf_event_wakeup(event); } + + if (rctx >= 0) + perf_swevent_put_recursion_context(rctx); } /* From c1cc48adbfa3f864cf0667c574c2f6700045fe11 Mon Sep 17 00:00:00 2001 From: Malcolm Priestley Date: Sat, 7 Mar 2015 16:36:37 +0000 Subject: [PATCH 104/236] staging: vt6656: vnt_rf_setpower: fix missing rate RATE_12M commit 163fe301b9f78b6de57d0014eafe504fd20c0cd4 upstream. When the driver sets this rate a power of zero value is set causing data flow stoppage until another rate is tried. Signed-off-by: Malcolm Priestley Signed-off-by: Greg Kroah-Hartman --- drivers/staging/vt6656/rf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/staging/vt6656/rf.c b/drivers/staging/vt6656/rf.c index 1e8f64bff03c..2dc48d4ff433 100644 --- a/drivers/staging/vt6656/rf.c +++ b/drivers/staging/vt6656/rf.c @@ -752,6 +752,7 @@ int RFbSetPower(struct vnt_private *priv, u32 rate, u32 channel) break; case RATE_6M: case RATE_9M: + case RATE_12M: case RATE_18M: case RATE_24M: case RATE_36M: From 5228d667508eeb76d2a34d82582dd6e2e7d147b0 Mon Sep 17 00:00:00 2001 From: Malcolm Priestley Date: Sat, 7 Mar 2015 17:04:54 +0000 Subject: [PATCH 105/236] vt6655: RFbSetPower fix missing rate RATE_12M commit 40c8790bcb7ac74f3038153cd09310e220c6a1df upstream. When the driver sets this rate a power of zero value is set causing data flow stoppage until another rate is tried. Signed-off-by: Malcolm Priestley Signed-off-by: Greg Kroah-Hartman --- drivers/staging/vt6655/rf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/staging/vt6655/rf.c b/drivers/staging/vt6655/rf.c index edb1b2768b17..dbd9d44919ac 100644 --- a/drivers/staging/vt6655/rf.c +++ b/drivers/staging/vt6655/rf.c @@ -936,6 +936,7 @@ bool RFbSetPower( break; case RATE_6M: case RATE_9M: + case RATE_12M: case RATE_18M: byPwr = pDevice->abyOFDMPwrTbl[uCH]; if (pDevice->byRFType == RF_UW2452) { From 1be347b870e07a0cff57bcba2bbf5f538c68d6a4 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 9 Mar 2015 12:16:42 +0200 Subject: [PATCH 106/236] dmaengine: dw: append MODULE_ALIAS for platform driver commit a104a45ba7a51b5b4c5e8437020d9d48edf22f89 upstream. The commit 9cade1a46c77 (dma: dw: split driver to library part and platform code) introduced a separate platform driver but missed to add a MODULE_ALIAS("platform:dw_dmac"); to that module. The patch adds this to get driver loaded automatically if platform device is registered. Reported-by: "Blin, Jerome" Fixes: 9cade1a46c77 (dma: dw: split driver to library part and platform code) Signed-off-by: Andy Shevchenko Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/dw/platform.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/dma/dw/platform.c b/drivers/dma/dw/platform.c index 453822cc4f9d..fe8b0c991518 100644 --- a/drivers/dma/dw/platform.c +++ b/drivers/dma/dw/platform.c @@ -48,6 +48,8 @@ static bool dw_dma_of_filter(struct dma_chan *chan, void *param) return true; } +#define DRV_NAME "dw_dmac" + static struct dma_chan *dw_dma_of_xlate(struct of_phandle_args *dma_spec, struct of_dma *ofdma) { @@ -293,7 +295,7 @@ static struct platform_driver dw_driver = { .remove = dw_remove, .shutdown = dw_shutdown, .driver = { - .name = "dw_dmac", + .name = DRV_NAME, .pm = &dw_dev_pm_ops, .of_match_table = of_match_ptr(dw_dma_of_id_table), .acpi_match_table = ACPI_PTR(dw_dma_acpi_id_table), @@ -314,3 +316,4 @@ module_exit(dw_exit); MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("Synopsys DesignWare DMA Controller platform driver"); +MODULE_ALIAS("platform:" DRV_NAME); From efad4bcc37b3d5f528f0a118b4dbaf5079e0c2f2 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 27 Feb 2015 14:04:27 -0500 Subject: [PATCH 107/236] dm: hold suspend_lock while suspending device during device deletion commit ab7c7bb6f4ab95dbca96fcfc4463cd69843e3e24 upstream. __dm_destroy() must take the suspend_lock so that its presuspend and postsuspend calls do not race with an internal suspend. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Signed-off-by: Greg Kroah-Hartman --- drivers/md/dm.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 1582c3dac3ac..e40059143272 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2352,10 +2352,16 @@ static void __dm_destroy(struct mapped_device *md, bool wait) set_bit(DMF_FREEING, &md->flags); spin_unlock(&_minor_lock); + /* + * Take suspend_lock so that presuspend and postsuspend methods + * do not race with internal suspend. + */ + mutex_lock(&md->suspend_lock); if (!dm_suspended_md(md)) { dm_table_presuspend_targets(map); dm_table_postsuspend_targets(map); } + mutex_unlock(&md->suspend_lock); /* dm_put_live_table must be before msleep, otherwise deadlock is possible */ dm_put_live_table(md, srcu_idx); From f4e6c2337d11a4b8df1a3a9ef5d670839441f772 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 27 Feb 2015 10:44:38 -0800 Subject: [PATCH 108/236] dm io: deal with wandering queue limits when handling REQ_DISCARD and REQ_WRITE_SAME commit e5db29806b99ce2b2640d2e4d4fcb983cea115c5 upstream. Since it's possible for the discard and write same queue limits to change while the upper level command is being sliced and diced, fix up both of them (a) to reject IO if the special command is unsupported at the start of the function and (b) read the limits once and let the commands error out on their own if the status happens to change. Signed-off-by: Darrick J. Wong Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Signed-off-by: Greg Kroah-Hartman --- drivers/md/dm-io.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index d2a8d64f8526..080e767250d3 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -291,9 +291,16 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, struct request_queue *q = bdev_get_queue(where->bdev); unsigned short logical_block_size = queue_logical_block_size(q); sector_t num_sectors; + unsigned int uninitialized_var(special_cmd_max_sectors); - /* Reject unsupported discard requests */ - if ((rw & REQ_DISCARD) && !blk_queue_discard(q)) { + /* + * Reject unsupported discard and write same requests. + */ + if (rw & REQ_DISCARD) + special_cmd_max_sectors = q->limits.max_discard_sectors; + else if (rw & REQ_WRITE_SAME) + special_cmd_max_sectors = q->limits.max_write_same_sectors; + if ((rw & (REQ_DISCARD | REQ_WRITE_SAME)) && special_cmd_max_sectors == 0) { dec_count(io, region, -EOPNOTSUPP); return; } @@ -319,7 +326,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, store_io_and_region_in_bio(bio, io, region); if (rw & REQ_DISCARD) { - num_sectors = min_t(sector_t, q->limits.max_discard_sectors, remaining); + num_sectors = min_t(sector_t, special_cmd_max_sectors, remaining); bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT; remaining -= num_sectors; } else if (rw & REQ_WRITE_SAME) { @@ -328,7 +335,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, */ dp->get_page(dp, &page, &len, &offset); bio_add_page(bio, page, logical_block_size, offset); - num_sectors = min_t(sector_t, q->limits.max_write_same_sectors, remaining); + num_sectors = min_t(sector_t, special_cmd_max_sectors, remaining); bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT; offset = 0; From 4172dbbeb2d36551ac94ede5959000100d2bada1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 18 Mar 2015 11:27:28 +0100 Subject: [PATCH 109/236] spi: trigger trace event for message-done before mesg->complete MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 391949b6f02121371e3d7d9082c6d17fd9853034 upstream. With spidev the mesg->complete callback points to spidev_complete. Calling this unblocks spidev_sync and so spidev_sync_write finishes. As the struct spi_message just read is a local variable in spidev_sync_write and recording the trace event accesses this message the recording is better done first. The same can happen for spidev_sync_read. This fixes an oops observed on a 3.14-rt system with spidev activity after echo 1 > /sys/kernel/debug/tracing/events/spi/enable . Signed-off-by: Uwe Kleine-König Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- drivers/spi/spi.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index fbf3b22efe5a..d6563ec700d4 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -846,13 +846,14 @@ void spi_finalize_current_message(struct spi_master *master) "failed to unprepare message: %d\n", ret); } } + + trace_spi_message_done(mesg); + master->cur_msg_prepared = false; mesg->state = NULL; if (mesg->complete) mesg->complete(mesg->context); - - trace_spi_message_done(mesg); } EXPORT_SYMBOL_GPL(spi_finalize_current_message); From 4bcbae54813fc518f4a741e871a96dd5f23ba863 Mon Sep 17 00:00:00 2001 From: Sergei Antonov Date: Wed, 25 Mar 2015 15:55:34 -0700 Subject: [PATCH 110/236] hfsplus: fix B-tree corruption after insertion at position 0 commit 98cf21c61a7f5419d82f847c4d77bf6e96a76f5f upstream. Fix B-tree corruption when a new record is inserted at position 0 in the node in hfs_brec_insert(). In this case a hfs_brec_update_parent() is called to update the parent index node (if exists) and it is passed hfs_find_data with a search_key containing a newly inserted key instead of the key to be updated. This results in an inconsistent index node. The bug reproduces on my machine after an extents overflow record for the catalog file (CNID=4) is inserted into the extents overflow B-tree. Because of a low (reserved) value of CNID=4, it has to become the first record in the first leaf node. The resulting first leaf node is correct: ---------------------------------------------------- | key0.CNID=4 | key1.CNID=123 | key2.CNID=456, ... | ---------------------------------------------------- But the parent index key0 still contains the previous key CNID=123: ----------------------- | key0.CNID=123 | ... | ----------------------- A change in hfs_brec_insert() makes hfs_brec_update_parent() work correctly by preventing it from getting fd->record=-1 value from __hfs_brec_find(). Along the way, I removed duplicate code with unification of the if condition. The resulting code is equivalent to the original code because node is never 0. Also hfs_brec_update_parent() will now return an error after getting a negative fd->record value. However, the return value of hfs_brec_update_parent() is not checked anywhere in the file and I'm leaving it unchanged by this patch. brec.c lacks error checking after some other calls too, but this issue is of less importance than the one being fixed by this patch. Signed-off-by: Sergei Antonov Cc: Joe Perches Reviewed-by: Vyacheslav Dubeyko Acked-by: Hin-Tak Leung Cc: Anton Altaparmakov Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/hfsplus/brec.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c index 6e560d56094b..754fdf8c6356 100644 --- a/fs/hfsplus/brec.c +++ b/fs/hfsplus/brec.c @@ -131,13 +131,16 @@ skip: hfs_bnode_write(node, entry, data_off + key_len, entry_len); hfs_bnode_dump(node); - if (new_node) { - /* update parent key if we inserted a key - * at the start of the first node - */ - if (!rec && new_node != node) - hfs_brec_update_parent(fd); + /* + * update parent key if we inserted a key + * at the start of the node and it is not the new node + */ + if (!rec && new_node != node) { + hfs_bnode_read_key(node, fd->search_key, data_off + size); + hfs_brec_update_parent(fd); + } + if (new_node) { hfs_bnode_put(fd->bnode); if (!new_node->parent) { hfs_btree_inc_height(tree); @@ -168,9 +171,6 @@ skip: goto again; } - if (!rec) - hfs_brec_update_parent(fd); - return 0; } @@ -370,6 +370,8 @@ again: if (IS_ERR(parent)) return PTR_ERR(parent); __hfs_brec_find(parent, fd, hfs_find_rec_by_key); + if (fd->record < 0) + return -ENOENT; hfs_bnode_dump(parent); rec = fd->record; From 31e970605408a0656d90cca2f2a727ae8eb3c3fe Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Tue, 17 Mar 2015 16:14:41 +0530 Subject: [PATCH 111/236] powerpc/book3s: Fix the MCE code to use CONFIG_KVM_BOOK3S_64_HANDLER commit 44d5f6f5901e996744858c175baee320ccf1eda3 upstream. commit id 2ba9f0d has changed CONFIG_KVM_BOOK3S_64_HV to tristate to allow HV/PR bits to be built as modules. But the MCE code still depends on CONFIG_KVM_BOOK3S_64_HV which is wrong. When user selects CONFIG_KVM_BOOK3S_64_HV=m to build HV/PR bits as a separate module the relevant MCE code gets excluded. This patch fixes the MCE code to use CONFIG_KVM_BOOK3S_64_HANDLER. This makes sure that the relevant MCE code is included when HV/PR bits are built as a separate modules. Fixes: 2ba9f0d88750 ("kvm: powerpc: book3s: Support building HV and PR KVM as module") Signed-off-by: Mahesh Salgaonkar Acked-by: Paul Mackerras Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kernel/exceptions-64s.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 38d507306a11..5193116eadc0 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1422,7 +1422,7 @@ machine_check_handle_early: bne 9f /* continue in V mode if we are. */ 5: -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_64_HANDLER /* * We are coming from kernel context. Check if we are coming from * guest. if yes, then we can continue. We will fall through From 7edf68cf19a58342ef270621438a5e8c1045c34b Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 23 Mar 2015 15:06:50 +0000 Subject: [PATCH 112/236] arm64: Use the reserved TTBR0 if context switching to the init_mm commit e53f21bce4d35a93b23d8fa1a840860f6c74f59e upstream. The idle_task_exit() function may call switch_mm() with next == &init_mm. On arm64, init_mm.pgd cannot be used for user mappings, so this patch simply sets the reserved TTBR0. Reported-by: Jon Medhurst (Tixy) Tested-by: Jon Medhurst (Tixy) Signed-off-by: Catalin Marinas Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/mmu_context.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index a9eee33dfa62..101a42bde728 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -151,6 +151,15 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next, { unsigned int cpu = smp_processor_id(); + /* + * init_mm.pgd does not contain any user mappings and it is always + * active for kernel addresses in TTBR1. Just set the reserved TTBR0. + */ + if (next == &init_mm) { + cpu_set_reserved_ttbr0(); + return; + } + if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next)) || prev != next) check_and_switch_context(next, tsk); } From 5bd657f3ac9ef3b8d9e4034675cd7d60a7cb57ef Mon Sep 17 00:00:00 2001 From: Tyrel Datwyler Date: Wed, 4 Mar 2015 11:59:33 -0800 Subject: [PATCH 113/236] powerpc/pseries: Little endian fixes for post mobility device tree update commit f6ff04149637723261aa4738958b0098b929ee9e upstream. We currently use the device tree update code in the kernel after resuming from a suspend operation to re-sync the kernels view of the device tree with that of the hypervisor. The code as it stands is not endian safe as it relies on parsing buffers returned by RTAS calls that thusly contains data in big endian format. This patch annotates variables and structure members with __be types as well as performing necessary byte swaps to cpu endian for data that needs to be parsed. Signed-off-by: Tyrel Datwyler Cc: Nathan Fontenot Cc: Cyril Bur Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/platforms/pseries/mobility.c | 44 ++++++++++++----------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index cde4e0a095ae..bf3829242aff 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -24,10 +24,10 @@ static struct kobject *mobility_kobj; struct update_props_workarea { - u32 phandle; - u32 state; - u64 reserved; - u32 nprops; + __be32 phandle; + __be32 state; + __be64 reserved; + __be32 nprops; } __packed; #define NODE_ACTION_MASK 0xff000000 @@ -53,11 +53,11 @@ static int mobility_rtas_call(int token, char *buf, s32 scope) return rc; } -static int delete_dt_node(u32 phandle) +static int delete_dt_node(__be32 phandle) { struct device_node *dn; - dn = of_find_node_by_phandle(phandle); + dn = of_find_node_by_phandle(be32_to_cpu(phandle)); if (!dn) return -ENOENT; @@ -126,7 +126,7 @@ static int update_dt_property(struct device_node *dn, struct property **prop, return 0; } -static int update_dt_node(u32 phandle, s32 scope) +static int update_dt_node(__be32 phandle, s32 scope) { struct update_props_workarea *upwa; struct device_node *dn; @@ -135,6 +135,7 @@ static int update_dt_node(u32 phandle, s32 scope) char *prop_data; char *rtas_buf; int update_properties_token; + u32 nprops; u32 vd; update_properties_token = rtas_token("ibm,update-properties"); @@ -145,7 +146,7 @@ static int update_dt_node(u32 phandle, s32 scope) if (!rtas_buf) return -ENOMEM; - dn = of_find_node_by_phandle(phandle); + dn = of_find_node_by_phandle(be32_to_cpu(phandle)); if (!dn) { kfree(rtas_buf); return -ENOENT; @@ -161,6 +162,7 @@ static int update_dt_node(u32 phandle, s32 scope) break; prop_data = rtas_buf + sizeof(*upwa); + nprops = be32_to_cpu(upwa->nprops); /* On the first call to ibm,update-properties for a node the * the first property value descriptor contains an empty @@ -169,17 +171,17 @@ static int update_dt_node(u32 phandle, s32 scope) */ if (*prop_data == 0) { prop_data++; - vd = *(u32 *)prop_data; + vd = be32_to_cpu(*(__be32 *)prop_data); prop_data += vd + sizeof(vd); - upwa->nprops--; + nprops--; } - for (i = 0; i < upwa->nprops; i++) { + for (i = 0; i < nprops; i++) { char *prop_name; prop_name = prop_data; prop_data += strlen(prop_name) + 1; - vd = *(u32 *)prop_data; + vd = be32_to_cpu(*(__be32 *)prop_data); prop_data += sizeof(vd); switch (vd) { @@ -211,13 +213,13 @@ static int update_dt_node(u32 phandle, s32 scope) return 0; } -static int add_dt_node(u32 parent_phandle, u32 drc_index) +static int add_dt_node(__be32 parent_phandle, __be32 drc_index) { struct device_node *dn; struct device_node *parent_dn; int rc; - parent_dn = of_find_node_by_phandle(parent_phandle); + parent_dn = of_find_node_by_phandle(be32_to_cpu(parent_phandle)); if (!parent_dn) return -ENOENT; @@ -236,7 +238,7 @@ static int add_dt_node(u32 parent_phandle, u32 drc_index) int pseries_devicetree_update(s32 scope) { char *rtas_buf; - u32 *data; + __be32 *data; int update_nodes_token; int rc; @@ -253,17 +255,17 @@ int pseries_devicetree_update(s32 scope) if (rc && rc != 1) break; - data = (u32 *)rtas_buf + 4; - while (*data & NODE_ACTION_MASK) { + data = (__be32 *)rtas_buf + 4; + while (be32_to_cpu(*data) & NODE_ACTION_MASK) { int i; - u32 action = *data & NODE_ACTION_MASK; - int node_count = *data & NODE_COUNT_MASK; + u32 action = be32_to_cpu(*data) & NODE_ACTION_MASK; + u32 node_count = be32_to_cpu(*data) & NODE_COUNT_MASK; data++; for (i = 0; i < node_count; i++) { - u32 phandle = *data++; - u32 drc_index; + __be32 phandle = *data++; + __be32 drc_index; switch (action) { case DELETE_DT_NODE: From 12c46a1ecb1e8fc13b25402efb41dda20cae3c96 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Wed, 17 Dec 2014 19:06:31 -0600 Subject: [PATCH 114/236] powerpc/mpc85xx: Add ranges to etsec2 nodes commit bb344ca5b90df62b1a3b7a35c6a9d00b306a170d upstream. Commit 746c9e9f92dd "of/base: Fix PowerPC address parsing hack" limited the applicability of the workaround whereby a missing ranges is treated as an empty ranges. This workaround was hiding a bug in the etsec2 device tree nodes, which have children with reg, but did not have ranges. Signed-off-by: Scott Wood Reported-by: Alexander Graf Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/boot/dts/fsl/pq3-etsec2-0.dtsi | 1 + arch/powerpc/boot/dts/fsl/pq3-etsec2-1.dtsi | 1 + arch/powerpc/boot/dts/fsl/pq3-etsec2-2.dtsi | 1 + 3 files changed, 3 insertions(+) diff --git a/arch/powerpc/boot/dts/fsl/pq3-etsec2-0.dtsi b/arch/powerpc/boot/dts/fsl/pq3-etsec2-0.dtsi index 1382fec9e8c5..7fcb1ac0f232 100644 --- a/arch/powerpc/boot/dts/fsl/pq3-etsec2-0.dtsi +++ b/arch/powerpc/boot/dts/fsl/pq3-etsec2-0.dtsi @@ -50,6 +50,7 @@ ethernet@b0000 { fsl,num_tx_queues = <0x8>; fsl,magic-packet; local-mac-address = [ 00 00 00 00 00 00 ]; + ranges; queue-group@b0000 { #address-cells = <1>; diff --git a/arch/powerpc/boot/dts/fsl/pq3-etsec2-1.dtsi b/arch/powerpc/boot/dts/fsl/pq3-etsec2-1.dtsi index 221cd2ea5b31..9f25427c1527 100644 --- a/arch/powerpc/boot/dts/fsl/pq3-etsec2-1.dtsi +++ b/arch/powerpc/boot/dts/fsl/pq3-etsec2-1.dtsi @@ -50,6 +50,7 @@ ethernet@b1000 { fsl,num_tx_queues = <0x8>; fsl,magic-packet; local-mac-address = [ 00 00 00 00 00 00 ]; + ranges; queue-group@b1000 { #address-cells = <1>; diff --git a/arch/powerpc/boot/dts/fsl/pq3-etsec2-2.dtsi b/arch/powerpc/boot/dts/fsl/pq3-etsec2-2.dtsi index 61456c317609..cd7c318ab131 100644 --- a/arch/powerpc/boot/dts/fsl/pq3-etsec2-2.dtsi +++ b/arch/powerpc/boot/dts/fsl/pq3-etsec2-2.dtsi @@ -49,6 +49,7 @@ ethernet@b2000 { fsl,num_tx_queues = <0x8>; fsl,magic-packet; local-mac-address = [ 00 00 00 00 00 00 ]; + ranges; queue-group@b2000 { #address-cells = <1>; From d97099622356f01e3d58162628c91654c19d46bf Mon Sep 17 00:00:00 2001 From: Markos Chandras Date: Thu, 19 Mar 2015 10:28:14 +0000 Subject: [PATCH 115/236] net: ethernet: pcnet32: Setup the SRAM and NOUFLO on Am79C97{3, 5} commit 87f966d97b89774162df04d2106c6350c8fe4cb3 upstream. On a MIPS Malta board, tons of fifo underflow errors have been observed when using u-boot as bootloader instead of YAMON. The reason for that is that YAMON used to set the pcnet device to SRAM mode but u-boot does not. As a result, the default Tx threshold (64 bytes) is now too small to keep the fifo relatively used and it can result to Tx fifo underflow errors. As a result of which, it's best to setup the SRAM on supported controllers so we can always use the NOUFLO bit. Cc: Cc: Cc: Don Fry Signed-off-by: Markos Chandras Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/amd/pcnet32.c | 31 ++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/amd/pcnet32.c b/drivers/net/ethernet/amd/pcnet32.c index 9339cccfe05a..ad0e71c7a607 100644 --- a/drivers/net/ethernet/amd/pcnet32.c +++ b/drivers/net/ethernet/amd/pcnet32.c @@ -1516,7 +1516,7 @@ pcnet32_probe1(unsigned long ioaddr, int shared, struct pci_dev *pdev) { struct pcnet32_private *lp; int i, media; - int fdx, mii, fset, dxsuflo; + int fdx, mii, fset, dxsuflo, sram; int chip_version; char *chipname; struct net_device *dev; @@ -1553,7 +1553,7 @@ pcnet32_probe1(unsigned long ioaddr, int shared, struct pci_dev *pdev) } /* initialize variables */ - fdx = mii = fset = dxsuflo = 0; + fdx = mii = fset = dxsuflo = sram = 0; chip_version = (chip_version >> 12) & 0xffff; switch (chip_version) { @@ -1586,6 +1586,7 @@ pcnet32_probe1(unsigned long ioaddr, int shared, struct pci_dev *pdev) chipname = "PCnet/FAST III 79C973"; /* PCI */ fdx = 1; mii = 1; + sram = 1; break; case 0x2626: chipname = "PCnet/Home 79C978"; /* PCI */ @@ -1609,6 +1610,7 @@ pcnet32_probe1(unsigned long ioaddr, int shared, struct pci_dev *pdev) chipname = "PCnet/FAST III 79C975"; /* PCI */ fdx = 1; mii = 1; + sram = 1; break; case 0x2628: chipname = "PCnet/PRO 79C976"; @@ -1637,6 +1639,31 @@ pcnet32_probe1(unsigned long ioaddr, int shared, struct pci_dev *pdev) dxsuflo = 1; } + /* + * The Am79C973/Am79C975 controllers come with 12K of SRAM + * which we can use for the Tx/Rx buffers but most importantly, + * the use of SRAM allow us to use the BCR18:NOUFLO bit to avoid + * Tx fifo underflows. + */ + if (sram) { + /* + * The SRAM is being configured in two steps. First we + * set the SRAM size in the BCR25:SRAM_SIZE bits. According + * to the datasheet, each bit corresponds to a 512-byte + * page so we can have at most 24 pages. The SRAM_SIZE + * holds the value of the upper 8 bits of the 16-bit SRAM size. + * The low 8-bits start at 0x00 and end at 0xff. So the + * address range is from 0x0000 up to 0x17ff. Therefore, + * the SRAM_SIZE is set to 0x17. The next step is to set + * the BCR26:SRAM_BND midway through so the Tx and Rx + * buffers can share the SRAM equally. + */ + a->write_bcr(ioaddr, 25, 0x17); + a->write_bcr(ioaddr, 26, 0xc); + /* And finally enable the NOUFLO bit */ + a->write_bcr(ioaddr, 18, a->read_bcr(ioaddr, 18) | (1 << 11)); + } + dev = alloc_etherdev(sizeof(*lp)); if (!dev) { ret = -ENOMEM; From 98f97ba3e03ab1529fa099ac06f150cbe86df4d3 Mon Sep 17 00:00:00 2001 From: Ameya Palande <2ameya@gmail.com> Date: Thu, 26 Feb 2015 12:05:51 -0800 Subject: [PATCH 116/236] mfd: kempld-core: Fix callback return value check commit c8648508ebfc597058d2cd00b6c539110264a167 upstream. On success, callback function returns 0. So invert the if condition check so that we can break out of loop. Signed-off-by: Ameya Palande <2ameya@gmail.com> Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman --- drivers/mfd/kempld-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mfd/kempld-core.c b/drivers/mfd/kempld-core.c index 38917a822335..2df3cbc968d1 100644 --- a/drivers/mfd/kempld-core.c +++ b/drivers/mfd/kempld-core.c @@ -629,7 +629,7 @@ static int __init kempld_init(void) if (force_device_id[0]) { for (id = kempld_dmi_table; id->matches[0].slot != DMI_NONE; id++) if (strstr(id->ident, force_device_id)) - if (id->callback && id->callback(id)) + if (id->callback && !id->callback(id)) break; if (id->matches[0].slot == DMI_NONE) return -ENODEV; From 80f018df0dbd9a233241094870c13cccb33ba088 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 13 Apr 2015 14:03:16 +0200 Subject: [PATCH 117/236] Linux 3.14.38 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index c24acc0d34a1..f09e19d2cc4f 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 3 PATCHLEVEL = 14 -SUBLEVEL = 37 +SUBLEVEL = 38 EXTRAVERSION = NAME = Remembering Coco From c82d759a596eb688e804ec1d1727f78d042b26fb Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 1 Apr 2014 17:08:42 +0200 Subject: [PATCH 118/236] vfs: rename: move d_move() up Move the d_move() in vfs_rename_dir() up, similarly to how it's done in vfs_rename_other(). The next patch will consolidate these two functions and this is the only structural difference between them. I'm not sure if doing the d_move() after the dput is even valid. But there may be a logical explanation for that. But moving the d_move() before the dput() (and the mutex_unlock()) should definitely not hurt. Signed-off-by: Miklos Szeredi Reviewed-by: J. Bruce Fields (cherry picked from commit de22a4c3720a96f1c2ebf12b0857b6db6a991f2c) Signed-off-by: Alex Shi --- fs/namei.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 0dd72c8e65fd..f8a6d196107f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4049,13 +4049,12 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, target->i_flags |= S_DEAD; dont_mount(new_dentry); } + if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) + d_move(old_dentry, new_dentry); out: if (target) mutex_unlock(&target->i_mutex); dput(new_dentry); - if (!error) - if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) - d_move(old_dentry,new_dentry); return error; } From 75a4b5ddf548bcfb932cc0ba8cb8eb7806ae9dbc Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 1 Apr 2014 17:08:42 +0200 Subject: [PATCH 119/236] vfs: rename: use common code for dir and non-dir There's actually very little difference between vfs_rename_dir() and vfs_rename_other() so move both inline into vfs_rename() which still stays reasonably readable. Signed-off-by: Miklos Szeredi Reviewed-by: J. Bruce Fields (cherry picked from commit bc27027a73e8b80376b51a1583ad1c7445605e8a) Signed-off-by: Alex Shi --- fs/namei.c | 217 ++++++++++++++++++++++------------------------------- 1 file changed, 90 insertions(+), 127 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index f8a6d196107f..4fa9e66e92ac 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3977,7 +3977,27 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0); } -/* +/** + * vfs_rename - rename a filesystem object + * @old_dir: parent of source + * @old_dentry: source + * @new_dir: parent of destination + * @new_dentry: destination + * @delegated_inode: returns an inode needing a delegation break + * + * The caller must hold multiple mutexes--see lock_rename()). + * + * If vfs_rename discovers a delegation in need of breaking at either + * the source or destination, it will return -EWOULDBLOCK and return a + * reference to the inode in delegated_inode. The caller should then + * break the delegation and retry. Because breaking a delegation may + * take a long time, the caller should drop all locks before doing + * so. + * + * Alternatively, a caller may pass NULL for delegated_inode. This may + * be appropriate for callers that expect the underlying filesystem not + * to be NFS exported. + * * The worst of all namespace operations - renaming directory. "Perverted" * doesn't even start to describe it. Somebody in UCB had a heck of a trip... * Problems: @@ -4005,137 +4025,24 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname * ->i_mutex on parents, which works but leads to some truly excessive * locking]. */ -static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) -{ - int error = 0; - struct inode *target = new_dentry->d_inode; - unsigned max_links = new_dir->i_sb->s_max_links; - - /* - * If we are going to change the parent - check write permissions, - * we'll need to flip '..'. - */ - if (new_dir != old_dir) { - error = inode_permission(old_dentry->d_inode, MAY_WRITE); - if (error) - return error; - } - - error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); - if (error) - return error; - - dget(new_dentry); - if (target) - mutex_lock(&target->i_mutex); - - error = -EBUSY; - if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry)) - goto out; - - error = -EMLINK; - if (max_links && !target && new_dir != old_dir && - new_dir->i_nlink >= max_links) - goto out; - - if (target) - shrink_dcache_parent(new_dentry); - error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); - if (error) - goto out; - - if (target) { - target->i_flags |= S_DEAD; - dont_mount(new_dentry); - } - if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) - d_move(old_dentry, new_dentry); -out: - if (target) - mutex_unlock(&target->i_mutex); - dput(new_dentry); - return error; -} - -static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - struct inode **delegated_inode) -{ - struct inode *target = new_dentry->d_inode; - struct inode *source = old_dentry->d_inode; - int error; - - error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); - if (error) - return error; - - dget(new_dentry); - lock_two_nondirectories(source, target); - - error = -EBUSY; - if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) - goto out; - - error = try_break_deleg(source, delegated_inode); - if (error) - goto out; - if (target) { - error = try_break_deleg(target, delegated_inode); - if (error) - goto out; - } - error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); - if (error) - goto out; - - if (target) - dont_mount(new_dentry); - if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) - d_move(old_dentry, new_dentry); -out: - unlock_two_nondirectories(source, target); - dput(new_dentry); - return error; -} - -/** - * vfs_rename - rename a filesystem object - * @old_dir: parent of source - * @old_dentry: source - * @new_dir: parent of destination - * @new_dentry: destination - * @delegated_inode: returns an inode needing a delegation break - * - * The caller must hold multiple mutexes--see lock_rename()). - * - * If vfs_rename discovers a delegation in need of breaking at either - * the source or destination, it will return -EWOULDBLOCK and return a - * reference to the inode in delegated_inode. The caller should then - * break the delegation and retry. Because breaking a delegation may - * take a long time, the caller should drop all locks before doing - * so. - * - * Alternatively, a caller may pass NULL for delegated_inode. This may - * be appropriate for callers that expect the underlying filesystem not - * to be NFS exported. - */ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, struct inode **delegated_inode) { int error; - int is_dir = d_is_dir(old_dentry); + bool is_dir = d_is_dir(old_dentry); const unsigned char *old_name; + struct inode *source = old_dentry->d_inode; + struct inode *target = new_dentry->d_inode; + + if (source == target) + return 0; - if (old_dentry->d_inode == new_dentry->d_inode) - return 0; - error = may_delete(old_dir, old_dentry, is_dir); if (error) return error; - if (!new_dentry->d_inode) + if (!target) error = may_create(new_dir, new_dentry); else error = may_delete(new_dir, new_dentry, is_dir); @@ -4145,15 +4052,71 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!old_dir->i_op->rename) return -EPERM; - old_name = fsnotify_oldname_init(old_dentry->d_name.name); + /* + * If we are going to change the parent - check write permissions, + * we'll need to flip '..'. + */ + if (is_dir && new_dir != old_dir) { + error = inode_permission(source, MAY_WRITE); + if (error) + return error; + } - if (is_dir) - error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry); - else - error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry,delegated_inode); + error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); + if (error) + return error; + + old_name = fsnotify_oldname_init(old_dentry->d_name.name); + dget(new_dentry); + if (!is_dir) + lock_two_nondirectories(source, target); + else if (target) + mutex_lock(&target->i_mutex); + + error = -EBUSY; + if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry)) + goto out; + + if (is_dir) { + unsigned max_links = new_dir->i_sb->s_max_links; + + error = -EMLINK; + if (max_links && !target && new_dir != old_dir && + new_dir->i_nlink >= max_links) + goto out; + + if (target) + shrink_dcache_parent(new_dentry); + } else { + error = try_break_deleg(source, delegated_inode); + if (error) + goto out; + if (target) { + error = try_break_deleg(target, delegated_inode); + if (error) + goto out; + } + } + error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); + if (error) + goto out; + + if (target) { + if (is_dir) + target->i_flags |= S_DEAD; + dont_mount(new_dentry); + } + if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) + d_move(old_dentry, new_dentry); +out: + if (!is_dir) + unlock_two_nondirectories(source, target); + else if (target) + mutex_unlock(&target->i_mutex); + dput(new_dentry); if (!error) fsnotify_move(old_dir, new_dir, old_name, is_dir, - new_dentry->d_inode, old_dentry); + target, old_dentry); fsnotify_oldname_free(old_name); return error; From 148622f8708456dda3262bbbe08742a962a0d667 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 1 Apr 2014 17:08:42 +0200 Subject: [PATCH 120/236] vfs: add renameat2 syscall Add new renameat2 syscall, which is the same as renameat with an added flags argument. Pass flags to vfs_rename() and to i_op->rename() as well. Signed-off-by: Miklos Szeredi Reviewed-by: J. Bruce Fields (cherry picked from commit 520c8b16505236fc82daa352e6c5e73cd9870cff) Signed-off-by: Alex Shi --- Documentation/filesystems/Locking | 6 +++- Documentation/filesystems/vfs.txt | 16 +++++++++ arch/x86/syscalls/syscall_64.tbl | 1 + .../lustre/include/linux/lustre_compat25.h | 4 +-- .../staging/lustre/lustre/lvfs/lvfs_linux.c | 2 +- fs/cachefiles/namei.c | 2 +- fs/ecryptfs/inode.c | 2 +- fs/namei.c | 34 +++++++++++++++---- fs/nfsd/vfs.c | 2 +- include/linux/fs.h | 4 ++- 10 files changed, 58 insertions(+), 15 deletions(-) diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 5b0c083d7c0e..f424e0e5b46b 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -47,6 +47,8 @@ prototypes: int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t); int (*rename) (struct inode *, struct dentry *, struct inode *, struct dentry *); + int (*rename2) (struct inode *, struct dentry *, + struct inode *, struct dentry *, unsigned int); int (*readlink) (struct dentry *, char __user *,int); void * (*follow_link) (struct dentry *, struct nameidata *); void (*put_link) (struct dentry *, struct nameidata *, void *); @@ -78,6 +80,7 @@ mkdir: yes unlink: yes (both) rmdir: yes (both) (see below) rename: yes (all) (see below) +rename2: yes (all) (see below) readlink: no follow_link: no put_link: no @@ -96,7 +99,8 @@ tmpfile: no Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on victim. - cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem. + cross-directory ->rename() and rename2() has (per-superblock) +->s_vfs_rename_sem. See Documentation/filesystems/directory-locking for more detailed discussion of the locking scheme for directory operations. diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index c53784c119c8..94eb86287bcb 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -347,6 +347,8 @@ struct inode_operations { int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t); int (*rename) (struct inode *, struct dentry *, struct inode *, struct dentry *); + int (*rename2) (struct inode *, struct dentry *, + struct inode *, struct dentry *, unsigned int); int (*readlink) (struct dentry *, char __user *,int); void * (*follow_link) (struct dentry *, struct nameidata *); void (*put_link) (struct dentry *, struct nameidata *, void *); @@ -414,6 +416,20 @@ otherwise noted. rename: called by the rename(2) system call to rename the object to have the parent and name given by the second inode and dentry. + rename2: this has an additional flags argument compared to rename. + If no flags are supported by the filesystem then this method + need not be implemented. If some flags are supported then the + filesystem must return -EINVAL for any unsupported or unknown + flags. Currently the following flags are implemented: + (1) RENAME_NOREPLACE: this flag indicates that if the target + of the rename exists the rename should fail with -EEXIST + instead of replacing the target. The VFS already checks for + existence, so for local filesystems the RENAME_NOREPLACE + implementation is equivalent to plain rename. + (2) RENAME_EXCHANGE: exchange source and target. Both must + exist; this is checked by the VFS. Unlike plain rename, + source and target may be of different type. + readlink: called by the readlink(2) system call. Only required if you want to support reading symbolic links diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index 7f517ca19d32..ec255a1646d2 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -322,6 +322,7 @@ 313 common finit_module sys_finit_module 314 common sched_setattr sys_sched_setattr 315 common sched_getattr sys_sched_getattr +316 common renameat2 sys_renameat2 # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h b/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h index eefdb8d061b1..81cc7a0134bb 100644 --- a/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h +++ b/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h @@ -105,8 +105,8 @@ static inline void ll_set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt, #define ll_vfs_unlink(inode,entry,mnt) vfs_unlink(inode,entry) #define ll_vfs_mknod(dir,entry,mnt,mode,dev) vfs_mknod(dir,entry,mode,dev) #define ll_security_inode_unlink(dir,entry,mnt) security_inode_unlink(dir,entry) -#define ll_vfs_rename(old,old_dir,mnt,new,new_dir,mnt1,delegated_inode) \ - vfs_rename(old,old_dir,new,new_dir,delegated_inode) +#define ll_vfs_rename(old, old_dir, mnt, new, new_dir, mnt1) \ + vfs_rename(old, old_dir, new, new_dir, NULL, 0) #define cfs_bio_io_error(a,b) bio_io_error((a)) #define cfs_bio_endio(a,b,c) bio_endio((a),(c)) diff --git a/drivers/staging/lustre/lustre/lvfs/lvfs_linux.c b/drivers/staging/lustre/lustre/lvfs/lvfs_linux.c index 428ffd8c37b7..d50822be3230 100644 --- a/drivers/staging/lustre/lustre/lvfs/lvfs_linux.c +++ b/drivers/staging/lustre/lustre/lvfs/lvfs_linux.c @@ -223,7 +223,7 @@ int lustre_rename(struct dentry *dir, struct vfsmount *mnt, GOTO(put_old, err = PTR_ERR(dchild_new)); err = ll_vfs_rename(dir->d_inode, dchild_old, mnt, - dir->d_inode, dchild_new, mnt, NULL); + dir->d_inode, dchild_new, mnt); dput(dchild_new); put_old: diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index ca65f39dc8dc..31088a969351 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -396,7 +396,7 @@ try_again: cachefiles_io_error(cache, "Rename security error %d", ret); } else { ret = vfs_rename(dir->d_inode, rep, - cache->graveyard->d_inode, grave, NULL); + cache->graveyard->d_inode, grave, NULL, 0); if (ret != 0 && ret != -ENOMEM) cachefiles_io_error(cache, "Rename failed with error %d", ret); diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index a85ceb7c91bc..57ee4c53b4f8 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -641,7 +641,7 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, } rc = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry, lower_new_dir_dentry->d_inode, lower_new_dentry, - NULL); + NULL, 0); if (rc) goto out_lock; if (target_inode) diff --git a/fs/namei.c b/fs/namei.c index 4fa9e66e92ac..5134f8c92b25 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3984,6 +3984,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname * @new_dir: parent of destination * @new_dentry: destination * @delegated_inode: returns an inode needing a delegation break + * @flags: rename flags * * The caller must hold multiple mutexes--see lock_rename()). * @@ -4027,7 +4028,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname */ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, - struct inode **delegated_inode) + struct inode **delegated_inode, unsigned int flags) { int error; bool is_dir = d_is_dir(old_dentry); @@ -4052,6 +4053,9 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!old_dir->i_op->rename) return -EPERM; + if (flags && !old_dir->i_op->rename2) + return -EINVAL; + /* * If we are going to change the parent - check write permissions, * we'll need to flip '..'. @@ -4097,7 +4101,13 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; } } - error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); + if (!flags) { + error = old_dir->i_op->rename(old_dir, old_dentry, + new_dir, new_dentry); + } else { + error = old_dir->i_op->rename2(old_dir, old_dentry, + new_dir, new_dentry, flags); + } if (error) goto out; @@ -4122,8 +4132,8 @@ out: return error; } -SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, - int, newdfd, const char __user *, newname) +SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, + int, newdfd, const char __user *, newname, unsigned int, flags) { struct dentry *old_dir, *new_dir; struct dentry *old_dentry, *new_dentry; @@ -4135,6 +4145,10 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, unsigned int lookup_flags = 0; bool should_retry = false; int error; + + if (flags) + return -EINVAL; + retry: from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags); if (IS_ERR(from)) { @@ -4206,8 +4220,8 @@ retry_deleg: if (error) goto exit5; error = vfs_rename(old_dir->d_inode, old_dentry, - new_dir->d_inode, new_dentry, - &delegated_inode); + new_dir->d_inode, new_dentry, + &delegated_inode, flags); exit5: dput(new_dentry); exit4: @@ -4237,9 +4251,15 @@ exit: return error; } +SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, + int, newdfd, const char __user *, newname) +{ + return sys_renameat2(olddfd, oldname, newdfd, newname, 0); +} + SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname) { - return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname); + return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0); } int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index eea5ad188984..464f813a2544 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1703,7 +1703,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry) goto out_dput_new; - host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL); + host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0); if (!host_err) { host_err = commit_metadata(tfhp); if (!host_err) diff --git a/include/linux/fs.h b/include/linux/fs.h index 23b2a35d712e..3b3670e97da6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1460,7 +1460,7 @@ extern int vfs_symlink(struct inode *, struct dentry *, const char *); extern int vfs_link(struct dentry *, struct inode *, struct dentry *, struct inode **); extern int vfs_rmdir(struct inode *, struct dentry *); extern int vfs_unlink(struct inode *, struct dentry *, struct inode **); -extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **); +extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int); /* * VFS dentry helper functions. @@ -1571,6 +1571,8 @@ struct inode_operations { int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t); int (*rename) (struct inode *, struct dentry *, struct inode *, struct dentry *); + int (*rename2) (struct inode *, struct dentry *, + struct inode *, struct dentry *, unsigned int); int (*setattr) (struct dentry *, struct iattr *); int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); From f1bdf915fa37836cc39f76e86c65521504deb7e1 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 1 Apr 2014 17:08:43 +0200 Subject: [PATCH 121/236] vfs: add RENAME_NOREPLACE flag If this flag is specified and the target of the rename exists then the rename syscall fails with EEXIST. The VFS does the existence checking, so it is trivial to enable for most local filesystems. This patch only enables it in ext4. For network filesystems the VFS check is not enough as there may be a race between a remote create and the rename, so these filesystems need to handle this flag in their ->rename() implementations to ensure atomicity. Andy writes about why this is useful: "The trivial answer: to eliminate the race condition from 'mv -i'. Another answer: there's a common pattern to atomically create a file with contents: open a temporary file, write to it, optionally fsync it, close it, then link(2) it to the final name, then unlink the temporary file. The reason to use link(2) is because it won't silently clobber the destination. This is annoying: - It requires an extra system call that shouldn't be necessary. - It doesn't work on (IMO sensible) filesystems that don't support hard links (e.g. vfat). - It's not atomic -- there's an intermediate state where both files exist. - It's ugly. The new rename flag will make this totally sensible. To be fair, on new enough kernels, you can also use O_TMPFILE and linkat to achieve the same thing even more cleanly." Suggested-by: Andy Lutomirski Signed-off-by: Miklos Szeredi Reviewed-by: J. Bruce Fields (cherry picked from commit 0a7c3937a1f23f8cb5fc77ae01661e9968a51d0c) Signed-off-by: Alex Shi --- fs/ext4/namei.c | 11 +++++++++++ fs/namei.c | 31 ++++++++++++++++++------------- include/uapi/linux/fs.h | 2 ++ 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 2dcbfb6245d8..f0598d6e5722 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3191,6 +3191,16 @@ end_rename: return retval; } +static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + if (flags & ~RENAME_NOREPLACE) + return -EINVAL; + + return ext4_rename(old_dir, old_dentry, new_dir, new_dentry); +} + /* * directories can handle most operations... */ @@ -3205,6 +3215,7 @@ const struct inode_operations ext4_dir_inode_operations = { .mknod = ext4_mknod, .tmpfile = ext4_tmpfile, .rename = ext4_rename, + .rename2 = ext4_rename2, .setattr = ext4_setattr, .setxattr = generic_setxattr, .getxattr = generic_getxattr, diff --git a/fs/namei.c b/fs/namei.c index 5134f8c92b25..3f2cc3c5a5bf 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4146,7 +4146,7 @@ SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, bool should_retry = false; int error; - if (flags) + if (flags & ~RENAME_NOREPLACE) return -EINVAL; retry: @@ -4172,6 +4172,8 @@ retry: goto exit2; new_dir = newnd.path.dentry; + if (flags & RENAME_NOREPLACE) + error = -EEXIST; if (newnd.last_type != LAST_NORM) goto exit2; @@ -4194,22 +4196,25 @@ retry_deleg: error = -ENOENT; if (d_is_negative(old_dentry)) goto exit4; - /* unless the source is a directory trailing slashes give -ENOTDIR */ - if (!d_is_dir(old_dentry)) { - error = -ENOTDIR; - if (oldnd.last.name[oldnd.last.len]) - goto exit4; - if (newnd.last.name[newnd.last.len]) - goto exit4; - } - /* source should not be ancestor of target */ - error = -EINVAL; - if (old_dentry == trap) - goto exit4; new_dentry = lookup_hash(&newnd); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) goto exit4; + error = -EEXIST; + if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry)) + goto exit5; + /* unless the source is a directory trailing slashes give -ENOTDIR */ + if (!d_is_dir(old_dentry)) { + error = -ENOTDIR; + if (oldnd.last.name[oldnd.last.len]) + goto exit5; + if (newnd.last.name[newnd.last.len]) + goto exit5; + } + /* source should not be ancestor of target */ + error = -EINVAL; + if (old_dentry == trap) + goto exit5; /* target should not be an ancestor of source */ error = -ENOTEMPTY; if (new_dentry == trap) diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 6c28b61bb690..9250f4dd7d96 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -35,6 +35,8 @@ #define SEEK_HOLE 4 /* seek to the next hole */ #define SEEK_MAX SEEK_HOLE +#define RENAME_NOREPLACE (1 << 0) /* Don't overwrite target */ + struct fstrim_range { __u64 start; __u64 len; From 5371fcd46e9276f4216e329ba2714d595a384a1f Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 1 Apr 2014 17:08:43 +0200 Subject: [PATCH 122/236] security: add flags to rename hooks Add flags to security_path_rename() and security_inode_rename() hooks. Signed-off-by: Miklos Szeredi Reviewed-by: J. Bruce Fields (cherry picked from commit 0b3974eb04c4874e85fa1d4fc70450d12f28611d) Signed-off-by: Alex Shi --- fs/cachefiles/namei.c | 2 +- fs/namei.c | 5 +++-- include/linux/security.h | 12 ++++++++---- security/security.c | 6 ++++-- 4 files changed, 16 insertions(+), 9 deletions(-) diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 31088a969351..6494d9f673aa 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -391,7 +391,7 @@ try_again: path.dentry = dir; path_to_graveyard.mnt = cache->mnt; path_to_graveyard.dentry = cache->graveyard; - ret = security_path_rename(&path, rep, &path_to_graveyard, grave); + ret = security_path_rename(&path, rep, &path_to_graveyard, grave, 0); if (ret < 0) { cachefiles_io_error(cache, "Rename security error %d", ret); } else { diff --git a/fs/namei.c b/fs/namei.c index 3f2cc3c5a5bf..137a7b8c8a04 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4066,7 +4066,8 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, return error; } - error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); + error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry, + flags); if (error) return error; @@ -4221,7 +4222,7 @@ retry_deleg: goto exit5; error = security_path_rename(&oldnd.path, old_dentry, - &newnd.path, new_dentry); + &newnd.path, new_dentry, flags); if (error) goto exit5; error = vfs_rename(old_dir->d_inode, old_dentry, diff --git a/include/linux/security.h b/include/linux/security.h index 2fc42d191f79..6478ce3252c7 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1793,7 +1793,8 @@ int security_inode_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) int security_inode_rmdir(struct inode *dir, struct dentry *dentry); int security_inode_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev); int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry); + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags); int security_inode_readlink(struct dentry *dentry); int security_inode_follow_link(struct dentry *dentry, struct nameidata *nd); int security_inode_permission(struct inode *inode, int mask); @@ -2161,7 +2162,8 @@ static inline int security_inode_mknod(struct inode *dir, static inline int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, - struct dentry *new_dentry) + struct dentry *new_dentry, + unsigned int flags) { return 0; } @@ -2955,7 +2957,8 @@ int security_path_symlink(struct path *dir, struct dentry *dentry, int security_path_link(struct dentry *old_dentry, struct path *new_dir, struct dentry *new_dentry); int security_path_rename(struct path *old_dir, struct dentry *old_dentry, - struct path *new_dir, struct dentry *new_dentry); + struct path *new_dir, struct dentry *new_dentry, + unsigned int flags); int security_path_chmod(struct path *path, umode_t mode); int security_path_chown(struct path *path, kuid_t uid, kgid_t gid); int security_path_chroot(struct path *path); @@ -3003,7 +3006,8 @@ static inline int security_path_link(struct dentry *old_dentry, static inline int security_path_rename(struct path *old_dir, struct dentry *old_dentry, struct path *new_dir, - struct dentry *new_dentry) + struct dentry *new_dentry, + unsigned int flags) { return 0; } diff --git a/security/security.c b/security/security.c index 919cad93ac82..284fbc99aa9d 100644 --- a/security/security.c +++ b/security/security.c @@ -433,7 +433,8 @@ int security_path_link(struct dentry *old_dentry, struct path *new_dir, } int security_path_rename(struct path *old_dir, struct dentry *old_dentry, - struct path *new_dir, struct dentry *new_dentry) + struct path *new_dir, struct dentry *new_dentry, + unsigned int flags) { if (unlikely(IS_PRIVATE(old_dentry->d_inode) || (new_dentry->d_inode && IS_PRIVATE(new_dentry->d_inode)))) @@ -524,7 +525,8 @@ int security_inode_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, } int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { if (unlikely(IS_PRIVATE(old_dentry->d_inode) || (new_dentry->d_inode && IS_PRIVATE(new_dentry->d_inode)))) From c3ae3e8840e6b555949cda895d885ad91c5e3f24 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 1 Apr 2014 17:08:43 +0200 Subject: [PATCH 123/236] vfs: add cross-rename If flags contain RENAME_EXCHANGE then exchange source and destination files. There's no restriction on the type of the files; e.g. a directory can be exchanged with a symlink. Signed-off-by: Miklos Szeredi Reviewed-by: Jan Kara Reviewed-by: J. Bruce Fields (cherry picked from commit da1ce0670c14d8380e423a3239e562a1dc15fa9e) Signed-off-by: Alex Shi --- fs/dcache.c | 50 +++++++++++++++---- fs/namei.c | 104 +++++++++++++++++++++++++++------------- include/linux/dcache.h | 1 + include/uapi/linux/fs.h | 1 + security/security.c | 16 +++++++ 5 files changed, 131 insertions(+), 41 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 436612777203..e3c556351810 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2481,12 +2481,14 @@ static void switch_names(struct dentry *dentry, struct dentry *target) dentry->d_name.name = dentry->d_iname; } else { /* - * Both are internal. Just copy target to dentry + * Both are internal. */ - memcpy(dentry->d_iname, target->d_name.name, - target->d_name.len + 1); - dentry->d_name.len = target->d_name.len; - return; + unsigned int i; + BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long))); + for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) { + swap(((long *) &dentry->d_iname)[i], + ((long *) &target->d_iname)[i]); + } } } swap(dentry->d_name.len, target->d_name.len); @@ -2543,13 +2545,15 @@ static void dentry_unlock_parents_for_move(struct dentry *dentry, * __d_move - move a dentry * @dentry: entry to move * @target: new dentry + * @exchange: exchange the two dentries * * Update the dcache to reflect the move of a file name. Negative * dcache entries should not be moved in this way. Caller must hold * rename_lock, the i_mutex of the source and target directories, * and the sb->s_vfs_rename_mutex if they differ. See lock_rename(). */ -static void __d_move(struct dentry * dentry, struct dentry * target) +static void __d_move(struct dentry *dentry, struct dentry *target, + bool exchange) { if (!dentry->d_inode) printk(KERN_WARNING "VFS: moving negative dcache entry\n"); @@ -2571,8 +2575,15 @@ static void __d_move(struct dentry * dentry, struct dentry * target) __d_drop(dentry); __d_rehash(dentry, d_hash(target->d_parent, target->d_name.hash)); - /* Unhash the target: dput() will then get rid of it */ + /* + * Unhash the target (d_delete() is not usable here). If exchanging + * the two dentries, then rehash onto the other's hash queue. + */ __d_drop(target); + if (exchange) { + __d_rehash(target, + d_hash(dentry->d_parent, dentry->d_name.hash)); + } list_del(&dentry->d_u.d_child); list_del(&target->d_u.d_child); @@ -2599,6 +2610,8 @@ static void __d_move(struct dentry * dentry, struct dentry * target) write_seqcount_end(&dentry->d_seq); dentry_unlock_parents_for_move(dentry, target); + if (exchange) + fsnotify_d_move(target); spin_unlock(&target->d_lock); fsnotify_d_move(dentry); spin_unlock(&dentry->d_lock); @@ -2616,11 +2629,30 @@ static void __d_move(struct dentry * dentry, struct dentry * target) void d_move(struct dentry *dentry, struct dentry *target) { write_seqlock(&rename_lock); - __d_move(dentry, target); + __d_move(dentry, target, false); write_sequnlock(&rename_lock); } EXPORT_SYMBOL(d_move); +/* + * d_exchange - exchange two dentries + * @dentry1: first dentry + * @dentry2: second dentry + */ +void d_exchange(struct dentry *dentry1, struct dentry *dentry2) +{ + write_seqlock(&rename_lock); + + WARN_ON(!dentry1->d_inode); + WARN_ON(!dentry2->d_inode); + WARN_ON(IS_ROOT(dentry1)); + WARN_ON(IS_ROOT(dentry2)); + + __d_move(dentry1, dentry2, true); + + write_sequnlock(&rename_lock); +} + /** * d_ancestor - search for an ancestor * @p1: ancestor dentry @@ -2668,7 +2700,7 @@ static struct dentry *__d_unalias(struct inode *inode, m2 = &alias->d_parent->d_inode->i_mutex; out_unalias: if (likely(!d_mountpoint(alias))) { - __d_move(alias, dentry); + __d_move(alias, dentry, false); ret = alias; } out_err: diff --git a/fs/namei.c b/fs/namei.c index 137a7b8c8a04..5e41971c15e1 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4035,6 +4035,8 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, const unsigned char *old_name; struct inode *source = old_dentry->d_inode; struct inode *target = new_dentry->d_inode; + bool new_is_dir = false; + unsigned max_links = new_dir->i_sb->s_max_links; if (source == target) return 0; @@ -4043,10 +4045,16 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (error) return error; - if (!target) + if (!target) { error = may_create(new_dir, new_dentry); - else - error = may_delete(new_dir, new_dentry, is_dir); + } else { + new_is_dir = d_is_dir(new_dentry); + + if (!(flags & RENAME_EXCHANGE)) + error = may_delete(new_dir, new_dentry, is_dir); + else + error = may_delete(new_dir, new_dentry, new_is_dir); + } if (error) return error; @@ -4060,10 +4068,17 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, * If we are going to change the parent - check write permissions, * we'll need to flip '..'. */ - if (is_dir && new_dir != old_dir) { - error = inode_permission(source, MAY_WRITE); - if (error) - return error; + if (new_dir != old_dir) { + if (is_dir) { + error = inode_permission(source, MAY_WRITE); + if (error) + return error; + } + if ((flags & RENAME_EXCHANGE) && new_is_dir) { + error = inode_permission(target, MAY_WRITE); + if (error) + return error; + } } error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry, @@ -4073,7 +4088,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, old_name = fsnotify_oldname_init(old_dentry->d_name.name); dget(new_dentry); - if (!is_dir) + if (!is_dir || (flags & RENAME_EXCHANGE)) lock_two_nondirectories(source, target); else if (target) mutex_lock(&target->i_mutex); @@ -4082,25 +4097,25 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry)) goto out; - if (is_dir) { - unsigned max_links = new_dir->i_sb->s_max_links; - + if (max_links && new_dir != old_dir) { error = -EMLINK; - if (max_links && !target && new_dir != old_dir && - new_dir->i_nlink >= max_links) + if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links) goto out; - - if (target) - shrink_dcache_parent(new_dentry); - } else { + if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir && + old_dir->i_nlink >= max_links) + goto out; + } + if (is_dir && !(flags & RENAME_EXCHANGE) && target) + shrink_dcache_parent(new_dentry); + if (!is_dir) { error = try_break_deleg(source, delegated_inode); if (error) goto out; - if (target) { - error = try_break_deleg(target, delegated_inode); - if (error) - goto out; - } + } + if (target && !new_is_dir) { + error = try_break_deleg(target, delegated_inode); + if (error) + goto out; } if (!flags) { error = old_dir->i_op->rename(old_dir, old_dentry, @@ -4112,22 +4127,31 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (error) goto out; - if (target) { + if (!(flags & RENAME_EXCHANGE) && target) { if (is_dir) target->i_flags |= S_DEAD; dont_mount(new_dentry); } - if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) - d_move(old_dentry, new_dentry); + if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) { + if (!(flags & RENAME_EXCHANGE)) + d_move(old_dentry, new_dentry); + else + d_exchange(old_dentry, new_dentry); + } out: - if (!is_dir) + if (!is_dir || (flags & RENAME_EXCHANGE)) unlock_two_nondirectories(source, target); else if (target) mutex_unlock(&target->i_mutex); dput(new_dentry); - if (!error) + if (!error) { fsnotify_move(old_dir, new_dir, old_name, is_dir, - target, old_dentry); + !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry); + if (flags & RENAME_EXCHANGE) { + fsnotify_move(new_dir, old_dir, old_dentry->d_name.name, + new_is_dir, NULL, new_dentry); + } + } fsnotify_oldname_free(old_name); return error; @@ -4147,7 +4171,10 @@ SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, bool should_retry = false; int error; - if (flags & ~RENAME_NOREPLACE) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + return -EINVAL; + + if ((flags & RENAME_NOREPLACE) && (flags & RENAME_EXCHANGE)) return -EINVAL; retry: @@ -4184,7 +4211,8 @@ retry: oldnd.flags &= ~LOOKUP_PARENT; newnd.flags &= ~LOOKUP_PARENT; - newnd.flags |= LOOKUP_RENAME_TARGET; + if (!(flags & RENAME_EXCHANGE)) + newnd.flags |= LOOKUP_RENAME_TARGET; retry_deleg: trap = lock_rename(new_dir, old_dir); @@ -4204,12 +4232,23 @@ retry_deleg: error = -EEXIST; if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry)) goto exit5; + if (flags & RENAME_EXCHANGE) { + error = -ENOENT; + if (d_is_negative(new_dentry)) + goto exit5; + + if (!d_is_dir(new_dentry)) { + error = -ENOTDIR; + if (newnd.last.name[newnd.last.len]) + goto exit5; + } + } /* unless the source is a directory trailing slashes give -ENOTDIR */ if (!d_is_dir(old_dentry)) { error = -ENOTDIR; if (oldnd.last.name[oldnd.last.len]) goto exit5; - if (newnd.last.name[newnd.last.len]) + if (!(flags & RENAME_EXCHANGE) && newnd.last.name[newnd.last.len]) goto exit5; } /* source should not be ancestor of target */ @@ -4217,7 +4256,8 @@ retry_deleg: if (old_dentry == trap) goto exit5; /* target should not be an ancestor of source */ - error = -ENOTEMPTY; + if (!(flags & RENAME_EXCHANGE)) + error = -ENOTEMPTY; if (new_dentry == trap) goto exit5; diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 3b50cac7ccb3..3b9bfdb83ba6 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -308,6 +308,7 @@ extern void dentry_update_name_case(struct dentry *, struct qstr *); /* used for rename() and baskets */ extern void d_move(struct dentry *, struct dentry *); +extern void d_exchange(struct dentry *, struct dentry *); extern struct dentry *d_ancestor(struct dentry *, struct dentry *); /* appendix may either be NULL or be used for transname suffixes */ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 9250f4dd7d96..ca1a11bb4443 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -36,6 +36,7 @@ #define SEEK_MAX SEEK_HOLE #define RENAME_NOREPLACE (1 << 0) /* Don't overwrite target */ +#define RENAME_EXCHANGE (1 << 1) /* Exchange source and dest */ struct fstrim_range { __u64 start; diff --git a/security/security.c b/security/security.c index 284fbc99aa9d..8b774f362a3d 100644 --- a/security/security.c +++ b/security/security.c @@ -439,6 +439,14 @@ int security_path_rename(struct path *old_dir, struct dentry *old_dentry, if (unlikely(IS_PRIVATE(old_dentry->d_inode) || (new_dentry->d_inode && IS_PRIVATE(new_dentry->d_inode)))) return 0; + + if (flags & RENAME_EXCHANGE) { + int err = security_ops->path_rename(new_dir, new_dentry, + old_dir, old_dentry); + if (err) + return err; + } + return security_ops->path_rename(old_dir, old_dentry, new_dir, new_dentry); } @@ -531,6 +539,14 @@ int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry, if (unlikely(IS_PRIVATE(old_dentry->d_inode) || (new_dentry->d_inode && IS_PRIVATE(new_dentry->d_inode)))) return 0; + + if (flags & RENAME_EXCHANGE) { + int err = security_ops->inode_rename(new_dir, new_dentry, + old_dir, old_dentry); + if (err) + return err; + } + return security_ops->inode_rename(old_dir, old_dentry, new_dir, new_dentry); } From a756aea51bb937b6a150c4d1cfff66824285491d Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 23 Jul 2014 15:15:30 +0200 Subject: [PATCH 124/236] fs: call rename2 if exists Christoph Hellwig suggests: 1) make vfs_rename call ->rename2 if it exists instead of ->rename 2) switch all filesystems that you're adding NOREPLACE support for to use ->rename2 3) see how many ->rename instances we'll have left after a few iterations of 2. Signed-off-by: Miklos Szeredi Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro (cherry picked from commit 7177a9c4b509eb357cc450256bc3cf39f1a1e639) Signed-off-by: Alex Shi Conflicts: fs/fuse/dir.c --- fs/ext4/namei.c | 1 - fs/fuse/dir.c | 32 +++++++++++++++++++++++++++++++- fs/namei.c | 5 +++-- 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index f0598d6e5722..98de4b391e79 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3214,7 +3214,6 @@ const struct inode_operations ext4_dir_inode_operations = { .rmdir = ext4_rmdir, .mknod = ext4_mknod, .tmpfile = ext4_tmpfile, - .rename = ext4_rename, .rename2 = ext4_rename2, .setattr = ext4_setattr, .setxattr = generic_setxattr, diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 342f0239fcbf..d53a7d91eb07 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -796,6 +796,36 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, return err; } +static int fuse_rename2(struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent, + unsigned int flags) +{ + struct fuse_conn *fc = get_fuse_conn(olddir); + int err; + + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + return -EINVAL; + + if (flags) { + if (fc->no_rename2 || fc->minor < 23) + return -EINVAL; + + err = fuse_rename_common(olddir, oldent, newdir, newent, flags, + FUSE_RENAME2, + sizeof(struct fuse_rename2_in)); + if (err == -ENOSYS) { + fc->no_rename2 = 1; + err = -EINVAL; + } + } else { + err = fuse_rename_common(olddir, oldent, newdir, newent, 0, + FUSE_RENAME, + sizeof(struct fuse_rename_in)); + } + + return err; +} + static int fuse_link(struct dentry *entry, struct inode *newdir, struct dentry *newent) { @@ -1882,7 +1912,7 @@ static const struct inode_operations fuse_dir_inode_operations = { .symlink = fuse_symlink, .unlink = fuse_unlink, .rmdir = fuse_rmdir, - .rename = fuse_rename, + .rename2 = fuse_rename2, .link = fuse_link, .setattr = fuse_setattr, .create = fuse_create, diff --git a/fs/namei.c b/fs/namei.c index 5e41971c15e1..fa7dd2d92caa 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4058,7 +4058,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (error) return error; - if (!old_dir->i_op->rename) + if (!old_dir->i_op->rename && !old_dir->i_op->rename2) return -EPERM; if (flags && !old_dir->i_op->rename2) @@ -4117,10 +4117,11 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (error) goto out; } - if (!flags) { + if (!old_dir->i_op->rename2) { error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); } else { + WARN_ON(old_dir->i_op->rename != NULL); error = old_dir->i_op->rename2(old_dir, old_dentry, new_dir, new_dentry, flags); } From c5948f26c2de38f3b475d10cfb6695245f52b6ff Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 1 Feb 2014 04:43:32 -0500 Subject: [PATCH 125/236] get rid of pointless checks for NULL ->i_op Signed-off-by: Al Viro (cherry picked from commit 627bf81ac625f05060db033a0f3791521ad7bd79) Signed-off-by: Alex Shi --- fs/cachefiles/bind.c | 1 - fs/cachefiles/namei.c | 3 +-- security/integrity/evm/evm_crypto.c | 2 +- security/integrity/evm/evm_main.c | 2 +- security/tomoyo/realpath.c | 4 ++-- 5 files changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c index 622f4696e484..5b99bafc31d1 100644 --- a/fs/cachefiles/bind.c +++ b/fs/cachefiles/bind.c @@ -124,7 +124,6 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache) /* check parameters */ ret = -EOPNOTSUPP; if (!root->d_inode || - !root->d_inode->i_op || !root->d_inode->i_op->lookup || !root->d_inode->i_op->mkdir || !root->d_inode->i_op->setxattr || diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 6494d9f673aa..c0a681705104 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -779,8 +779,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, } ret = -EPERM; - if (!subdir->d_inode->i_op || - !subdir->d_inode->i_op->setxattr || + if (!subdir->d_inode->i_op->setxattr || !subdir->d_inode->i_op->getxattr || !subdir->d_inode->i_op->lookup || !subdir->d_inode->i_op->mkdir || diff --git a/security/integrity/evm/evm_crypto.c b/security/integrity/evm/evm_crypto.c index 3bab89eb21d6..e90ab0e20db8 100644 --- a/security/integrity/evm/evm_crypto.c +++ b/security/integrity/evm/evm_crypto.c @@ -137,7 +137,7 @@ static int evm_calc_hmac_or_hash(struct dentry *dentry, int error; int size; - if (!inode->i_op || !inode->i_op->getxattr) + if (!inode->i_op->getxattr) return -EOPNOTSUPP; desc = init_desc(type); if (IS_ERR(desc)) diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c index 7e71e066198f..690cd632bd5b 100644 --- a/security/integrity/evm/evm_main.c +++ b/security/integrity/evm/evm_main.c @@ -62,7 +62,7 @@ static int evm_find_protected_xattrs(struct dentry *dentry) int error; int count = 0; - if (!inode->i_op || !inode->i_op->getxattr) + if (!inode->i_op->getxattr) return -EOPNOTSUPP; for (xattr = evm_config_xattrnames; *xattr != NULL; xattr++) { diff --git a/security/tomoyo/realpath.c b/security/tomoyo/realpath.c index 80a09c37cac8..a3386d119425 100644 --- a/security/tomoyo/realpath.c +++ b/security/tomoyo/realpath.c @@ -173,7 +173,7 @@ static char *tomoyo_get_local_path(struct dentry *dentry, char * const buffer, * Use filesystem name if filesystem does not support rename() * operation. */ - if (inode->i_op && !inode->i_op->rename) + if (!inode->i_op->rename) goto prepend_filesystem_name; } /* Prepend device name. */ @@ -282,7 +282,7 @@ char *tomoyo_realpath_from_path(struct path *path) * Get local name for filesystems without rename() operation * or dentry without vfsmount. */ - if (!path->mnt || (inode->i_op && !inode->i_op->rename)) + if (!path->mnt || !inode->i_op->rename) pos = tomoyo_get_local_path(path->dentry, buf, buf_len - 1); /* Get absolute name for the rest. */ From bb9934859afca84e19c3213060f2f2eeaff5d6a1 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 20 Aug 2014 14:14:04 +0900 Subject: [PATCH 126/236] tomoyo: Fix pathname calculation breakage. Commit 7177a9c4b509 ("fs: call rename2 if exists") changed "struct inode_operations"->rename == NULL if "struct inode_operations"->rename2 != NULL . TOMOYO needs to check for both ->rename and ->rename2 , or a system on (e.g.) ext4 filesystem won't boot. Signed-off-by: Tetsuo Handa Signed-off-by: Serge E. Hallyn (cherry picked from commit 8fe7a268b18ebc89203c766b020b9e32f1cfeebf) Signed-off-by: Alex Shi --- security/tomoyo/realpath.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/security/tomoyo/realpath.c b/security/tomoyo/realpath.c index a3386d119425..bed745c8b1a3 100644 --- a/security/tomoyo/realpath.c +++ b/security/tomoyo/realpath.c @@ -173,7 +173,7 @@ static char *tomoyo_get_local_path(struct dentry *dentry, char * const buffer, * Use filesystem name if filesystem does not support rename() * operation. */ - if (!inode->i_op->rename) + if (!inode->i_op->rename && !inode->i_op->rename2) goto prepend_filesystem_name; } /* Prepend device name. */ @@ -282,7 +282,8 @@ char *tomoyo_realpath_from_path(struct path *path) * Get local name for filesystems without rename() operation * or dentry without vfsmount. */ - if (!path->mnt || !inode->i_op->rename) + if (!path->mnt || + (!inode->i_op->rename && !inode->i_op->rename2)) pos = tomoyo_get_local_path(path->dentry, buf, buf_len - 1); /* Get absolute name for the rest. */ From 0750d56323044df86224bb9aa3d0830bada1cb81 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 14 Mar 2014 13:42:45 -0400 Subject: [PATCH 127/236] new helper: readlink_copy() Signed-off-by: Al Viro (cherry picked from commit 5d826c847b34de6415b4f1becd88a57ff619af50) Signed-off-by: Alex Shi Conflicts: fs/namei.c --- fs/namei.c | 12 ++++-------- fs/proc/namespaces.c | 14 ++++---------- fs/proc/self.c | 2 +- fs/xfs/xfs_ioctl.c | 28 +--------------------------- include/linux/fs.h | 2 +- 5 files changed, 11 insertions(+), 47 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index fa7dd2d92caa..ef28dcd3b719 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4309,11 +4309,9 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0); } -int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link) +int readlink_copy(char __user *buffer, int buflen, const char *link) { - int len; - - len = PTR_ERR(link); + int len = PTR_ERR(link); if (IS_ERR(link)) goto out; @@ -4342,7 +4340,7 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) if (IS_ERR(cookie)) return PTR_ERR(cookie); - res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd)); + res = readlink_copy(buffer, buflen, nd_get_link(&nd)); if (dentry->d_inode->i_op->put_link) dentry->d_inode->i_op->put_link(dentry, &nd, cookie); return res; @@ -4366,8 +4364,7 @@ static char *page_getlink(struct dentry * dentry, struct page **ppage) int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) { struct page *page = NULL; - char *s = page_getlink(dentry, &page); - int res = vfs_readlink(dentry,buffer,buflen,s); + int res = readlink_copy(buffer, buflen, page_getlink(dentry, &page)); if (page) { kunmap(page); page_cache_release(page); @@ -4463,7 +4460,6 @@ EXPORT_SYMBOL(vfs_link); EXPORT_SYMBOL(vfs_mkdir); EXPORT_SYMBOL(vfs_mknod); EXPORT_SYMBOL(generic_permission); -EXPORT_SYMBOL(vfs_readlink); EXPORT_SYMBOL(vfs_rename); EXPORT_SYMBOL(vfs_rmdir); EXPORT_SYMBOL(vfs_symlink); diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 9ae46b87470d..89026095f2b5 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -146,7 +146,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl struct task_struct *task; void *ns; char name[50]; - int len = -EACCES; + int res = -EACCES; task = get_proc_task(inode); if (!task) @@ -155,24 +155,18 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto out_put_task; - len = -ENOENT; + res = -ENOENT; ns = ns_ops->get(task); if (!ns) goto out_put_task; snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns)); - len = strlen(name); - - if (len > buflen) - len = buflen; - if (copy_to_user(buffer, name, len)) - len = -EFAULT; - + res = readlink_copy(buffer, buflen, name); ns_ops->put(ns); out_put_task: put_task_struct(task); out: - return len; + return res; } static const struct inode_operations proc_ns_link_inode_operations = { diff --git a/fs/proc/self.c b/fs/proc/self.c index ffeb202ec942..4348bb8907c2 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -16,7 +16,7 @@ static int proc_self_readlink(struct dentry *dentry, char __user *buffer, if (!tgid) return -ENOENT; sprintf(tmp, "%d", tgid); - return vfs_readlink(dentry,buffer,buflen,tmp); + return readlink_copy(buffer, buflen, tmp); } static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 78e62cc471c5..6152cbe353e8 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -271,32 +271,6 @@ xfs_open_by_handle( return error; } -/* - * This is a copy from fs/namei.c:vfs_readlink(), except for removing it's - * unused first argument. - */ -STATIC int -do_readlink( - char __user *buffer, - int buflen, - const char *link) -{ - int len; - - len = PTR_ERR(link); - if (IS_ERR(link)) - goto out; - - len = strlen(link); - if (len > (unsigned) buflen) - len = buflen; - if (copy_to_user(buffer, link, len)) - len = -EFAULT; - out: - return len; -} - - int xfs_readlink_by_handle( struct file *parfilp, @@ -334,7 +308,7 @@ xfs_readlink_by_handle( error = -xfs_readlink(XFS_I(dentry->d_inode), link); if (error) goto out_kfree; - error = do_readlink(hreq->ohandle, olen, link); + error = readlink_copy(hreq->ohandle, olen, link); if (error) goto out_kfree; diff --git a/include/linux/fs.h b/include/linux/fs.h index 3b3670e97da6..29170bfaae54 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2566,7 +2566,7 @@ extern const struct file_operations generic_ro_fops; #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) -extern int vfs_readlink(struct dentry *, char __user *, int, const char *); +extern int readlink_copy(char __user *, int, const char *); extern int page_readlink(struct dentry *, char __user *, int); extern void *page_follow_link_light(struct dentry *, struct nameidata *); extern void page_put_link(struct dentry *, struct nameidata *, void *); From 6c1a7bfb6a5ac8cf057fd191acd8f28b23c60dd4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 14 Mar 2014 12:54:25 -0400 Subject: [PATCH 128/236] lustre: generic_readlink() is just fine there, TYVM... Signed-off-by: Al Viro (cherry picked from commit 4efcc9ffcd4fc53f1f7de539842cdffa1f8e5ecc) Signed-off-by: Alex Shi --- drivers/staging/lustre/lustre/llite/symlink.c | 23 +------------------ 1 file changed, 1 insertion(+), 22 deletions(-) diff --git a/drivers/staging/lustre/lustre/llite/symlink.c b/drivers/staging/lustre/lustre/llite/symlink.c index ab06891f7fc7..80d48b5ae247 100644 --- a/drivers/staging/lustre/lustre/llite/symlink.c +++ b/drivers/staging/lustre/lustre/llite/symlink.c @@ -115,27 +115,6 @@ failed: return rc; } -static int ll_readlink(struct dentry *dentry, char *buffer, int buflen) -{ - struct inode *inode = dentry->d_inode; - struct ptlrpc_request *request; - char *symname; - int rc; - - CDEBUG(D_VFSTRACE, "VFS Op\n"); - - ll_inode_size_lock(inode); - rc = ll_readlink_internal(inode, &request, &symname); - if (rc) - GOTO(out, rc); - - rc = vfs_readlink(dentry, buffer, buflen, symname); - out: - ptlrpc_req_finished(request); - ll_inode_size_unlock(inode); - return rc; -} - static void *ll_follow_link(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; @@ -175,7 +154,7 @@ static void ll_put_link(struct dentry *dentry, struct nameidata *nd, void *cooki } struct inode_operations ll_fast_symlink_inode_operations = { - .readlink = ll_readlink, + .readlink = generic_readlink, .setattr = ll_setattr, .follow_link = ll_follow_link, .put_link = ll_put_link, From 71732036259eac84c4b4add5b7503a0f01905bc4 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 24 Oct 2014 00:14:35 +0200 Subject: [PATCH 129/236] vfs: add i_op->dentry_open() Add a new inode operation i_op->dentry_open(). This is for stacked filesystems that want to return a struct file from a different filesystem. Signed-off-by: Miklos Szeredi (cherry picked from commit 4aa7c6346be395bdf776f82bbb2e3e2bc60bdd2b) Signed-off-by: Alex Shi --- Documentation/filesystems/Locking | 2 ++ Documentation/filesystems/vfs.txt | 7 +++++++ fs/namei.c | 9 ++++++--- fs/open.c | 23 +++++++++++++++++++++-- include/linux/fs.h | 4 ++++ 5 files changed, 40 insertions(+), 5 deletions(-) diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index f424e0e5b46b..821c8f4d359f 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -67,6 +67,7 @@ prototypes: struct file *, unsigned open_flag, umode_t create_mode, int *opened); int (*tmpfile) (struct inode *, struct dentry *, umode_t); + int (*dentry_open)(struct dentry *, struct file *, const struct cred *); locking rules: all may block @@ -96,6 +97,7 @@ fiemap: no update_time: no atomic_open: yes tmpfile: no +dentry_open: no Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on victim. diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 94eb86287bcb..5d06f2a19798 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -364,6 +364,7 @@ struct inode_operations { int (*atomic_open)(struct inode *, struct dentry *, struct file *, unsigned open_flag, umode_t create_mode, int *opened); int (*tmpfile) (struct inode *, struct dentry *, umode_t); + int (*dentry_open)(struct dentry *, struct file *, const struct cred *); }; Again, all methods are called without any locks being held, unless @@ -697,6 +698,12 @@ struct address_space_operations { but instead uses bmap to find out where the blocks in the file are and uses those addresses directly. + dentry_open: *WARNING: probably going away soon, do not use!* This is an + alternative to f_op->open(), the difference is that this method may open + a file not necessarily originating from the same filesystem as the one + i_op->open() was called on. It may be useful for stacking filesystems + which want to allow native I/O directly on underlying files. + invalidatepage: If a page has PagePrivate set, then invalidatepage will be called when part or all of the page is to be removed diff --git a/fs/namei.c b/fs/namei.c index ef28dcd3b719..b987228e6afb 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3037,9 +3037,12 @@ finish_open_created: error = may_open(&nd->path, acc_mode, open_flag); if (error) goto out; - file->f_path.mnt = nd->path.mnt; - error = finish_open(file, nd->path.dentry, NULL, opened); - if (error) { + + BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */ + error = vfs_open(&nd->path, file, current_cred()); + if (!error) { + *opened |= FILE_OPENED; + } else { if (error == -EOPENSTALE) goto stale_open; goto out; diff --git a/fs/open.c b/fs/open.c index 2ed7325f713e..c92c6ef8bf6d 100644 --- a/fs/open.c +++ b/fs/open.c @@ -822,8 +822,7 @@ struct file *dentry_open(const struct path *path, int flags, f = get_empty_filp(); if (!IS_ERR(f)) { f->f_flags = flags; - f->f_path = *path; - error = do_dentry_open(f, NULL, cred); + error = vfs_open(path, f, cred); if (!error) { /* from now on we need fput() to dispose of f */ error = open_check_o_direct(f); @@ -840,6 +839,26 @@ struct file *dentry_open(const struct path *path, int flags, } EXPORT_SYMBOL(dentry_open); +/** + * vfs_open - open the file at the given path + * @path: path to open + * @filp: newly allocated file with f_flag initialized + * @cred: credentials to use + */ +int vfs_open(const struct path *path, struct file *filp, + const struct cred *cred) +{ + struct inode *inode = path->dentry->d_inode; + + if (inode->i_op->dentry_open) + return inode->i_op->dentry_open(path->dentry, filp, cred); + else { + filp->f_path = *path; + return do_dentry_open(filp, NULL, cred); + } +} +EXPORT_SYMBOL(vfs_open); + static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op) { int lookup_flags = 0; diff --git a/include/linux/fs.h b/include/linux/fs.h index 29170bfaae54..ce38acf2acf0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1587,6 +1587,9 @@ struct inode_operations { umode_t create_mode, int *opened); int (*tmpfile) (struct inode *, struct dentry *, umode_t); int (*set_acl)(struct inode *, struct posix_acl *, int); + + /* WARNING: probably going away soon, do not use! */ + int (*dentry_open)(struct dentry *, struct file *, const struct cred *); } ____cacheline_aligned; ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, @@ -2081,6 +2084,7 @@ extern struct file *file_open_name(struct filename *, int, umode_t); extern struct file *filp_open(const char *, int, umode_t); extern struct file *file_open_root(struct dentry *, struct vfsmount *, const char *, int); +extern int vfs_open(const struct path *, struct file *, const struct cred *); extern struct file * dentry_open(const struct path *, int, const struct cred *); extern int filp_close(struct file *, fl_owner_t id); From 5c45fc6a9c14be17ae4d203d35fe82c6b4575de4 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 24 Oct 2014 00:14:35 +0200 Subject: [PATCH 130/236] vfs: export do_splice_direct() to modules Export do_splice_direct() to modules. Needed by overlay filesystem. Signed-off-by: Miklos Szeredi (cherry picked from commit 1c118596a7682912106c80007102ce0184c77780) Signed-off-by: Alex Shi --- fs/internal.h | 6 ------ fs/splice.c | 1 + include/linux/fs.h | 3 +++ 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/internal.h b/fs/internal.h index 465742407466..873cdf544d54 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -134,12 +134,6 @@ extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan, extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *); extern int rw_verify_area(int, struct file *, const loff_t *, size_t); -/* - * splice.c - */ -extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, - loff_t *opos, size_t len, unsigned int flags); - /* * pipe.c */ diff --git a/fs/splice.c b/fs/splice.c index 12028fa41def..ffb92b9d34ba 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1327,6 +1327,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, return ret; } +EXPORT_SYMBOL(do_splice_direct); static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, struct pipe_inode_info *opipe, diff --git a/include/linux/fs.h b/include/linux/fs.h index ce38acf2acf0..5bcbbf3a0f00 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2506,6 +2506,9 @@ extern ssize_t generic_file_splice_write(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, loff_t *, size_t len, unsigned int flags); +extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, + loff_t *opos, size_t len, unsigned int flags); + extern void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); From 96bd0c53e4204392144978e5a40f4699570be22b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 24 Oct 2014 00:14:35 +0200 Subject: [PATCH 131/236] vfs: export __inode_permission() to modules We need to be able to check inode permissions (but not filesystem implied permissions) for stackable filesystems. Expose this interface for overlayfs. Signed-off-by: Miklos Szeredi (cherry picked from commit bd5d08569cc379f8366663a61558a9ce17c2e460) Signed-off-by: Alex Shi --- fs/internal.h | 1 - fs/namei.c | 1 + include/linux/fs.h | 1 + 3 files changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/internal.h b/fs/internal.h index 873cdf544d54..dd41b12c13f1 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -42,7 +42,6 @@ extern void __init chrdev_init(void); /* * namei.c */ -extern int __inode_permission(struct inode *, int); extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *); extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); diff --git a/fs/namei.c b/fs/namei.c index b987228e6afb..56b30af6a7f6 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -415,6 +415,7 @@ int __inode_permission(struct inode *inode, int mask) return security_inode_permission(inode, mask); } +EXPORT_SYMBOL(__inode_permission); /** * sb_permission - Check superblock-level permissions diff --git a/include/linux/fs.h b/include/linux/fs.h index 5bcbbf3a0f00..82c78d6a2006 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2298,6 +2298,7 @@ extern sector_t bmap(struct inode *, sector_t); #endif extern int notify_change(struct dentry *, struct iattr *, struct inode **); extern int inode_permission(struct inode *, int); +extern int __inode_permission(struct inode *, int); extern int generic_permission(struct inode *, int); static inline bool execute_ok(struct inode *inode) From 574c7ef9787b3b3d0513830e40323c34c2f53a39 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 24 Oct 2014 00:14:36 +0200 Subject: [PATCH 132/236] vfs: introduce clone_private_mount() Overlayfs needs a private clone of the mount, so create a function for this and export to modules. Signed-off-by: Miklos Szeredi (cherry picked from commit c771d683a62e5d36bc46036f5c07f4f5bb7dda61) Signed-off-by: Alex Shi --- fs/namespace.c | 27 +++++++++++++++++++++++++++ include/linux/mount.h | 3 +++ 2 files changed, 30 insertions(+) diff --git a/fs/namespace.c b/fs/namespace.c index 039f3802d70e..c6533ce9cdee 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1586,6 +1586,33 @@ void drop_collected_mounts(struct vfsmount *mnt) namespace_unlock(); } +/** + * clone_private_mount - create a private clone of a path + * + * This creates a new vfsmount, which will be the clone of @path. The new will + * not be attached anywhere in the namespace and will be private (i.e. changes + * to the originating mount won't be propagated into this). + * + * Release with mntput(). + */ +struct vfsmount *clone_private_mount(struct path *path) +{ + struct mount *old_mnt = real_mount(path->mnt); + struct mount *new_mnt; + + if (IS_MNT_UNBINDABLE(old_mnt)) + return ERR_PTR(-EINVAL); + + down_read(&namespace_sem); + new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE); + up_read(&namespace_sem); + if (IS_ERR(new_mnt)) + return ERR_CAST(new_mnt); + + return &new_mnt->mnt; +} +EXPORT_SYMBOL_GPL(clone_private_mount); + int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, struct vfsmount *root) { diff --git a/include/linux/mount.h b/include/linux/mount.h index b0c1e6574e7f..fff78cb99fa1 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -81,6 +81,9 @@ extern void mnt_pin(struct vfsmount *mnt); extern void mnt_unpin(struct vfsmount *mnt); extern int __mnt_is_readonly(struct vfsmount *mnt); +struct path; +extern struct vfsmount *clone_private_mount(struct path *path); + struct file_system_type; extern struct vfsmount *vfs_kern_mount(struct file_system_type *type, int flags, const char *name, From ae311653282de97e927ff8c961a79be3ab259cb1 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 24 Oct 2014 00:14:36 +0200 Subject: [PATCH 133/236] vfs: export check_sticky() It's already duplicated in btrfs and about to be used in overlayfs too. Move the sticky bit check to an inline helper and call the out-of-line helper only in the unlikly case of the sticky bit being set. Signed-off-by: Miklos Szeredi (cherry picked from commit cbdf35bcb833bfd00f0925d7a9a33a21f41ea582) Signed-off-by: Alex Shi --- fs/btrfs/ioctl.c | 20 +------------------- fs/namei.c | 9 ++------- include/linux/fs.h | 9 +++++++++ 3 files changed, 12 insertions(+), 26 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0b72006aecbe..96b95e6f9f91 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -682,23 +682,6 @@ out: return ret; } -/* copy of check_sticky in fs/namei.c() -* It's inline, so penalty for filesystems that don't use sticky bit is -* minimal. -*/ -static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode) -{ - kuid_t fsuid = current_fsuid(); - - if (!(dir->i_mode & S_ISVTX)) - return 0; - if (uid_eq(inode->i_uid, fsuid)) - return 0; - if (uid_eq(dir->i_uid, fsuid)) - return 0; - return !capable(CAP_FOWNER); -} - /* copy of may_delete in fs/namei.c() * Check whether we can remove a link victim from directory dir, check * whether the type of victim is right. @@ -734,8 +717,7 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir) return error; if (IS_APPEND(dir)) return -EPERM; - if (btrfs_check_sticky(dir, victim->d_inode)|| - IS_APPEND(victim->d_inode)|| + if (check_sticky(dir, victim->d_inode) || IS_APPEND(victim->d_inode) || IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode)) return -EPERM; if (isdir) { diff --git a/fs/namei.c b/fs/namei.c index 56b30af6a7f6..1fd5d751b024 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2360,22 +2360,17 @@ kern_path_mountpoint(int dfd, const char *name, struct path *path, } EXPORT_SYMBOL(kern_path_mountpoint); -/* - * It's inline, so penalty for filesystems that don't use sticky bit is - * minimal. - */ -static inline int check_sticky(struct inode *dir, struct inode *inode) +int __check_sticky(struct inode *dir, struct inode *inode) { kuid_t fsuid = current_fsuid(); - if (!(dir->i_mode & S_ISVTX)) - return 0; if (uid_eq(inode->i_uid, fsuid)) return 0; if (uid_eq(dir->i_uid, fsuid)) return 0; return !capable_wrt_inode_uidgid(inode, CAP_FOWNER); } +EXPORT_SYMBOL(__check_sticky); /* * Check whether we can remove a link victim from directory dir, check diff --git a/include/linux/fs.h b/include/linux/fs.h index 82c78d6a2006..ddb9ab5d23f6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2300,6 +2300,7 @@ extern int notify_change(struct dentry *, struct iattr *, struct inode **); extern int inode_permission(struct inode *, int); extern int __inode_permission(struct inode *, int); extern int generic_permission(struct inode *, int); +extern int __check_sticky(struct inode *dir, struct inode *inode); static inline bool execute_ok(struct inode *inode) { @@ -2787,6 +2788,14 @@ static inline int is_sxid(umode_t mode) return (mode & S_ISUID) || ((mode & S_ISGID) && (mode & S_IXGRP)); } +static inline int check_sticky(struct inode *dir, struct inode *inode) +{ + if (!(dir->i_mode & S_ISVTX)) + return 0; + + return __check_sticky(dir, inode); +} + static inline void inode_has_no_xattr(struct inode *inode) { if (!is_sxid(inode->i_mode) && (inode->i_sb->s_flags & MS_NOSEC)) From 9354e064adb1c963fdec1d38dc43d48dee394e1e Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 24 Oct 2014 00:14:36 +0200 Subject: [PATCH 134/236] vfs: add whiteout support Whiteout isn't actually a new file type, but is represented as a char device (Linus's idea) with 0/0 device number. This has several advantages compared to introducing a new whiteout file type: - no userspace API changes (e.g. trivial to make backups of upper layer filesystem, without losing whiteouts) - no fs image format changes (you can boot an old kernel/fsck without whiteout support and things won't break) - implementation is trivial Signed-off-by: Miklos Szeredi (cherry picked from commit 787fb6bc9682ec7c05fb5d9561b57100fbc1cc41) Signed-off-by: Alex Shi --- fs/namei.c | 14 ++++++++++++++ include/linux/fs.h | 11 +++++++++++ 2 files changed, 25 insertions(+) diff --git a/fs/namei.c b/fs/namei.c index 1fd5d751b024..9a4443b6e300 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4308,6 +4308,20 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0); } +int vfs_whiteout(struct inode *dir, struct dentry *dentry) +{ + int error = may_create(dir, dentry); + if (error) + return error; + + if (!dir->i_op->mknod) + return -EPERM; + + return dir->i_op->mknod(dir, dentry, + S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); +} +EXPORT_SYMBOL(vfs_whiteout); + int readlink_copy(char __user *buffer, int buflen, const char *link) { int len = PTR_ERR(link); diff --git a/include/linux/fs.h b/include/linux/fs.h index ddb9ab5d23f6..18492c801c39 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -217,6 +217,13 @@ typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define ATTR_OPEN (1 << 15) /* Truncating from open(O_TRUNC) */ #define ATTR_TIMES_SET (1 << 16) +/* + * Whiteout is represented by a char device. The following constants define the + * mode and device number to use. + */ +#define WHITEOUT_MODE 0 +#define WHITEOUT_DEV 0 + /* * This is the Inode Attributes structure, used for notify_change(). It * uses the above definitions as flags, to know which values have changed. @@ -1461,6 +1468,7 @@ extern int vfs_link(struct dentry *, struct inode *, struct dentry *, struct ino extern int vfs_rmdir(struct inode *, struct dentry *); extern int vfs_unlink(struct inode *, struct dentry *, struct inode **); extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int); +extern int vfs_whiteout(struct inode *, struct dentry *); /* * VFS dentry helper functions. @@ -1687,6 +1695,9 @@ struct super_operations { #define IS_AUTOMOUNT(inode) ((inode)->i_flags & S_AUTOMOUNT) #define IS_NOSEC(inode) ((inode)->i_flags & S_NOSEC) +#define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \ + (inode)->i_rdev == WHITEOUT_DEV) + /* * Inode state bits. Protected by inode->i_lock * From 0e98d78deaf317c6524acf9c57eecdefbfdbe444 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 24 Oct 2014 00:14:37 +0200 Subject: [PATCH 135/236] vfs: add RENAME_WHITEOUT This adds a new RENAME_WHITEOUT flag. This flag makes rename() create a whiteout of source. The whiteout creation is atomic relative to the rename. Signed-off-by: Miklos Szeredi (cherry picked from commit 0d7a855526dd672e114aff2ac22b60fc6f155b08) Signed-off-by: Alex Shi --- fs/namei.c | 8 ++++++-- include/uapi/linux/fs.h | 1 + 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 9a4443b6e300..a047265e629e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4171,12 +4171,16 @@ SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, bool should_retry = false; int error; - if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; - if ((flags & RENAME_NOREPLACE) && (flags & RENAME_EXCHANGE)) + if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) && + (flags & RENAME_EXCHANGE)) return -EINVAL; + if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD)) + return -EPERM; + retry: from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags); if (IS_ERR(from)) { diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index ca1a11bb4443..3735fa0a6784 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -37,6 +37,7 @@ #define RENAME_NOREPLACE (1 << 0) /* Don't overwrite target */ #define RENAME_EXCHANGE (1 << 1) /* Exchange source and dest */ +#define RENAME_WHITEOUT (1 << 2) /* Whiteout source */ struct fstrim_range { __u64 start; From c5341097e1a69ba7f5cbc938b927489ad3bef1ee Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 27 Oct 2014 21:11:27 -0700 Subject: [PATCH 136/236] rcu: Provide counterpart to rcu_dereference() for non-RCU situations Although rcu_dereference() and friends can be used in situations where object lifetimes are being managed by something other than RCU, the resulting sparse and lockdep-RCU noise can be annoying. This commit therefore supplies a lockless_dereference(), which provides the protection for dereferences without the RCU-related debugging noise. Reported-by: Al Viro Signed-off-by: Paul E. McKenney Signed-off-by: Al Viro (cherry picked from commit 54ef6df3f3f1353d99c80c437259d317b2cd1cbd) Signed-off-by: Alex Shi --- include/linux/rcupdate.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 72bf3a01a4ee..a9a98ff6e339 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -554,6 +554,21 @@ static inline void rcu_preempt_sleep_check(void) */ #define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v) +/** + * lockless_dereference() - safely load a pointer for later dereference + * @p: The pointer to load + * + * Similar to rcu_dereference(), but for situations where the pointed-to + * object's lifetime is managed by something other than RCU. That + * "something other" might be reference counting or simple immortality. + */ +#define lockless_dereference(p) \ +({ \ + typeof(p) _________p1 = ACCESS_ONCE(p); \ + smp_read_barrier_depends(); /* Dependency order vs. p above. */ \ + (_________p1); \ +}) + /** * rcu_assign_pointer() - assign to RCU-protected pointer * @p: pointer to assign to From 5587227cc0f683fe4732c0a6813f96154254e96b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 24 Oct 2014 00:14:38 +0200 Subject: [PATCH 137/236] overlay filesystem Overlayfs allows one, usually read-write, directory tree to be overlaid onto another, read-only directory tree. All modifications go to the upper, writable layer. This type of mechanism is most often used for live CDs but there's a wide variety of other uses. The implementation differs from other "union filesystem" implementations in that after a file is opened all operations go directly to the underlying, lower or upper, filesystems. This simplifies the implementation and allows native performance in these cases. The dentry tree is duplicated from the underlying filesystems, this enables fast cached lookups without adding special support into the VFS. This uses slightly more memory than union mounts, but dentries are relatively small. Currently inodes are duplicated as well, but it is a possible optimization to share inodes for non-directories. Opening non directories results in the open forwarded to the underlying filesystem. This makes the behavior very similar to union mounts (with the same limitations vs. fchmod/fchown on O_RDONLY file descriptors). Usage: mount -t overlayfs overlayfs -olowerdir=/lower,upperdir=/upper/upper,workdir=/upper/work /overlay The following cotributions have been folded into this patch: Neil Brown : - minimal remount support - use correct seek function for directories - initialise is_real before use - rename ovl_fill_cache to ovl_dir_read Felix Fietkau : - fix a deadlock in ovl_dir_read_merged - fix a deadlock in ovl_remove_whiteouts Erez Zadok - fix cleanup after WARN_ON Sedat Dilek - fix up permission to confirm to new API Robin Dong - fix possible leak in ovl_new_inode - create new inode in ovl_link Andy Whitcroft - switch to __inode_permission() - copy up i_uid/i_gid from the underlying inode AV: - ovl_copy_up_locked() - dput(ERR_PTR(...)) on two failure exits - ovl_clear_empty() - one failure exit forgetting to do unlock_rename(), lack of check for udir being the parent of upper, dropping and regaining the lock on udir (which would require _another_ check for parent being right). - bogus d_drop() in copyup and rename [fix from your mail] - copyup/remove and copyup/rename races [fix from your mail] - ovl_dir_fsync() leaving ERR_PTR() in ->realfile - ovl_entry_free() is pointless - it's just a kfree_rcu() - fold ovl_do_lookup() into ovl_lookup() - manually assigning ->d_op is wrong. Just use ->s_d_op. [patches picked from Miklos]: * copyup/remove and copyup/rename races * bogus d_drop() in copyup and rename Also thanks to the following people for testing and reporting bugs: Jordi Pujol Andy Whitcroft Michal Suchanek Felix Fietkau Erez Zadok Randy Dunlap Signed-off-by: Miklos Szeredi (cherry picked from commit e9be9d5e76e34872f0c37d72e25bc27fe9e2c54c) Signed-off-by: Alex Shi --- fs/Kconfig | 1 + fs/Makefile | 1 + fs/overlayfs/Kconfig | 10 + fs/overlayfs/Makefile | 7 + fs/overlayfs/copy_up.c | 414 ++++++++++++++++++ fs/overlayfs/dir.c | 921 +++++++++++++++++++++++++++++++++++++++ fs/overlayfs/inode.c | 425 ++++++++++++++++++ fs/overlayfs/overlayfs.h | 191 ++++++++ fs/overlayfs/readdir.c | 587 +++++++++++++++++++++++++ fs/overlayfs/super.c | 727 ++++++++++++++++++++++++++++++ 10 files changed, 3284 insertions(+) create mode 100644 fs/overlayfs/Kconfig create mode 100644 fs/overlayfs/Makefile create mode 100644 fs/overlayfs/copy_up.c create mode 100644 fs/overlayfs/dir.c create mode 100644 fs/overlayfs/inode.c create mode 100644 fs/overlayfs/overlayfs.h create mode 100644 fs/overlayfs/readdir.c create mode 100644 fs/overlayfs/super.c diff --git a/fs/Kconfig b/fs/Kconfig index 7385e54be4b9..6b40fd8d719e 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -67,6 +67,7 @@ source "fs/quota/Kconfig" source "fs/autofs4/Kconfig" source "fs/fuse/Kconfig" +source "fs/overlayfs/Kconfig" menu "Caches" diff --git a/fs/Makefile b/fs/Makefile index 47ac07bb4acc..5eb9caf136b4 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -104,6 +104,7 @@ obj-$(CONFIG_QNX6FS_FS) += qnx6/ obj-$(CONFIG_AUTOFS4_FS) += autofs4/ obj-$(CONFIG_ADFS_FS) += adfs/ obj-$(CONFIG_FUSE_FS) += fuse/ +obj-$(CONFIG_OVERLAYFS_FS) += overlayfs/ obj-$(CONFIG_UDF_FS) += udf/ obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/ obj-$(CONFIG_OMFS_FS) += omfs/ diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig new file mode 100644 index 000000000000..e60125976873 --- /dev/null +++ b/fs/overlayfs/Kconfig @@ -0,0 +1,10 @@ +config OVERLAYFS_FS + tristate "Overlay filesystem support" + help + An overlay filesystem combines two filesystems - an 'upper' filesystem + and a 'lower' filesystem. When a name exists in both filesystems, the + object in the 'upper' filesystem is visible while the object in the + 'lower' filesystem is either hidden or, in the case of directories, + merged with the 'upper' object. + + For more information see Documentation/filesystems/overlayfs.txt diff --git a/fs/overlayfs/Makefile b/fs/overlayfs/Makefile new file mode 100644 index 000000000000..8f91889480d0 --- /dev/null +++ b/fs/overlayfs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the overlay filesystem. +# + +obj-$(CONFIG_OVERLAYFS_FS) += overlayfs.o + +overlayfs-objs := super.o inode.o dir.o readdir.o copy_up.o diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c new file mode 100644 index 000000000000..ea10a8719107 --- /dev/null +++ b/fs/overlayfs/copy_up.c @@ -0,0 +1,414 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "overlayfs.h" + +#define OVL_COPY_UP_CHUNK_SIZE (1 << 20) + +int ovl_copy_xattr(struct dentry *old, struct dentry *new) +{ + ssize_t list_size, size; + char *buf, *name, *value; + int error; + + if (!old->d_inode->i_op->getxattr || + !new->d_inode->i_op->getxattr) + return 0; + + list_size = vfs_listxattr(old, NULL, 0); + if (list_size <= 0) { + if (list_size == -EOPNOTSUPP) + return 0; + return list_size; + } + + buf = kzalloc(list_size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + error = -ENOMEM; + value = kmalloc(XATTR_SIZE_MAX, GFP_KERNEL); + if (!value) + goto out; + + list_size = vfs_listxattr(old, buf, list_size); + if (list_size <= 0) { + error = list_size; + goto out_free_value; + } + + for (name = buf; name < (buf + list_size); name += strlen(name) + 1) { + size = vfs_getxattr(old, name, value, XATTR_SIZE_MAX); + if (size <= 0) { + error = size; + goto out_free_value; + } + error = vfs_setxattr(new, name, value, size, 0); + if (error) + goto out_free_value; + } + +out_free_value: + kfree(value); +out: + kfree(buf); + return error; +} + +static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) +{ + struct file *old_file; + struct file *new_file; + loff_t old_pos = 0; + loff_t new_pos = 0; + int error = 0; + + if (len == 0) + return 0; + + old_file = ovl_path_open(old, O_RDONLY); + if (IS_ERR(old_file)) + return PTR_ERR(old_file); + + new_file = ovl_path_open(new, O_WRONLY); + if (IS_ERR(new_file)) { + error = PTR_ERR(new_file); + goto out_fput; + } + + /* FIXME: copy up sparse files efficiently */ + while (len) { + size_t this_len = OVL_COPY_UP_CHUNK_SIZE; + long bytes; + + if (len < this_len) + this_len = len; + + if (signal_pending_state(TASK_KILLABLE, current)) { + error = -EINTR; + break; + } + + bytes = do_splice_direct(old_file, &old_pos, + new_file, &new_pos, + this_len, SPLICE_F_MOVE); + if (bytes <= 0) { + error = bytes; + break; + } + WARN_ON(old_pos != new_pos); + + len -= bytes; + } + + fput(new_file); +out_fput: + fput(old_file); + return error; +} + +static char *ovl_read_symlink(struct dentry *realdentry) +{ + int res; + char *buf; + struct inode *inode = realdentry->d_inode; + mm_segment_t old_fs; + + res = -EINVAL; + if (!inode->i_op->readlink) + goto err; + + res = -ENOMEM; + buf = (char *) __get_free_page(GFP_KERNEL); + if (!buf) + goto err; + + old_fs = get_fs(); + set_fs(get_ds()); + /* The cast to a user pointer is valid due to the set_fs() */ + res = inode->i_op->readlink(realdentry, + (char __user *)buf, PAGE_SIZE - 1); + set_fs(old_fs); + if (res < 0) { + free_page((unsigned long) buf); + goto err; + } + buf[res] = '\0'; + + return buf; + +err: + return ERR_PTR(res); +} + +static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat) +{ + struct iattr attr = { + .ia_valid = + ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET, + .ia_atime = stat->atime, + .ia_mtime = stat->mtime, + }; + + return notify_change(upperdentry, &attr, NULL); +} + +int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat) +{ + int err = 0; + + if (!S_ISLNK(stat->mode)) { + struct iattr attr = { + .ia_valid = ATTR_MODE, + .ia_mode = stat->mode, + }; + err = notify_change(upperdentry, &attr, NULL); + } + if (!err) { + struct iattr attr = { + .ia_valid = ATTR_UID | ATTR_GID, + .ia_uid = stat->uid, + .ia_gid = stat->gid, + }; + err = notify_change(upperdentry, &attr, NULL); + } + if (!err) + ovl_set_timestamps(upperdentry, stat); + + return err; + +} + +static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, + struct dentry *dentry, struct path *lowerpath, + struct kstat *stat, struct iattr *attr, + const char *link) +{ + struct inode *wdir = workdir->d_inode; + struct inode *udir = upperdir->d_inode; + struct dentry *newdentry = NULL; + struct dentry *upper = NULL; + umode_t mode = stat->mode; + int err; + + newdentry = ovl_lookup_temp(workdir, dentry); + err = PTR_ERR(newdentry); + if (IS_ERR(newdentry)) + goto out; + + upper = lookup_one_len(dentry->d_name.name, upperdir, + dentry->d_name.len); + err = PTR_ERR(upper); + if (IS_ERR(upper)) + goto out1; + + /* Can't properly set mode on creation because of the umask */ + stat->mode &= S_IFMT; + err = ovl_create_real(wdir, newdentry, stat, link, NULL, true); + stat->mode = mode; + if (err) + goto out2; + + if (S_ISREG(stat->mode)) { + struct path upperpath; + ovl_path_upper(dentry, &upperpath); + BUG_ON(upperpath.dentry != NULL); + upperpath.dentry = newdentry; + + err = ovl_copy_up_data(lowerpath, &upperpath, stat->size); + if (err) + goto out_cleanup; + } + + err = ovl_copy_xattr(lowerpath->dentry, newdentry); + if (err) + goto out_cleanup; + + mutex_lock(&newdentry->d_inode->i_mutex); + err = ovl_set_attr(newdentry, stat); + if (!err && attr) + err = notify_change(newdentry, attr, NULL); + mutex_unlock(&newdentry->d_inode->i_mutex); + if (err) + goto out_cleanup; + + err = ovl_do_rename(wdir, newdentry, udir, upper, 0); + if (err) + goto out_cleanup; + + ovl_dentry_update(dentry, newdentry); + newdentry = NULL; + + /* + * Non-directores become opaque when copied up. + */ + if (!S_ISDIR(stat->mode)) + ovl_dentry_set_opaque(dentry, true); +out2: + dput(upper); +out1: + dput(newdentry); +out: + return err; + +out_cleanup: + ovl_cleanup(wdir, newdentry); + goto out; +} + +/* + * Copy up a single dentry + * + * Directory renames only allowed on "pure upper" (already created on + * upper filesystem, never copied up). Directories which are on lower or + * are merged may not be renamed. For these -EXDEV is returned and + * userspace has to deal with it. This means, when copying up a + * directory we can rely on it and ancestors being stable. + * + * Non-directory renames start with copy up of source if necessary. The + * actual rename will only proceed once the copy up was successful. Copy + * up uses upper parent i_mutex for exclusion. Since rename can change + * d_parent it is possible that the copy up will lock the old parent. At + * that point the file will have already been copied up anyway. + */ +int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, + struct path *lowerpath, struct kstat *stat, + struct iattr *attr) +{ + struct dentry *workdir = ovl_workdir(dentry); + int err; + struct kstat pstat; + struct path parentpath; + struct dentry *upperdir; + struct dentry *upperdentry; + const struct cred *old_cred; + struct cred *override_cred; + char *link = NULL; + + ovl_path_upper(parent, &parentpath); + upperdir = parentpath.dentry; + + err = vfs_getattr(&parentpath, &pstat); + if (err) + return err; + + if (S_ISLNK(stat->mode)) { + link = ovl_read_symlink(lowerpath->dentry); + if (IS_ERR(link)) + return PTR_ERR(link); + } + + err = -ENOMEM; + override_cred = prepare_creds(); + if (!override_cred) + goto out_free_link; + + override_cred->fsuid = stat->uid; + override_cred->fsgid = stat->gid; + /* + * CAP_SYS_ADMIN for copying up extended attributes + * CAP_DAC_OVERRIDE for create + * CAP_FOWNER for chmod, timestamp update + * CAP_FSETID for chmod + * CAP_CHOWN for chown + * CAP_MKNOD for mknod + */ + cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + cap_raise(override_cred->cap_effective, CAP_FOWNER); + cap_raise(override_cred->cap_effective, CAP_FSETID); + cap_raise(override_cred->cap_effective, CAP_CHOWN); + cap_raise(override_cred->cap_effective, CAP_MKNOD); + old_cred = override_creds(override_cred); + + err = -EIO; + if (lock_rename(workdir, upperdir) != NULL) { + pr_err("overlayfs: failed to lock workdir+upperdir\n"); + goto out_unlock; + } + upperdentry = ovl_dentry_upper(dentry); + if (upperdentry) { + unlock_rename(workdir, upperdir); + err = 0; + /* Raced with another copy-up? Do the setattr here */ + if (attr) { + mutex_lock(&upperdentry->d_inode->i_mutex); + err = notify_change(upperdentry, attr, NULL); + mutex_unlock(&upperdentry->d_inode->i_mutex); + } + goto out_put_cred; + } + + err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath, + stat, attr, link); + if (!err) { + /* Restore timestamps on parent (best effort) */ + ovl_set_timestamps(upperdir, &pstat); + } +out_unlock: + unlock_rename(workdir, upperdir); +out_put_cred: + revert_creds(old_cred); + put_cred(override_cred); + +out_free_link: + if (link) + free_page((unsigned long) link); + + return err; +} + +int ovl_copy_up(struct dentry *dentry) +{ + int err; + + err = 0; + while (!err) { + struct dentry *next; + struct dentry *parent; + struct path lowerpath; + struct kstat stat; + enum ovl_path_type type = ovl_path_type(dentry); + + if (type != OVL_PATH_LOWER) + break; + + next = dget(dentry); + /* find the topmost dentry not yet copied up */ + for (;;) { + parent = dget_parent(next); + + type = ovl_path_type(parent); + if (type != OVL_PATH_LOWER) + break; + + dput(next); + next = parent; + } + + ovl_path_lower(next, &lowerpath); + err = vfs_getattr(&lowerpath, &stat); + if (!err) + err = ovl_copy_up_one(parent, next, &lowerpath, &stat, NULL); + + dput(parent); + dput(next); + } + + return err; +} diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c new file mode 100644 index 000000000000..15cd91ad9940 --- /dev/null +++ b/fs/overlayfs/dir.c @@ -0,0 +1,921 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include "overlayfs.h" + +void ovl_cleanup(struct inode *wdir, struct dentry *wdentry) +{ + int err; + + dget(wdentry); + if (S_ISDIR(wdentry->d_inode->i_mode)) + err = ovl_do_rmdir(wdir, wdentry); + else + err = ovl_do_unlink(wdir, wdentry); + dput(wdentry); + + if (err) { + pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n", + wdentry, err); + } +} + +struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry) +{ + struct dentry *temp; + char name[20]; + + snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry); + + temp = lookup_one_len(name, workdir, strlen(name)); + if (!IS_ERR(temp) && temp->d_inode) { + pr_err("overlayfs: workdir/%s already exists\n", name); + dput(temp); + temp = ERR_PTR(-EIO); + } + + return temp; +} + +/* caller holds i_mutex on workdir */ +static struct dentry *ovl_whiteout(struct dentry *workdir, + struct dentry *dentry) +{ + int err; + struct dentry *whiteout; + struct inode *wdir = workdir->d_inode; + + whiteout = ovl_lookup_temp(workdir, dentry); + if (IS_ERR(whiteout)) + return whiteout; + + err = ovl_do_whiteout(wdir, whiteout); + if (err) { + dput(whiteout); + whiteout = ERR_PTR(err); + } + + return whiteout; +} + +int ovl_create_real(struct inode *dir, struct dentry *newdentry, + struct kstat *stat, const char *link, + struct dentry *hardlink, bool debug) +{ + int err; + + if (newdentry->d_inode) + return -ESTALE; + + if (hardlink) { + err = ovl_do_link(hardlink, dir, newdentry, debug); + } else { + switch (stat->mode & S_IFMT) { + case S_IFREG: + err = ovl_do_create(dir, newdentry, stat->mode, debug); + break; + + case S_IFDIR: + err = ovl_do_mkdir(dir, newdentry, stat->mode, debug); + break; + + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + err = ovl_do_mknod(dir, newdentry, + stat->mode, stat->rdev, debug); + break; + + case S_IFLNK: + err = ovl_do_symlink(dir, newdentry, link, debug); + break; + + default: + err = -EPERM; + } + } + if (!err && WARN_ON(!newdentry->d_inode)) { + /* + * Not quite sure if non-instantiated dentry is legal or not. + * VFS doesn't seem to care so check and warn here. + */ + err = -ENOENT; + } + return err; +} + +static int ovl_set_opaque(struct dentry *upperdentry) +{ + return ovl_do_setxattr(upperdentry, ovl_opaque_xattr, "y", 1, 0); +} + +static void ovl_remove_opaque(struct dentry *upperdentry) +{ + int err; + + err = ovl_do_removexattr(upperdentry, ovl_opaque_xattr); + if (err) { + pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n", + upperdentry->d_name.name, err); + } +} + +static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + int err; + enum ovl_path_type type; + struct path realpath; + + type = ovl_path_real(dentry, &realpath); + err = vfs_getattr(&realpath, stat); + if (err) + return err; + + stat->dev = dentry->d_sb->s_dev; + stat->ino = dentry->d_inode->i_ino; + + /* + * It's probably not worth it to count subdirs to get the + * correct link count. nlink=1 seems to pacify 'find' and + * other utilities. + */ + if (type == OVL_PATH_MERGE) + stat->nlink = 1; + + return 0; +} + +static int ovl_create_upper(struct dentry *dentry, struct inode *inode, + struct kstat *stat, const char *link, + struct dentry *hardlink) +{ + struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct inode *udir = upperdir->d_inode; + struct dentry *newdentry; + int err; + + mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT); + newdentry = lookup_one_len(dentry->d_name.name, upperdir, + dentry->d_name.len); + err = PTR_ERR(newdentry); + if (IS_ERR(newdentry)) + goto out_unlock; + err = ovl_create_real(udir, newdentry, stat, link, hardlink, false); + if (err) + goto out_dput; + + ovl_dentry_version_inc(dentry->d_parent); + ovl_dentry_update(dentry, newdentry); + ovl_copyattr(newdentry->d_inode, inode); + d_instantiate(dentry, inode); + newdentry = NULL; +out_dput: + dput(newdentry); +out_unlock: + mutex_unlock(&udir->i_mutex); + return err; +} + +static int ovl_lock_rename_workdir(struct dentry *workdir, + struct dentry *upperdir) +{ + /* Workdir should not be the same as upperdir */ + if (workdir == upperdir) + goto err; + + /* Workdir should not be subdir of upperdir and vice versa */ + if (lock_rename(workdir, upperdir) != NULL) + goto err_unlock; + + return 0; + +err_unlock: + unlock_rename(workdir, upperdir); +err: + pr_err("overlayfs: failed to lock workdir+upperdir\n"); + return -EIO; +} + +static struct dentry *ovl_clear_empty(struct dentry *dentry, + struct list_head *list) +{ + struct dentry *workdir = ovl_workdir(dentry); + struct inode *wdir = workdir->d_inode; + struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct inode *udir = upperdir->d_inode; + struct path upperpath; + struct dentry *upper; + struct dentry *opaquedir; + struct kstat stat; + int err; + + err = ovl_lock_rename_workdir(workdir, upperdir); + if (err) + goto out; + + ovl_path_upper(dentry, &upperpath); + err = vfs_getattr(&upperpath, &stat); + if (err) + goto out_unlock; + + err = -ESTALE; + if (!S_ISDIR(stat.mode)) + goto out_unlock; + upper = upperpath.dentry; + if (upper->d_parent->d_inode != udir) + goto out_unlock; + + opaquedir = ovl_lookup_temp(workdir, dentry); + err = PTR_ERR(opaquedir); + if (IS_ERR(opaquedir)) + goto out_unlock; + + err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true); + if (err) + goto out_dput; + + err = ovl_copy_xattr(upper, opaquedir); + if (err) + goto out_cleanup; + + err = ovl_set_opaque(opaquedir); + if (err) + goto out_cleanup; + + mutex_lock(&opaquedir->d_inode->i_mutex); + err = ovl_set_attr(opaquedir, &stat); + mutex_unlock(&opaquedir->d_inode->i_mutex); + if (err) + goto out_cleanup; + + err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE); + if (err) + goto out_cleanup; + + ovl_cleanup_whiteouts(upper, list); + ovl_cleanup(wdir, upper); + unlock_rename(workdir, upperdir); + + /* dentry's upper doesn't match now, get rid of it */ + d_drop(dentry); + + return opaquedir; + +out_cleanup: + ovl_cleanup(wdir, opaquedir); +out_dput: + dput(opaquedir); +out_unlock: + unlock_rename(workdir, upperdir); +out: + return ERR_PTR(err); +} + +static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry, + enum ovl_path_type type) +{ + int err; + struct dentry *ret = NULL; + LIST_HEAD(list); + + err = ovl_check_empty_dir(dentry, &list); + if (err) + ret = ERR_PTR(err); + else if (type == OVL_PATH_MERGE) + ret = ovl_clear_empty(dentry, &list); + + ovl_cache_free(&list); + + return ret; +} + +static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, + struct kstat *stat, const char *link, + struct dentry *hardlink) +{ + struct dentry *workdir = ovl_workdir(dentry); + struct inode *wdir = workdir->d_inode; + struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct inode *udir = upperdir->d_inode; + struct dentry *upper; + struct dentry *newdentry; + int err; + + err = ovl_lock_rename_workdir(workdir, upperdir); + if (err) + goto out; + + newdentry = ovl_lookup_temp(workdir, dentry); + err = PTR_ERR(newdentry); + if (IS_ERR(newdentry)) + goto out_unlock; + + upper = lookup_one_len(dentry->d_name.name, upperdir, + dentry->d_name.len); + err = PTR_ERR(upper); + if (IS_ERR(upper)) + goto out_dput; + + err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true); + if (err) + goto out_dput2; + + if (S_ISDIR(stat->mode)) { + err = ovl_set_opaque(newdentry); + if (err) + goto out_cleanup; + + err = ovl_do_rename(wdir, newdentry, udir, upper, + RENAME_EXCHANGE); + if (err) + goto out_cleanup; + + ovl_cleanup(wdir, upper); + } else { + err = ovl_do_rename(wdir, newdentry, udir, upper, 0); + if (err) + goto out_cleanup; + } + ovl_dentry_version_inc(dentry->d_parent); + ovl_dentry_update(dentry, newdentry); + ovl_copyattr(newdentry->d_inode, inode); + d_instantiate(dentry, inode); + newdentry = NULL; +out_dput2: + dput(upper); +out_dput: + dput(newdentry); +out_unlock: + unlock_rename(workdir, upperdir); +out: + return err; + +out_cleanup: + ovl_cleanup(wdir, newdentry); + goto out_dput2; +} + +static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev, + const char *link, struct dentry *hardlink) +{ + int err; + struct inode *inode; + struct kstat stat = { + .mode = mode, + .rdev = rdev, + }; + + err = -ENOMEM; + inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata); + if (!inode) + goto out; + + err = ovl_copy_up(dentry->d_parent); + if (err) + goto out_iput; + + if (!ovl_dentry_is_opaque(dentry)) { + err = ovl_create_upper(dentry, inode, &stat, link, hardlink); + } else { + const struct cred *old_cred; + struct cred *override_cred; + + err = -ENOMEM; + override_cred = prepare_creds(); + if (!override_cred) + goto out_iput; + + /* + * CAP_SYS_ADMIN for setting opaque xattr + * CAP_DAC_OVERRIDE for create in workdir, rename + * CAP_FOWNER for removing whiteout from sticky dir + */ + cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + cap_raise(override_cred->cap_effective, CAP_FOWNER); + old_cred = override_creds(override_cred); + + err = ovl_create_over_whiteout(dentry, inode, &stat, link, + hardlink); + + revert_creds(old_cred); + put_cred(override_cred); + } + + if (!err) + inode = NULL; +out_iput: + iput(inode); +out: + return err; +} + +static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev, + const char *link) +{ + int err; + + err = ovl_want_write(dentry); + if (!err) { + err = ovl_create_or_link(dentry, mode, rdev, link, NULL); + ovl_drop_write(dentry); + } + + return err; +} + +static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) +{ + return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL); +} + +static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL); +} + +static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, + dev_t rdev) +{ + /* Don't allow creation of "whiteout" on overlay */ + if (S_ISCHR(mode) && rdev == WHITEOUT_DEV) + return -EPERM; + + return ovl_create_object(dentry, mode, rdev, NULL); +} + +static int ovl_symlink(struct inode *dir, struct dentry *dentry, + const char *link) +{ + return ovl_create_object(dentry, S_IFLNK, 0, link); +} + +static int ovl_link(struct dentry *old, struct inode *newdir, + struct dentry *new) +{ + int err; + struct dentry *upper; + + err = ovl_want_write(old); + if (err) + goto out; + + err = ovl_copy_up(old); + if (err) + goto out_drop_write; + + upper = ovl_dentry_upper(old); + err = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL, upper); + +out_drop_write: + ovl_drop_write(old); +out: + return err; +} + +static int ovl_remove_and_whiteout(struct dentry *dentry, + enum ovl_path_type type, bool is_dir) +{ + struct dentry *workdir = ovl_workdir(dentry); + struct inode *wdir = workdir->d_inode; + struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct inode *udir = upperdir->d_inode; + struct dentry *whiteout; + struct dentry *upper; + struct dentry *opaquedir = NULL; + int err; + + if (is_dir) { + opaquedir = ovl_check_empty_and_clear(dentry, type); + err = PTR_ERR(opaquedir); + if (IS_ERR(opaquedir)) + goto out; + } + + err = ovl_lock_rename_workdir(workdir, upperdir); + if (err) + goto out_dput; + + whiteout = ovl_whiteout(workdir, dentry); + err = PTR_ERR(whiteout); + if (IS_ERR(whiteout)) + goto out_unlock; + + if (type == OVL_PATH_LOWER) { + upper = lookup_one_len(dentry->d_name.name, upperdir, + dentry->d_name.len); + err = PTR_ERR(upper); + if (IS_ERR(upper)) + goto kill_whiteout; + + err = ovl_do_rename(wdir, whiteout, udir, upper, 0); + dput(upper); + if (err) + goto kill_whiteout; + } else { + int flags = 0; + + upper = ovl_dentry_upper(dentry); + if (opaquedir) + upper = opaquedir; + err = -ESTALE; + if (upper->d_parent != upperdir) + goto kill_whiteout; + + if (is_dir) + flags |= RENAME_EXCHANGE; + + err = ovl_do_rename(wdir, whiteout, udir, upper, flags); + if (err) + goto kill_whiteout; + + if (is_dir) + ovl_cleanup(wdir, upper); + } + ovl_dentry_version_inc(dentry->d_parent); +out_d_drop: + d_drop(dentry); + dput(whiteout); +out_unlock: + unlock_rename(workdir, upperdir); +out_dput: + dput(opaquedir); +out: + return err; + +kill_whiteout: + ovl_cleanup(wdir, whiteout); + goto out_d_drop; +} + +static int ovl_remove_upper(struct dentry *dentry, bool is_dir) +{ + struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct inode *dir = upperdir->d_inode; + struct dentry *upper = ovl_dentry_upper(dentry); + int err; + + mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + err = -ESTALE; + if (upper->d_parent == upperdir) { + /* Don't let d_delete() think it can reset d_inode */ + dget(upper); + if (is_dir) + err = vfs_rmdir(dir, upper); + else + err = vfs_unlink(dir, upper, NULL); + dput(upper); + ovl_dentry_version_inc(dentry->d_parent); + } + + /* + * Keeping this dentry hashed would mean having to release + * upperpath/lowerpath, which could only be done if we are the + * sole user of this dentry. Too tricky... Just unhash for + * now. + */ + d_drop(dentry); + mutex_unlock(&dir->i_mutex); + + return err; +} + +static inline int ovl_check_sticky(struct dentry *dentry) +{ + struct inode *dir = ovl_dentry_real(dentry->d_parent)->d_inode; + struct inode *inode = ovl_dentry_real(dentry)->d_inode; + + if (check_sticky(dir, inode)) + return -EPERM; + + return 0; +} + +static int ovl_do_remove(struct dentry *dentry, bool is_dir) +{ + enum ovl_path_type type; + int err; + + err = ovl_check_sticky(dentry); + if (err) + goto out; + + err = ovl_want_write(dentry); + if (err) + goto out; + + err = ovl_copy_up(dentry->d_parent); + if (err) + goto out_drop_write; + + type = ovl_path_type(dentry); + if (type == OVL_PATH_PURE_UPPER) { + err = ovl_remove_upper(dentry, is_dir); + } else { + const struct cred *old_cred; + struct cred *override_cred; + + err = -ENOMEM; + override_cred = prepare_creds(); + if (!override_cred) + goto out_drop_write; + + /* + * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir + * CAP_DAC_OVERRIDE for create in workdir, rename + * CAP_FOWNER for removing whiteout from sticky dir + * CAP_FSETID for chmod of opaque dir + * CAP_CHOWN for chown of opaque dir + */ + cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + cap_raise(override_cred->cap_effective, CAP_FOWNER); + cap_raise(override_cred->cap_effective, CAP_FSETID); + cap_raise(override_cred->cap_effective, CAP_CHOWN); + old_cred = override_creds(override_cred); + + err = ovl_remove_and_whiteout(dentry, type, is_dir); + + revert_creds(old_cred); + put_cred(override_cred); + } +out_drop_write: + ovl_drop_write(dentry); +out: + return err; +} + +static int ovl_unlink(struct inode *dir, struct dentry *dentry) +{ + return ovl_do_remove(dentry, false); +} + +static int ovl_rmdir(struct inode *dir, struct dentry *dentry) +{ + return ovl_do_remove(dentry, true); +} + +static int ovl_rename2(struct inode *olddir, struct dentry *old, + struct inode *newdir, struct dentry *new, + unsigned int flags) +{ + int err; + enum ovl_path_type old_type; + enum ovl_path_type new_type; + struct dentry *old_upperdir; + struct dentry *new_upperdir; + struct dentry *olddentry; + struct dentry *newdentry; + struct dentry *trap; + bool old_opaque; + bool new_opaque; + bool new_create = false; + bool cleanup_whiteout = false; + bool overwrite = !(flags & RENAME_EXCHANGE); + bool is_dir = S_ISDIR(old->d_inode->i_mode); + bool new_is_dir = false; + struct dentry *opaquedir = NULL; + const struct cred *old_cred = NULL; + struct cred *override_cred = NULL; + + err = -EINVAL; + if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE)) + goto out; + + flags &= ~RENAME_NOREPLACE; + + err = ovl_check_sticky(old); + if (err) + goto out; + + /* Don't copy up directory trees */ + old_type = ovl_path_type(old); + err = -EXDEV; + if ((old_type == OVL_PATH_LOWER || old_type == OVL_PATH_MERGE) && is_dir) + goto out; + + if (new->d_inode) { + err = ovl_check_sticky(new); + if (err) + goto out; + + if (S_ISDIR(new->d_inode->i_mode)) + new_is_dir = true; + + new_type = ovl_path_type(new); + err = -EXDEV; + if (!overwrite && (new_type == OVL_PATH_LOWER || new_type == OVL_PATH_MERGE) && new_is_dir) + goto out; + + err = 0; + if (new_type == OVL_PATH_LOWER && old_type == OVL_PATH_LOWER) { + if (ovl_dentry_lower(old)->d_inode == + ovl_dentry_lower(new)->d_inode) + goto out; + } + if (new_type != OVL_PATH_LOWER && old_type != OVL_PATH_LOWER) { + if (ovl_dentry_upper(old)->d_inode == + ovl_dentry_upper(new)->d_inode) + goto out; + } + } else { + if (ovl_dentry_is_opaque(new)) + new_type = OVL_PATH_UPPER; + else + new_type = OVL_PATH_PURE_UPPER; + } + + err = ovl_want_write(old); + if (err) + goto out; + + err = ovl_copy_up(old); + if (err) + goto out_drop_write; + + err = ovl_copy_up(new->d_parent); + if (err) + goto out_drop_write; + if (!overwrite) { + err = ovl_copy_up(new); + if (err) + goto out_drop_write; + } + + old_opaque = old_type != OVL_PATH_PURE_UPPER; + new_opaque = new_type != OVL_PATH_PURE_UPPER; + + if (old_opaque || new_opaque) { + err = -ENOMEM; + override_cred = prepare_creds(); + if (!override_cred) + goto out_drop_write; + + /* + * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir + * CAP_DAC_OVERRIDE for create in workdir + * CAP_FOWNER for removing whiteout from sticky dir + * CAP_FSETID for chmod of opaque dir + * CAP_CHOWN for chown of opaque dir + */ + cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + cap_raise(override_cred->cap_effective, CAP_FOWNER); + cap_raise(override_cred->cap_effective, CAP_FSETID); + cap_raise(override_cred->cap_effective, CAP_CHOWN); + old_cred = override_creds(override_cred); + } + + if (overwrite && (new_type == OVL_PATH_LOWER || new_type == OVL_PATH_MERGE) && new_is_dir) { + opaquedir = ovl_check_empty_and_clear(new, new_type); + err = PTR_ERR(opaquedir); + if (IS_ERR(opaquedir)) { + opaquedir = NULL; + goto out_revert_creds; + } + } + + if (overwrite) { + if (old_opaque) { + if (new->d_inode || !new_opaque) { + /* Whiteout source */ + flags |= RENAME_WHITEOUT; + } else { + /* Switch whiteouts */ + flags |= RENAME_EXCHANGE; + } + } else if (is_dir && !new->d_inode && new_opaque) { + flags |= RENAME_EXCHANGE; + cleanup_whiteout = true; + } + } + + old_upperdir = ovl_dentry_upper(old->d_parent); + new_upperdir = ovl_dentry_upper(new->d_parent); + + trap = lock_rename(new_upperdir, old_upperdir); + + olddentry = ovl_dentry_upper(old); + newdentry = ovl_dentry_upper(new); + if (newdentry) { + if (opaquedir) { + newdentry = opaquedir; + opaquedir = NULL; + } else { + dget(newdentry); + } + } else { + new_create = true; + newdentry = lookup_one_len(new->d_name.name, new_upperdir, + new->d_name.len); + err = PTR_ERR(newdentry); + if (IS_ERR(newdentry)) + goto out_unlock; + } + + err = -ESTALE; + if (olddentry->d_parent != old_upperdir) + goto out_dput; + if (newdentry->d_parent != new_upperdir) + goto out_dput; + if (olddentry == trap) + goto out_dput; + if (newdentry == trap) + goto out_dput; + + if (is_dir && !old_opaque && new_opaque) { + err = ovl_set_opaque(olddentry); + if (err) + goto out_dput; + } + if (!overwrite && new_is_dir && old_opaque && !new_opaque) { + err = ovl_set_opaque(newdentry); + if (err) + goto out_dput; + } + + if (old_opaque || new_opaque) { + err = ovl_do_rename(old_upperdir->d_inode, olddentry, + new_upperdir->d_inode, newdentry, + flags); + } else { + /* No debug for the plain case */ + BUG_ON(flags & ~RENAME_EXCHANGE); + err = vfs_rename(old_upperdir->d_inode, olddentry, + new_upperdir->d_inode, newdentry, + NULL, flags); + } + + if (err) { + if (is_dir && !old_opaque && new_opaque) + ovl_remove_opaque(olddentry); + if (!overwrite && new_is_dir && old_opaque && !new_opaque) + ovl_remove_opaque(newdentry); + goto out_dput; + } + + if (is_dir && old_opaque && !new_opaque) + ovl_remove_opaque(olddentry); + if (!overwrite && new_is_dir && !old_opaque && new_opaque) + ovl_remove_opaque(newdentry); + + if (old_opaque != new_opaque) { + ovl_dentry_set_opaque(old, new_opaque); + if (!overwrite) + ovl_dentry_set_opaque(new, old_opaque); + } + + if (cleanup_whiteout) + ovl_cleanup(old_upperdir->d_inode, newdentry); + + ovl_dentry_version_inc(old->d_parent); + ovl_dentry_version_inc(new->d_parent); + +out_dput: + dput(newdentry); +out_unlock: + unlock_rename(new_upperdir, old_upperdir); +out_revert_creds: + if (old_opaque || new_opaque) { + revert_creds(old_cred); + put_cred(override_cred); + } +out_drop_write: + ovl_drop_write(old); +out: + dput(opaquedir); + return err; +} + +const struct inode_operations ovl_dir_inode_operations = { + .lookup = ovl_lookup, + .mkdir = ovl_mkdir, + .symlink = ovl_symlink, + .unlink = ovl_unlink, + .rmdir = ovl_rmdir, + .rename2 = ovl_rename2, + .link = ovl_link, + .setattr = ovl_setattr, + .create = ovl_create, + .mknod = ovl_mknod, + .permission = ovl_permission, + .getattr = ovl_dir_getattr, + .setxattr = ovl_setxattr, + .getxattr = ovl_getxattr, + .listxattr = ovl_listxattr, + .removexattr = ovl_removexattr, +}; diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c new file mode 100644 index 000000000000..af2d18c9fcee --- /dev/null +++ b/fs/overlayfs/inode.c @@ -0,0 +1,425 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include +#include +#include +#include "overlayfs.h" + +static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr, + bool no_data) +{ + int err; + struct dentry *parent; + struct kstat stat; + struct path lowerpath; + + parent = dget_parent(dentry); + err = ovl_copy_up(parent); + if (err) + goto out_dput_parent; + + ovl_path_lower(dentry, &lowerpath); + err = vfs_getattr(&lowerpath, &stat); + if (err) + goto out_dput_parent; + + if (no_data) + stat.size = 0; + + err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat, attr); + +out_dput_parent: + dput(parent); + return err; +} + +int ovl_setattr(struct dentry *dentry, struct iattr *attr) +{ + int err; + struct dentry *upperdentry; + + err = ovl_want_write(dentry); + if (err) + goto out; + + upperdentry = ovl_dentry_upper(dentry); + if (upperdentry) { + mutex_lock(&upperdentry->d_inode->i_mutex); + err = notify_change(upperdentry, attr, NULL); + mutex_unlock(&upperdentry->d_inode->i_mutex); + } else { + err = ovl_copy_up_last(dentry, attr, false); + } + ovl_drop_write(dentry); +out: + return err; +} + +static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct path realpath; + + ovl_path_real(dentry, &realpath); + return vfs_getattr(&realpath, stat); +} + +int ovl_permission(struct inode *inode, int mask) +{ + struct ovl_entry *oe; + struct dentry *alias = NULL; + struct inode *realinode; + struct dentry *realdentry; + bool is_upper; + int err; + + if (S_ISDIR(inode->i_mode)) { + oe = inode->i_private; + } else if (mask & MAY_NOT_BLOCK) { + return -ECHILD; + } else { + /* + * For non-directories find an alias and get the info + * from there. + */ + alias = d_find_any_alias(inode); + if (WARN_ON(!alias)) + return -ENOENT; + + oe = alias->d_fsdata; + } + + realdentry = ovl_entry_real(oe, &is_upper); + + /* Careful in RCU walk mode */ + realinode = ACCESS_ONCE(realdentry->d_inode); + if (!realinode) { + WARN_ON(!(mask & MAY_NOT_BLOCK)); + err = -ENOENT; + goto out_dput; + } + + if (mask & MAY_WRITE) { + umode_t mode = realinode->i_mode; + + /* + * Writes will always be redirected to upper layer, so + * ignore lower layer being read-only. + * + * If the overlay itself is read-only then proceed + * with the permission check, don't return EROFS. + * This will only happen if this is the lower layer of + * another overlayfs. + * + * If upper fs becomes read-only after the overlay was + * constructed return EROFS to prevent modification of + * upper layer. + */ + err = -EROFS; + if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) && + (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) + goto out_dput; + } + + err = __inode_permission(realinode, mask); +out_dput: + dput(alias); + return err; +} + + +struct ovl_link_data { + struct dentry *realdentry; + void *cookie; +}; + +static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + void *ret; + struct dentry *realdentry; + struct inode *realinode; + + realdentry = ovl_dentry_real(dentry); + realinode = realdentry->d_inode; + + if (WARN_ON(!realinode->i_op->follow_link)) + return ERR_PTR(-EPERM); + + ret = realinode->i_op->follow_link(realdentry, nd); + if (IS_ERR(ret)) + return ret; + + if (realinode->i_op->put_link) { + struct ovl_link_data *data; + + data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL); + if (!data) { + realinode->i_op->put_link(realdentry, nd, ret); + return ERR_PTR(-ENOMEM); + } + data->realdentry = realdentry; + data->cookie = ret; + + return data; + } else { + return NULL; + } +} + +static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c) +{ + struct inode *realinode; + struct ovl_link_data *data = c; + + if (!data) + return; + + realinode = data->realdentry->d_inode; + realinode->i_op->put_link(data->realdentry, nd, data->cookie); + kfree(data); +} + +static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz) +{ + struct path realpath; + struct inode *realinode; + + ovl_path_real(dentry, &realpath); + realinode = realpath.dentry->d_inode; + + if (!realinode->i_op->readlink) + return -EINVAL; + + touch_atime(&realpath); + + return realinode->i_op->readlink(realpath.dentry, buf, bufsiz); +} + + +static bool ovl_is_private_xattr(const char *name) +{ + return strncmp(name, "trusted.overlay.", 14) == 0; +} + +int ovl_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + int err; + struct dentry *upperdentry; + + err = ovl_want_write(dentry); + if (err) + goto out; + + err = -EPERM; + if (ovl_is_private_xattr(name)) + goto out_drop_write; + + err = ovl_copy_up(dentry); + if (err) + goto out_drop_write; + + upperdentry = ovl_dentry_upper(dentry); + err = vfs_setxattr(upperdentry, name, value, size, flags); + +out_drop_write: + ovl_drop_write(dentry); +out: + return err; +} + +ssize_t ovl_getxattr(struct dentry *dentry, const char *name, + void *value, size_t size) +{ + if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE && + ovl_is_private_xattr(name)) + return -ENODATA; + + return vfs_getxattr(ovl_dentry_real(dentry), name, value, size); +} + +ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) +{ + ssize_t res; + int off; + + res = vfs_listxattr(ovl_dentry_real(dentry), list, size); + if (res <= 0 || size == 0) + return res; + + if (ovl_path_type(dentry->d_parent) != OVL_PATH_MERGE) + return res; + + /* filter out private xattrs */ + for (off = 0; off < res;) { + char *s = list + off; + size_t slen = strlen(s) + 1; + + BUG_ON(off + slen > res); + + if (ovl_is_private_xattr(s)) { + res -= slen; + memmove(s, s + slen, res - off); + } else { + off += slen; + } + } + + return res; +} + +int ovl_removexattr(struct dentry *dentry, const char *name) +{ + int err; + struct path realpath; + enum ovl_path_type type; + + err = ovl_want_write(dentry); + if (err) + goto out; + + if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE && + ovl_is_private_xattr(name)) + goto out_drop_write; + + type = ovl_path_real(dentry, &realpath); + if (type == OVL_PATH_LOWER) { + err = vfs_getxattr(realpath.dentry, name, NULL, 0); + if (err < 0) + goto out_drop_write; + + err = ovl_copy_up(dentry); + if (err) + goto out_drop_write; + + ovl_path_upper(dentry, &realpath); + } + + err = vfs_removexattr(realpath.dentry, name); +out_drop_write: + ovl_drop_write(dentry); +out: + return err; +} + +static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type, + struct dentry *realdentry) +{ + if (type != OVL_PATH_LOWER) + return false; + + if (special_file(realdentry->d_inode->i_mode)) + return false; + + if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC)) + return false; + + return true; +} + +static int ovl_dentry_open(struct dentry *dentry, struct file *file, + const struct cred *cred) +{ + int err; + struct path realpath; + enum ovl_path_type type; + bool want_write = false; + + type = ovl_path_real(dentry, &realpath); + if (ovl_open_need_copy_up(file->f_flags, type, realpath.dentry)) { + want_write = true; + err = ovl_want_write(dentry); + if (err) + goto out; + + if (file->f_flags & O_TRUNC) + err = ovl_copy_up_last(dentry, NULL, true); + else + err = ovl_copy_up(dentry); + if (err) + goto out_drop_write; + + ovl_path_upper(dentry, &realpath); + } + + err = vfs_open(&realpath, file, cred); +out_drop_write: + if (want_write) + ovl_drop_write(dentry); +out: + return err; +} + +static const struct inode_operations ovl_file_inode_operations = { + .setattr = ovl_setattr, + .permission = ovl_permission, + .getattr = ovl_getattr, + .setxattr = ovl_setxattr, + .getxattr = ovl_getxattr, + .listxattr = ovl_listxattr, + .removexattr = ovl_removexattr, + .dentry_open = ovl_dentry_open, +}; + +static const struct inode_operations ovl_symlink_inode_operations = { + .setattr = ovl_setattr, + .follow_link = ovl_follow_link, + .put_link = ovl_put_link, + .readlink = ovl_readlink, + .getattr = ovl_getattr, + .setxattr = ovl_setxattr, + .getxattr = ovl_getxattr, + .listxattr = ovl_listxattr, + .removexattr = ovl_removexattr, +}; + +struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, + struct ovl_entry *oe) +{ + struct inode *inode; + + inode = new_inode(sb); + if (!inode) + return NULL; + + mode &= S_IFMT; + + inode->i_ino = get_next_ino(); + inode->i_mode = mode; + inode->i_flags |= S_NOATIME | S_NOCMTIME; + + switch (mode) { + case S_IFDIR: + inode->i_private = oe; + inode->i_op = &ovl_dir_inode_operations; + inode->i_fop = &ovl_dir_operations; + break; + + case S_IFLNK: + inode->i_op = &ovl_symlink_inode_operations; + break; + + case S_IFREG: + case S_IFSOCK: + case S_IFBLK: + case S_IFCHR: + case S_IFIFO: + inode->i_op = &ovl_file_inode_operations; + break; + + default: + WARN(1, "illegal file type: %i\n", mode); + iput(inode); + inode = NULL; + } + + return inode; + +} diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h new file mode 100644 index 000000000000..814bed33dd07 --- /dev/null +++ b/fs/overlayfs/overlayfs.h @@ -0,0 +1,191 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include + +struct ovl_entry; + +enum ovl_path_type { + OVL_PATH_PURE_UPPER, + OVL_PATH_UPPER, + OVL_PATH_MERGE, + OVL_PATH_LOWER, +}; + +extern const char *ovl_opaque_xattr; + +static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry) +{ + int err = vfs_rmdir(dir, dentry); + pr_debug("rmdir(%pd2) = %i\n", dentry, err); + return err; +} + +static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry) +{ + int err = vfs_unlink(dir, dentry, NULL); + pr_debug("unlink(%pd2) = %i\n", dentry, err); + return err; +} + +static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry, bool debug) +{ + int err = vfs_link(old_dentry, dir, new_dentry, NULL); + if (debug) { + pr_debug("link(%pd2, %pd2) = %i\n", + old_dentry, new_dentry, err); + } + return err; +} + +static inline int ovl_do_create(struct inode *dir, struct dentry *dentry, + umode_t mode, bool debug) +{ + int err = vfs_create(dir, dentry, mode, true); + if (debug) + pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err); + return err; +} + +static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry, + umode_t mode, bool debug) +{ + int err = vfs_mkdir(dir, dentry, mode); + if (debug) + pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err); + return err; +} + +static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry, + umode_t mode, dev_t dev, bool debug) +{ + int err = vfs_mknod(dir, dentry, mode, dev); + if (debug) { + pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n", + dentry, mode, dev, err); + } + return err; +} + +static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry, + const char *oldname, bool debug) +{ + int err = vfs_symlink(dir, dentry, oldname); + if (debug) + pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err); + return err; +} + +static inline int ovl_do_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + int err = vfs_setxattr(dentry, name, value, size, flags); + pr_debug("setxattr(%pd2, \"%s\", \"%*s\", 0x%x) = %i\n", + dentry, name, (int) size, (char *) value, flags, err); + return err; +} + +static inline int ovl_do_removexattr(struct dentry *dentry, const char *name) +{ + int err = vfs_removexattr(dentry, name); + pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err); + return err; +} + +static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry, + struct inode *newdir, struct dentry *newdentry, + unsigned int flags) +{ + int err; + + pr_debug("rename2(%pd2, %pd2, 0x%x)\n", + olddentry, newdentry, flags); + + err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags); + + if (err) { + pr_debug("...rename2(%pd2, %pd2, ...) = %i\n", + olddentry, newdentry, err); + } + return err; +} + +static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry) +{ + int err = vfs_whiteout(dir, dentry); + pr_debug("whiteout(%pd2) = %i\n", dentry, err); + return err; +} + +enum ovl_path_type ovl_path_type(struct dentry *dentry); +u64 ovl_dentry_version_get(struct dentry *dentry); +void ovl_dentry_version_inc(struct dentry *dentry); +void ovl_path_upper(struct dentry *dentry, struct path *path); +void ovl_path_lower(struct dentry *dentry, struct path *path); +enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path); +struct dentry *ovl_dentry_upper(struct dentry *dentry); +struct dentry *ovl_dentry_lower(struct dentry *dentry); +struct dentry *ovl_dentry_real(struct dentry *dentry); +struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper); +struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry); +void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache); +struct dentry *ovl_workdir(struct dentry *dentry); +int ovl_want_write(struct dentry *dentry); +void ovl_drop_write(struct dentry *dentry); +bool ovl_dentry_is_opaque(struct dentry *dentry); +void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque); +bool ovl_is_whiteout(struct dentry *dentry); +void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry); +struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags); +struct file *ovl_path_open(struct path *path, int flags); + +struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry, + struct kstat *stat, const char *link); + +/* readdir.c */ +extern const struct file_operations ovl_dir_operations; +int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list); +void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list); +void ovl_cache_free(struct list_head *list); + +/* inode.c */ +int ovl_setattr(struct dentry *dentry, struct iattr *attr); +int ovl_permission(struct inode *inode, int mask); +int ovl_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags); +ssize_t ovl_getxattr(struct dentry *dentry, const char *name, + void *value, size_t size); +ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); +int ovl_removexattr(struct dentry *dentry, const char *name); + +struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, + struct ovl_entry *oe); +static inline void ovl_copyattr(struct inode *from, struct inode *to) +{ + to->i_uid = from->i_uid; + to->i_gid = from->i_gid; +} + +/* dir.c */ +extern const struct inode_operations ovl_dir_inode_operations; +struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry); +int ovl_create_real(struct inode *dir, struct dentry *newdentry, + struct kstat *stat, const char *link, + struct dentry *hardlink, bool debug); +void ovl_cleanup(struct inode *dir, struct dentry *dentry); + +/* copy_up.c */ +int ovl_copy_up(struct dentry *dentry); +int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, + struct path *lowerpath, struct kstat *stat, + struct iattr *attr); +int ovl_copy_xattr(struct dentry *old, struct dentry *new); +int ovl_set_attr(struct dentry *upper, struct kstat *stat); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c new file mode 100644 index 000000000000..c6787f84ece9 --- /dev/null +++ b/fs/overlayfs/readdir.c @@ -0,0 +1,587 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "overlayfs.h" + +struct ovl_cache_entry { + const char *name; + unsigned int len; + unsigned int type; + u64 ino; + bool is_whiteout; + struct list_head l_node; + struct rb_node node; +}; + +struct ovl_dir_cache { + long refcount; + u64 version; + struct list_head entries; +}; + +struct ovl_readdir_data { + struct dir_context ctx; + bool is_merge; + struct rb_root *root; + struct list_head *list; + struct list_head *middle; + int count; + int err; +}; + +struct ovl_dir_file { + bool is_real; + bool is_upper; + struct ovl_dir_cache *cache; + struct ovl_cache_entry cursor; + struct file *realfile; + struct file *upperfile; +}; + +static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n) +{ + return container_of(n, struct ovl_cache_entry, node); +} + +static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root, + const char *name, int len) +{ + struct rb_node *node = root->rb_node; + int cmp; + + while (node) { + struct ovl_cache_entry *p = ovl_cache_entry_from_node(node); + + cmp = strncmp(name, p->name, len); + if (cmp > 0) + node = p->node.rb_right; + else if (cmp < 0 || len < p->len) + node = p->node.rb_left; + else + return p; + } + + return NULL; +} + +static struct ovl_cache_entry *ovl_cache_entry_new(const char *name, int len, + u64 ino, unsigned int d_type) +{ + struct ovl_cache_entry *p; + + p = kmalloc(sizeof(*p) + len + 1, GFP_KERNEL); + if (p) { + char *name_copy = (char *) (p + 1); + memcpy(name_copy, name, len); + name_copy[len] = '\0'; + p->name = name_copy; + p->len = len; + p->type = d_type; + p->ino = ino; + p->is_whiteout = false; + } + + return p; +} + +static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, + const char *name, int len, u64 ino, + unsigned int d_type) +{ + struct rb_node **newp = &rdd->root->rb_node; + struct rb_node *parent = NULL; + struct ovl_cache_entry *p; + + while (*newp) { + int cmp; + struct ovl_cache_entry *tmp; + + parent = *newp; + tmp = ovl_cache_entry_from_node(*newp); + cmp = strncmp(name, tmp->name, len); + if (cmp > 0) + newp = &tmp->node.rb_right; + else if (cmp < 0 || len < tmp->len) + newp = &tmp->node.rb_left; + else + return 0; + } + + p = ovl_cache_entry_new(name, len, ino, d_type); + if (p == NULL) + return -ENOMEM; + + list_add_tail(&p->l_node, rdd->list); + rb_link_node(&p->node, parent, newp); + rb_insert_color(&p->node, rdd->root); + + return 0; +} + +static int ovl_fill_lower(struct ovl_readdir_data *rdd, + const char *name, int namelen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct ovl_cache_entry *p; + + p = ovl_cache_entry_find(rdd->root, name, namelen); + if (p) { + list_move_tail(&p->l_node, rdd->middle); + } else { + p = ovl_cache_entry_new(name, namelen, ino, d_type); + if (p == NULL) + rdd->err = -ENOMEM; + else + list_add_tail(&p->l_node, rdd->middle); + } + + return rdd->err; +} + +void ovl_cache_free(struct list_head *list) +{ + struct ovl_cache_entry *p; + struct ovl_cache_entry *n; + + list_for_each_entry_safe(p, n, list, l_node) + kfree(p); + + INIT_LIST_HEAD(list); +} + +static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry) +{ + struct ovl_dir_cache *cache = od->cache; + + list_del(&od->cursor.l_node); + WARN_ON(cache->refcount <= 0); + cache->refcount--; + if (!cache->refcount) { + if (ovl_dir_cache(dentry) == cache) + ovl_set_dir_cache(dentry, NULL); + + ovl_cache_free(&cache->entries); + kfree(cache); + } +} + +static int ovl_fill_merge(void *buf, const char *name, int namelen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct ovl_readdir_data *rdd = buf; + + rdd->count++; + if (!rdd->is_merge) + return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type); + else + return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type); +} + +static inline int ovl_dir_read(struct path *realpath, + struct ovl_readdir_data *rdd) +{ + struct file *realfile; + int err; + + realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY); + if (IS_ERR(realfile)) + return PTR_ERR(realfile); + + rdd->ctx.pos = 0; + do { + rdd->count = 0; + rdd->err = 0; + err = iterate_dir(realfile, &rdd->ctx); + if (err >= 0) + err = rdd->err; + } while (!err && rdd->count); + fput(realfile); + + return err; +} + +static void ovl_dir_reset(struct file *file) +{ + struct ovl_dir_file *od = file->private_data; + struct ovl_dir_cache *cache = od->cache; + struct dentry *dentry = file->f_path.dentry; + enum ovl_path_type type = ovl_path_type(dentry); + + if (cache && ovl_dentry_version_get(dentry) != cache->version) { + ovl_cache_put(od, dentry); + od->cache = NULL; + } + WARN_ON(!od->is_real && type != OVL_PATH_MERGE); + if (od->is_real && type == OVL_PATH_MERGE) + od->is_real = false; +} + +static int ovl_dir_mark_whiteouts(struct dentry *dir, + struct ovl_readdir_data *rdd) +{ + struct ovl_cache_entry *p; + struct dentry *dentry; + const struct cred *old_cred; + struct cred *override_cred; + + override_cred = prepare_creds(); + if (!override_cred) { + ovl_cache_free(rdd->list); + return -ENOMEM; + } + + /* + * CAP_DAC_OVERRIDE for lookup + */ + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + old_cred = override_creds(override_cred); + + mutex_lock(&dir->d_inode->i_mutex); + list_for_each_entry(p, rdd->list, l_node) { + if (!p->name) + continue; + + if (p->type != DT_CHR) + continue; + + dentry = lookup_one_len(p->name, dir, p->len); + if (IS_ERR(dentry)) + continue; + + p->is_whiteout = ovl_is_whiteout(dentry); + dput(dentry); + } + mutex_unlock(&dir->d_inode->i_mutex); + + revert_creds(old_cred); + put_cred(override_cred); + + return 0; +} + +static inline int ovl_dir_read_merged(struct path *upperpath, + struct path *lowerpath, + struct list_head *list) +{ + int err; + struct rb_root root = RB_ROOT; + struct list_head middle; + struct ovl_readdir_data rdd = { + .ctx.actor = ovl_fill_merge, + .list = list, + .root = &root, + .is_merge = false, + }; + + if (upperpath->dentry) { + err = ovl_dir_read(upperpath, &rdd); + if (err) + goto out; + + if (lowerpath->dentry) { + err = ovl_dir_mark_whiteouts(upperpath->dentry, &rdd); + if (err) + goto out; + } + } + if (lowerpath->dentry) { + /* + * Insert lowerpath entries before upperpath ones, this allows + * offsets to be reasonably constant + */ + list_add(&middle, rdd.list); + rdd.middle = &middle; + rdd.is_merge = true; + err = ovl_dir_read(lowerpath, &rdd); + list_del(&middle); + } +out: + return err; + +} + +static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos) +{ + struct ovl_cache_entry *p; + loff_t off = 0; + + list_for_each_entry(p, &od->cache->entries, l_node) { + if (!p->name) + continue; + if (off >= pos) + break; + off++; + } + list_move_tail(&od->cursor.l_node, &p->l_node); +} + +static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry) +{ + int res; + struct path lowerpath; + struct path upperpath; + struct ovl_dir_cache *cache; + + cache = ovl_dir_cache(dentry); + if (cache && ovl_dentry_version_get(dentry) == cache->version) { + cache->refcount++; + return cache; + } + ovl_set_dir_cache(dentry, NULL); + + cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL); + if (!cache) + return ERR_PTR(-ENOMEM); + + cache->refcount = 1; + INIT_LIST_HEAD(&cache->entries); + + ovl_path_lower(dentry, &lowerpath); + ovl_path_upper(dentry, &upperpath); + + res = ovl_dir_read_merged(&upperpath, &lowerpath, &cache->entries); + if (res) { + ovl_cache_free(&cache->entries); + kfree(cache); + return ERR_PTR(res); + } + + cache->version = ovl_dentry_version_get(dentry); + ovl_set_dir_cache(dentry, cache); + + return cache; +} + +static int ovl_iterate(struct file *file, struct dir_context *ctx) +{ + struct ovl_dir_file *od = file->private_data; + struct dentry *dentry = file->f_path.dentry; + + if (!ctx->pos) + ovl_dir_reset(file); + + if (od->is_real) + return iterate_dir(od->realfile, ctx); + + if (!od->cache) { + struct ovl_dir_cache *cache; + + cache = ovl_cache_get(dentry); + if (IS_ERR(cache)) + return PTR_ERR(cache); + + od->cache = cache; + ovl_seek_cursor(od, ctx->pos); + } + + while (od->cursor.l_node.next != &od->cache->entries) { + struct ovl_cache_entry *p; + + p = list_entry(od->cursor.l_node.next, struct ovl_cache_entry, l_node); + /* Skip cursors */ + if (p->name) { + if (!p->is_whiteout) { + if (!dir_emit(ctx, p->name, p->len, p->ino, p->type)) + break; + } + ctx->pos++; + } + list_move(&od->cursor.l_node, &p->l_node); + } + return 0; +} + +static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin) +{ + loff_t res; + struct ovl_dir_file *od = file->private_data; + + mutex_lock(&file_inode(file)->i_mutex); + if (!file->f_pos) + ovl_dir_reset(file); + + if (od->is_real) { + res = vfs_llseek(od->realfile, offset, origin); + file->f_pos = od->realfile->f_pos; + } else { + res = -EINVAL; + + switch (origin) { + case SEEK_CUR: + offset += file->f_pos; + break; + case SEEK_SET: + break; + default: + goto out_unlock; + } + if (offset < 0) + goto out_unlock; + + if (offset != file->f_pos) { + file->f_pos = offset; + if (od->cache) + ovl_seek_cursor(od, offset); + } + res = offset; + } +out_unlock: + mutex_unlock(&file_inode(file)->i_mutex); + + return res; +} + +static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, + int datasync) +{ + struct ovl_dir_file *od = file->private_data; + struct dentry *dentry = file->f_path.dentry; + struct file *realfile = od->realfile; + + /* + * Need to check if we started out being a lower dir, but got copied up + */ + if (!od->is_upper && ovl_path_type(dentry) == OVL_PATH_MERGE) { + struct inode *inode = file_inode(file); + + mutex_lock(&inode->i_mutex); + realfile = od->upperfile; + if (!realfile) { + struct path upperpath; + + ovl_path_upper(dentry, &upperpath); + realfile = ovl_path_open(&upperpath, O_RDONLY); + if (IS_ERR(realfile)) { + mutex_unlock(&inode->i_mutex); + return PTR_ERR(realfile); + } + od->upperfile = realfile; + } + mutex_unlock(&inode->i_mutex); + } + + return vfs_fsync_range(realfile, start, end, datasync); +} + +static int ovl_dir_release(struct inode *inode, struct file *file) +{ + struct ovl_dir_file *od = file->private_data; + + if (od->cache) { + mutex_lock(&inode->i_mutex); + ovl_cache_put(od, file->f_path.dentry); + mutex_unlock(&inode->i_mutex); + } + fput(od->realfile); + if (od->upperfile) + fput(od->upperfile); + kfree(od); + + return 0; +} + +static int ovl_dir_open(struct inode *inode, struct file *file) +{ + struct path realpath; + struct file *realfile; + struct ovl_dir_file *od; + enum ovl_path_type type; + + od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL); + if (!od) + return -ENOMEM; + + type = ovl_path_real(file->f_path.dentry, &realpath); + realfile = ovl_path_open(&realpath, file->f_flags); + if (IS_ERR(realfile)) { + kfree(od); + return PTR_ERR(realfile); + } + INIT_LIST_HEAD(&od->cursor.l_node); + od->realfile = realfile; + od->is_real = (type != OVL_PATH_MERGE); + od->is_upper = (type != OVL_PATH_LOWER); + file->private_data = od; + + return 0; +} + +const struct file_operations ovl_dir_operations = { + .read = generic_read_dir, + .open = ovl_dir_open, + .iterate = ovl_iterate, + .llseek = ovl_dir_llseek, + .fsync = ovl_dir_fsync, + .release = ovl_dir_release, +}; + +int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list) +{ + int err; + struct path lowerpath; + struct path upperpath; + struct ovl_cache_entry *p; + + ovl_path_upper(dentry, &upperpath); + ovl_path_lower(dentry, &lowerpath); + + err = ovl_dir_read_merged(&upperpath, &lowerpath, list); + if (err) + return err; + + err = 0; + + list_for_each_entry(p, list, l_node) { + if (p->is_whiteout) + continue; + + if (p->name[0] == '.') { + if (p->len == 1) + continue; + if (p->len == 2 && p->name[1] == '.') + continue; + } + err = -ENOTEMPTY; + break; + } + + return err; +} + +void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list) +{ + struct ovl_cache_entry *p; + + mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_PARENT); + list_for_each_entry(p, list, l_node) { + struct dentry *dentry; + + if (!p->is_whiteout) + continue; + + dentry = lookup_one_len(p->name, upper, p->len); + if (IS_ERR(dentry)) { + pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n", + upper->d_name.name, p->len, p->name, + (int) PTR_ERR(dentry)); + continue; + } + ovl_cleanup(upper->d_inode, dentry); + dput(dentry); + } + mutex_unlock(&upper->d_inode->i_mutex); +} diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c new file mode 100644 index 000000000000..227710aad781 --- /dev/null +++ b/fs/overlayfs/super.c @@ -0,0 +1,727 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "overlayfs.h" + +MODULE_AUTHOR("Miklos Szeredi "); +MODULE_DESCRIPTION("Overlay filesystem"); +MODULE_LICENSE("GPL"); + +/* private information held for overlayfs's superblock */ +struct ovl_fs { + struct vfsmount *upper_mnt; + struct vfsmount *lower_mnt; + struct dentry *workdir; +}; + +struct ovl_dir_cache; + +/* private information held for every overlayfs dentry */ +struct ovl_entry { + struct dentry *__upperdentry; + struct dentry *lowerdentry; + struct ovl_dir_cache *cache; + union { + struct { + u64 version; + bool opaque; + }; + struct rcu_head rcu; + }; +}; + +const char *ovl_opaque_xattr = "trusted.overlay.opaque"; + + +enum ovl_path_type ovl_path_type(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + if (oe->__upperdentry) { + if (oe->lowerdentry) { + if (S_ISDIR(dentry->d_inode->i_mode)) + return OVL_PATH_MERGE; + else + return OVL_PATH_UPPER; + } else { + if (oe->opaque) + return OVL_PATH_UPPER; + else + return OVL_PATH_PURE_UPPER; + } + } else { + return OVL_PATH_LOWER; + } +} + +static struct dentry *ovl_upperdentry_dereference(struct ovl_entry *oe) +{ + struct dentry *upperdentry = ACCESS_ONCE(oe->__upperdentry); + /* + * Make sure to order reads to upperdentry wrt ovl_dentry_update() + */ + smp_read_barrier_depends(); + return upperdentry; +} + +void ovl_path_upper(struct dentry *dentry, struct path *path) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct ovl_entry *oe = dentry->d_fsdata; + + path->mnt = ofs->upper_mnt; + path->dentry = ovl_upperdentry_dereference(oe); +} + +enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path) +{ + + enum ovl_path_type type = ovl_path_type(dentry); + + if (type == OVL_PATH_LOWER) + ovl_path_lower(dentry, path); + else + ovl_path_upper(dentry, path); + + return type; +} + +struct dentry *ovl_dentry_upper(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return ovl_upperdentry_dereference(oe); +} + +struct dentry *ovl_dentry_lower(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return oe->lowerdentry; +} + +struct dentry *ovl_dentry_real(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + struct dentry *realdentry; + + realdentry = ovl_upperdentry_dereference(oe); + if (!realdentry) + realdentry = oe->lowerdentry; + + return realdentry; +} + +struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper) +{ + struct dentry *realdentry; + + realdentry = ovl_upperdentry_dereference(oe); + if (realdentry) { + *is_upper = true; + } else { + realdentry = oe->lowerdentry; + *is_upper = false; + } + return realdentry; +} + +struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return oe->cache; +} + +void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + oe->cache = cache; +} + +void ovl_path_lower(struct dentry *dentry, struct path *path) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct ovl_entry *oe = dentry->d_fsdata; + + path->mnt = ofs->lower_mnt; + path->dentry = oe->lowerdentry; +} + +int ovl_want_write(struct dentry *dentry) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + return mnt_want_write(ofs->upper_mnt); +} + +void ovl_drop_write(struct dentry *dentry) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + mnt_drop_write(ofs->upper_mnt); +} + +struct dentry *ovl_workdir(struct dentry *dentry) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + return ofs->workdir; +} + +bool ovl_dentry_is_opaque(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + return oe->opaque; +} + +void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque) +{ + struct ovl_entry *oe = dentry->d_fsdata; + oe->opaque = opaque; +} + +void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + WARN_ON(!mutex_is_locked(&upperdentry->d_parent->d_inode->i_mutex)); + WARN_ON(oe->__upperdentry); + BUG_ON(!upperdentry->d_inode); + /* + * Make sure upperdentry is consistent before making it visible to + * ovl_upperdentry_dereference(). + */ + smp_wmb(); + oe->__upperdentry = upperdentry; +} + +void ovl_dentry_version_inc(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); + oe->version++; +} + +u64 ovl_dentry_version_get(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); + return oe->version; +} + +bool ovl_is_whiteout(struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + + return inode && IS_WHITEOUT(inode); +} + +static bool ovl_is_opaquedir(struct dentry *dentry) +{ + int res; + char val; + struct inode *inode = dentry->d_inode; + + if (!S_ISDIR(inode->i_mode) || !inode->i_op->getxattr) + return false; + + res = inode->i_op->getxattr(dentry, ovl_opaque_xattr, &val, 1); + if (res == 1 && val == 'y') + return true; + + return false; +} + +static void ovl_dentry_release(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + if (oe) { + dput(oe->__upperdentry); + dput(oe->lowerdentry); + kfree_rcu(oe, rcu); + } +} + +static const struct dentry_operations ovl_dentry_operations = { + .d_release = ovl_dentry_release, +}; + +static struct ovl_entry *ovl_alloc_entry(void) +{ + return kzalloc(sizeof(struct ovl_entry), GFP_KERNEL); +} + +static inline struct dentry *ovl_lookup_real(struct dentry *dir, + struct qstr *name) +{ + struct dentry *dentry; + + mutex_lock(&dir->d_inode->i_mutex); + dentry = lookup_one_len(name->name, dir, name->len); + mutex_unlock(&dir->d_inode->i_mutex); + + if (IS_ERR(dentry)) { + if (PTR_ERR(dentry) == -ENOENT) + dentry = NULL; + } else if (!dentry->d_inode) { + dput(dentry); + dentry = NULL; + } + return dentry; +} + +struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct ovl_entry *oe; + struct dentry *upperdir; + struct dentry *lowerdir; + struct dentry *upperdentry = NULL; + struct dentry *lowerdentry = NULL; + struct inode *inode = NULL; + int err; + + err = -ENOMEM; + oe = ovl_alloc_entry(); + if (!oe) + goto out; + + upperdir = ovl_dentry_upper(dentry->d_parent); + lowerdir = ovl_dentry_lower(dentry->d_parent); + + if (upperdir) { + upperdentry = ovl_lookup_real(upperdir, &dentry->d_name); + err = PTR_ERR(upperdentry); + if (IS_ERR(upperdentry)) + goto out_put_dir; + + if (lowerdir && upperdentry) { + if (ovl_is_whiteout(upperdentry)) { + dput(upperdentry); + upperdentry = NULL; + oe->opaque = true; + } else if (ovl_is_opaquedir(upperdentry)) { + oe->opaque = true; + } + } + } + if (lowerdir && !oe->opaque) { + lowerdentry = ovl_lookup_real(lowerdir, &dentry->d_name); + err = PTR_ERR(lowerdentry); + if (IS_ERR(lowerdentry)) + goto out_dput_upper; + } + + if (lowerdentry && upperdentry && + (!S_ISDIR(upperdentry->d_inode->i_mode) || + !S_ISDIR(lowerdentry->d_inode->i_mode))) { + dput(lowerdentry); + lowerdentry = NULL; + oe->opaque = true; + } + + if (lowerdentry || upperdentry) { + struct dentry *realdentry; + + realdentry = upperdentry ? upperdentry : lowerdentry; + err = -ENOMEM; + inode = ovl_new_inode(dentry->d_sb, realdentry->d_inode->i_mode, + oe); + if (!inode) + goto out_dput; + ovl_copyattr(realdentry->d_inode, inode); + } + + oe->__upperdentry = upperdentry; + oe->lowerdentry = lowerdentry; + + dentry->d_fsdata = oe; + d_add(dentry, inode); + + return NULL; + +out_dput: + dput(lowerdentry); +out_dput_upper: + dput(upperdentry); +out_put_dir: + kfree(oe); +out: + return ERR_PTR(err); +} + +struct file *ovl_path_open(struct path *path, int flags) +{ + return dentry_open(path, flags, current_cred()); +} + +static void ovl_put_super(struct super_block *sb) +{ + struct ovl_fs *ufs = sb->s_fs_info; + + dput(ufs->workdir); + mntput(ufs->upper_mnt); + mntput(ufs->lower_mnt); + + kfree(ufs); +} + +static const struct super_operations ovl_super_operations = { + .put_super = ovl_put_super, +}; + +struct ovl_config { + char *lowerdir; + char *upperdir; + char *workdir; +}; + +enum { + OPT_LOWERDIR, + OPT_UPPERDIR, + OPT_WORKDIR, + OPT_ERR, +}; + +static const match_table_t ovl_tokens = { + {OPT_LOWERDIR, "lowerdir=%s"}, + {OPT_UPPERDIR, "upperdir=%s"}, + {OPT_WORKDIR, "workdir=%s"}, + {OPT_ERR, NULL} +}; + +static int ovl_parse_opt(char *opt, struct ovl_config *config) +{ + char *p; + + config->upperdir = NULL; + config->lowerdir = NULL; + config->workdir = NULL; + + while ((p = strsep(&opt, ",")) != NULL) { + int token; + substring_t args[MAX_OPT_ARGS]; + + if (!*p) + continue; + + token = match_token(p, ovl_tokens, args); + switch (token) { + case OPT_UPPERDIR: + kfree(config->upperdir); + config->upperdir = match_strdup(&args[0]); + if (!config->upperdir) + return -ENOMEM; + break; + + case OPT_LOWERDIR: + kfree(config->lowerdir); + config->lowerdir = match_strdup(&args[0]); + if (!config->lowerdir) + return -ENOMEM; + break; + + case OPT_WORKDIR: + kfree(config->workdir); + config->workdir = match_strdup(&args[0]); + if (!config->workdir) + return -ENOMEM; + break; + + default: + return -EINVAL; + } + } + return 0; +} + +#define OVL_WORKDIR_NAME "work" + +static struct dentry *ovl_workdir_create(struct vfsmount *mnt, + struct dentry *dentry) +{ + struct inode *dir = dentry->d_inode; + struct dentry *work; + int err; + bool retried = false; + + err = mnt_want_write(mnt); + if (err) + return ERR_PTR(err); + + mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); +retry: + work = lookup_one_len(OVL_WORKDIR_NAME, dentry, + strlen(OVL_WORKDIR_NAME)); + + if (!IS_ERR(work)) { + struct kstat stat = { + .mode = S_IFDIR | 0, + }; + + if (work->d_inode) { + err = -EEXIST; + if (retried) + goto out_dput; + + retried = true; + ovl_cleanup(dir, work); + dput(work); + goto retry; + } + + err = ovl_create_real(dir, work, &stat, NULL, NULL, true); + if (err) + goto out_dput; + } +out_unlock: + mutex_unlock(&dir->i_mutex); + mnt_drop_write(mnt); + + return work; + +out_dput: + dput(work); + work = ERR_PTR(err); + goto out_unlock; +} + +static int ovl_mount_dir(const char *name, struct path *path) +{ + int err; + + err = kern_path(name, LOOKUP_FOLLOW, path); + if (err) { + pr_err("overlayfs: failed to resolve '%s': %i\n", name, err); + err = -EINVAL; + } + return err; +} + +static bool ovl_is_allowed_fs_type(struct dentry *root) +{ + const struct dentry_operations *dop = root->d_op; + + /* + * We don't support: + * - automount filesystems + * - filesystems with revalidate (FIXME for lower layer) + * - filesystems with case insensitive names + */ + if (dop && + (dop->d_manage || dop->d_automount || + dop->d_revalidate || dop->d_weak_revalidate || + dop->d_compare || dop->d_hash)) { + return false; + } + return true; +} + +/* Workdir should not be subdir of upperdir and vice versa */ +static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir) +{ + bool ok = false; + + if (workdir != upperdir) { + ok = (lock_rename(workdir, upperdir) == NULL); + unlock_rename(workdir, upperdir); + } + return ok; +} + +static int ovl_fill_super(struct super_block *sb, void *data, int silent) +{ + struct path lowerpath; + struct path upperpath; + struct path workpath; + struct inode *root_inode; + struct dentry *root_dentry; + struct ovl_entry *oe; + struct ovl_fs *ufs; + struct ovl_config config; + int err; + + err = ovl_parse_opt((char *) data, &config); + if (err) + goto out; + + /* FIXME: workdir is not needed for a R/O mount */ + err = -EINVAL; + if (!config.upperdir || !config.lowerdir || !config.workdir) { + pr_err("overlayfs: missing upperdir or lowerdir or workdir\n"); + goto out_free_config; + } + + err = -ENOMEM; + ufs = kmalloc(sizeof(struct ovl_fs), GFP_KERNEL); + if (!ufs) + goto out_free_config; + + oe = ovl_alloc_entry(); + if (oe == NULL) + goto out_free_ufs; + + err = ovl_mount_dir(config.upperdir, &upperpath); + if (err) + goto out_free_oe; + + err = ovl_mount_dir(config.lowerdir, &lowerpath); + if (err) + goto out_put_upperpath; + + err = ovl_mount_dir(config.workdir, &workpath); + if (err) + goto out_put_lowerpath; + + err = -EINVAL; + if (!S_ISDIR(upperpath.dentry->d_inode->i_mode) || + !S_ISDIR(lowerpath.dentry->d_inode->i_mode) || + !S_ISDIR(workpath.dentry->d_inode->i_mode)) { + pr_err("overlayfs: upperdir or lowerdir or workdir not a directory\n"); + goto out_put_workpath; + } + + if (upperpath.mnt != workpath.mnt) { + pr_err("overlayfs: workdir and upperdir must reside under the same mount\n"); + goto out_put_workpath; + } + if (!ovl_workdir_ok(workpath.dentry, upperpath.dentry)) { + pr_err("overlayfs: workdir and upperdir must be separate subtrees\n"); + goto out_put_workpath; + } + + if (!ovl_is_allowed_fs_type(upperpath.dentry)) { + pr_err("overlayfs: filesystem of upperdir is not supported\n"); + goto out_put_workpath; + } + + if (!ovl_is_allowed_fs_type(lowerpath.dentry)) { + pr_err("overlayfs: filesystem of lowerdir is not supported\n"); + goto out_put_workpath; + } + + ufs->upper_mnt = clone_private_mount(&upperpath); + err = PTR_ERR(ufs->upper_mnt); + if (IS_ERR(ufs->upper_mnt)) { + pr_err("overlayfs: failed to clone upperpath\n"); + goto out_put_workpath; + } + + ufs->lower_mnt = clone_private_mount(&lowerpath); + err = PTR_ERR(ufs->lower_mnt); + if (IS_ERR(ufs->lower_mnt)) { + pr_err("overlayfs: failed to clone lowerpath\n"); + goto out_put_upper_mnt; + } + + ufs->workdir = ovl_workdir_create(ufs->upper_mnt, workpath.dentry); + err = PTR_ERR(ufs->workdir); + if (IS_ERR(ufs->workdir)) { + pr_err("overlayfs: failed to create directory %s/%s\n", + config.workdir, OVL_WORKDIR_NAME); + goto out_put_lower_mnt; + } + + /* + * Make lower_mnt R/O. That way fchmod/fchown on lower file + * will fail instead of modifying lower fs. + */ + ufs->lower_mnt->mnt_flags |= MNT_READONLY; + + /* If the upper fs is r/o, we mark overlayfs r/o too */ + if (ufs->upper_mnt->mnt_sb->s_flags & MS_RDONLY) + sb->s_flags |= MS_RDONLY; + + sb->s_d_op = &ovl_dentry_operations; + + err = -ENOMEM; + root_inode = ovl_new_inode(sb, S_IFDIR, oe); + if (!root_inode) + goto out_put_workdir; + + root_dentry = d_make_root(root_inode); + if (!root_dentry) + goto out_put_workdir; + + mntput(upperpath.mnt); + mntput(lowerpath.mnt); + path_put(&workpath); + + oe->__upperdentry = upperpath.dentry; + oe->lowerdentry = lowerpath.dentry; + + root_dentry->d_fsdata = oe; + + sb->s_op = &ovl_super_operations; + sb->s_root = root_dentry; + sb->s_fs_info = ufs; + + return 0; + +out_put_workdir: + dput(ufs->workdir); +out_put_lower_mnt: + mntput(ufs->lower_mnt); +out_put_upper_mnt: + mntput(ufs->upper_mnt); +out_put_workpath: + path_put(&workpath); +out_put_lowerpath: + path_put(&lowerpath); +out_put_upperpath: + path_put(&upperpath); +out_free_oe: + kfree(oe); +out_free_ufs: + kfree(ufs); +out_free_config: + kfree(config.lowerdir); + kfree(config.upperdir); + kfree(config.workdir); +out: + return err; +} + +static struct dentry *ovl_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) +{ + return mount_nodev(fs_type, flags, raw_data, ovl_fill_super); +} + +static struct file_system_type ovl_fs_type = { + .owner = THIS_MODULE, + .name = "overlayfs", + .mount = ovl_mount, + .kill_sb = kill_anon_super, +}; +MODULE_ALIAS_FS("overlayfs"); + +static int __init ovl_init(void) +{ + return register_filesystem(&ovl_fs_type); +} + +static void __exit ovl_exit(void) +{ + unregister_filesystem(&ovl_fs_type); +} + +module_init(ovl_init); +module_exit(ovl_exit); From a38696f51b951dddbba71e7a3cabb47d9347fa06 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Fri, 24 Oct 2014 00:14:38 +0200 Subject: [PATCH 138/236] overlayfs: add statfs support Add support for statfs to the overlayfs filesystem. As the upper layer is the target of all write operations assume that the space in that filesystem is the space in the overlayfs. There will be some inaccuracy as overwriting a file will copy it up and consume space we were not expecting, but it is better than nothing. Use the upper layer dentry and mount from the overlayfs root inode, passing the statfs call to that filesystem. Signed-off-by: Andy Whitcroft Signed-off-by: Miklos Szeredi (cherry picked from commit cc2596392af3b1404421aaef828a255303c46f93) Signed-off-by: Alex Shi --- fs/overlayfs/super.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 227710aad781..aaf562b9f937 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -16,17 +16,21 @@ #include #include #include +#include #include "overlayfs.h" MODULE_AUTHOR("Miklos Szeredi "); MODULE_DESCRIPTION("Overlay filesystem"); MODULE_LICENSE("GPL"); +#define OVERLAYFS_SUPER_MAGIC 0x794c764f + /* private information held for overlayfs's superblock */ struct ovl_fs { struct vfsmount *upper_mnt; struct vfsmount *lower_mnt; struct dentry *workdir; + long lower_namelen; }; struct ovl_dir_cache; @@ -383,8 +387,35 @@ static void ovl_put_super(struct super_block *sb) kfree(ufs); } +/** + * ovl_statfs + * @sb: The overlayfs super block + * @buf: The struct kstatfs to fill in with stats + * + * Get the filesystem statistics. As writes always target the upper layer + * filesystem pass the statfs to the same filesystem. + */ +static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct dentry *root_dentry = dentry->d_sb->s_root; + struct path path; + int err; + + ovl_path_upper(root_dentry, &path); + + err = vfs_statfs(&path, buf); + if (!err) { + buf->f_namelen = max(buf->f_namelen, ofs->lower_namelen); + buf->f_type = OVERLAYFS_SUPER_MAGIC; + } + + return err; +} + static const struct super_operations ovl_super_operations = { .put_super = ovl_put_super, + .statfs = ovl_statfs, }; struct ovl_config { @@ -556,6 +587,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) struct ovl_entry *oe; struct ovl_fs *ufs; struct ovl_config config; + struct kstatfs statfs; int err; err = ovl_parse_opt((char *) data, &config); @@ -617,6 +649,13 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) goto out_put_workpath; } + err = vfs_statfs(&lowerpath, &statfs); + if (err) { + pr_err("overlayfs: statfs failed on lowerpath\n"); + goto out_put_workpath; + } + ufs->lower_namelen = statfs.f_namelen; + ufs->upper_mnt = clone_private_mount(&upperpath); err = PTR_ERR(ufs->upper_mnt); if (IS_ERR(ufs->upper_mnt)) { @@ -669,6 +708,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) root_dentry->d_fsdata = oe; + sb->s_magic = OVERLAYFS_SUPER_MAGIC; sb->s_op = &ovl_super_operations; sb->s_root = root_dentry; sb->s_fs_info = ufs; From 2428e59706e6221ccebeee9c79870988485e2b06 Mon Sep 17 00:00:00 2001 From: Erez Zadok Date: Fri, 24 Oct 2014 00:14:38 +0200 Subject: [PATCH 139/236] overlayfs: implement show_options This is useful because of the stacking nature of overlayfs. Users like to find out (via /proc/mounts) which lower/upper directory were used at mount time. AV: even failing ovl_parse_opt() could've done some kstrdup() AV: failure of ovl_alloc_entry() should end up with ENOMEM, not EINVAL Signed-off-by: Erez Zadok Signed-off-by: Miklos Szeredi (cherry picked from commit f45827e84186af152492c6d0dcf4105b4a605f9b) Signed-off-by: Alex Shi --- fs/overlayfs/super.c | 76 ++++++++++++++++++++++++++++---------------- 1 file changed, 48 insertions(+), 28 deletions(-) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index aaf562b9f937..7dcc24e84417 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "overlayfs.h" MODULE_AUTHOR("Miklos Szeredi "); @@ -25,12 +26,20 @@ MODULE_LICENSE("GPL"); #define OVERLAYFS_SUPER_MAGIC 0x794c764f +struct ovl_config { + char *lowerdir; + char *upperdir; + char *workdir; +}; + /* private information held for overlayfs's superblock */ struct ovl_fs { struct vfsmount *upper_mnt; struct vfsmount *lower_mnt; struct dentry *workdir; long lower_namelen; + /* pathnames of lower and upper dirs, for show_options */ + struct ovl_config config; }; struct ovl_dir_cache; @@ -384,6 +393,9 @@ static void ovl_put_super(struct super_block *sb) mntput(ufs->upper_mnt); mntput(ufs->lower_mnt); + kfree(ufs->config.lowerdir); + kfree(ufs->config.upperdir); + kfree(ufs->config.workdir); kfree(ufs); } @@ -413,15 +425,27 @@ static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf) return err; } +/** + * ovl_show_options + * + * Prints the mount options for a given superblock. + * Returns zero; does not fail. + */ +static int ovl_show_options(struct seq_file *m, struct dentry *dentry) +{ + struct super_block *sb = dentry->d_sb; + struct ovl_fs *ufs = sb->s_fs_info; + + seq_printf(m, ",lowerdir=%s", ufs->config.lowerdir); + seq_printf(m, ",upperdir=%s", ufs->config.upperdir); + seq_printf(m, ",workdir=%s", ufs->config.workdir); + return 0; +} + static const struct super_operations ovl_super_operations = { .put_super = ovl_put_super, .statfs = ovl_statfs, -}; - -struct ovl_config { - char *lowerdir; - char *upperdir; - char *workdir; + .show_options = ovl_show_options, }; enum { @@ -442,10 +466,6 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) { char *p; - config->upperdir = NULL; - config->lowerdir = NULL; - config->workdir = NULL; - while ((p = strsep(&opt, ",")) != NULL) { int token; substring_t args[MAX_OPT_ARGS]; @@ -586,39 +606,40 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) struct dentry *root_dentry; struct ovl_entry *oe; struct ovl_fs *ufs; - struct ovl_config config; struct kstatfs statfs; int err; - err = ovl_parse_opt((char *) data, &config); - if (err) + err = -ENOMEM; + ufs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL); + if (!ufs) goto out; + err = ovl_parse_opt((char *) data, &ufs->config); + if (err) + goto out_free_config; + /* FIXME: workdir is not needed for a R/O mount */ err = -EINVAL; - if (!config.upperdir || !config.lowerdir || !config.workdir) { + if (!ufs->config.upperdir || !ufs->config.lowerdir || + !ufs->config.workdir) { pr_err("overlayfs: missing upperdir or lowerdir or workdir\n"); goto out_free_config; } err = -ENOMEM; - ufs = kmalloc(sizeof(struct ovl_fs), GFP_KERNEL); - if (!ufs) - goto out_free_config; - oe = ovl_alloc_entry(); if (oe == NULL) - goto out_free_ufs; + goto out_free_config; - err = ovl_mount_dir(config.upperdir, &upperpath); + err = ovl_mount_dir(ufs->config.upperdir, &upperpath); if (err) goto out_free_oe; - err = ovl_mount_dir(config.lowerdir, &lowerpath); + err = ovl_mount_dir(ufs->config.lowerdir, &lowerpath); if (err) goto out_put_upperpath; - err = ovl_mount_dir(config.workdir, &workpath); + err = ovl_mount_dir(ufs->config.workdir, &workpath); if (err) goto out_put_lowerpath; @@ -674,7 +695,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) err = PTR_ERR(ufs->workdir); if (IS_ERR(ufs->workdir)) { pr_err("overlayfs: failed to create directory %s/%s\n", - config.workdir, OVL_WORKDIR_NAME); + ufs->config.workdir, OVL_WORKDIR_NAME); goto out_put_lower_mnt; } @@ -729,12 +750,11 @@ out_put_upperpath: path_put(&upperpath); out_free_oe: kfree(oe); -out_free_ufs: - kfree(ufs); out_free_config: - kfree(config.lowerdir); - kfree(config.upperdir); - kfree(config.workdir); + kfree(ufs->config.lowerdir); + kfree(ufs->config.upperdir); + kfree(ufs->config.workdir); + kfree(ufs); out: return err; } From 8f963c31b87794a5c16a6ea155f8001cdd1d93c7 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 24 Oct 2014 00:14:39 +0200 Subject: [PATCH 140/236] fs: limit filesystem stacking depth Add a simple read-only counter to super_block that indicates how deep this is in the stack of filesystems. Previously ecryptfs was the only stackable filesystem and it explicitly disallowed multiple layers of itself. Overlayfs, however, can be stacked recursively and also may be stacked on top of ecryptfs or vice versa. To limit the kernel stack usage we must limit the depth of the filesystem stack. Initially the limit is set to 2. Signed-off-by: Miklos Szeredi (cherry picked from commit 69c433ed2ecd2d3264efd7afec4439524b319121) Signed-off-by: Alex Shi --- fs/ecryptfs/main.c | 7 +++++++ fs/overlayfs/super.c | 9 +++++++++ include/linux/fs.h | 11 +++++++++++ 3 files changed, 27 insertions(+) diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 34eb8433d93f..d9eb84bda559 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -576,6 +576,13 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags s->s_maxbytes = path.dentry->d_sb->s_maxbytes; s->s_blocksize = path.dentry->d_sb->s_blocksize; s->s_magic = ECRYPTFS_SUPER_MAGIC; + s->s_stack_depth = path.dentry->d_sb->s_stack_depth + 1; + + rc = -EINVAL; + if (s->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { + pr_err("eCryptfs: maximum fs stacking depth exceeded\n"); + goto out_free; + } inode = ecryptfs_get_inode(path.dentry->d_inode, s); rc = PTR_ERR(inode); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 7dcc24e84417..08b704cebfc4 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -677,6 +677,15 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) } ufs->lower_namelen = statfs.f_namelen; + sb->s_stack_depth = max(upperpath.mnt->mnt_sb->s_stack_depth, + lowerpath.mnt->mnt_sb->s_stack_depth) + 1; + + err = -EINVAL; + if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { + pr_err("overlayfs: maximum fs stacking depth exceeded\n"); + goto out_put_workpath; + } + ufs->upper_mnt = clone_private_mount(&upperpath); err = PTR_ERR(ufs->upper_mnt); if (IS_ERR(ufs->upper_mnt)) { diff --git a/include/linux/fs.h b/include/linux/fs.h index 18492c801c39..429af8d8f2a2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -256,6 +256,12 @@ struct iattr { */ #include +/* + * Maximum number of layers of fs stack. Needs to be limited to + * prevent kernel stack overflow + */ +#define FILESYSTEM_MAX_STACK_DEPTH 2 + /** * enum positive_aop_returns - aop return codes with specific semantics * @@ -1336,6 +1342,11 @@ struct super_block { struct list_lru s_dentry_lru ____cacheline_aligned_in_smp; struct list_lru s_inode_lru ____cacheline_aligned_in_smp; struct rcu_head rcu; + + /* + * Indicates how deep in a filesystem stack this SB is + */ + int s_stack_depth; }; extern struct timespec current_fs_time(struct super_block *sb); From a5613f10c5230eb50a863caf596774c50ec8f3e9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 23 Oct 2014 22:56:05 -0400 Subject: [PATCH 141/236] overlayfs: don't hold ->i_mutex over opening the real directory just use it to serialize the assignment Signed-off-by: Al Viro (cherry picked from commit 3d268c9b136f51385f9d041f3f2424501b257388) Signed-off-by: Alex Shi --- fs/overlayfs/readdir.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index c6787f84ece9..b7d9fb098840 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -458,20 +458,27 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, if (!od->is_upper && ovl_path_type(dentry) == OVL_PATH_MERGE) { struct inode *inode = file_inode(file); - mutex_lock(&inode->i_mutex); realfile = od->upperfile; if (!realfile) { struct path upperpath; ovl_path_upper(dentry, &upperpath); realfile = ovl_path_open(&upperpath, O_RDONLY); - if (IS_ERR(realfile)) { - mutex_unlock(&inode->i_mutex); - return PTR_ERR(realfile); + mutex_lock(&inode->i_mutex); + if (!od->upperfile) { + if (IS_ERR(realfile)) { + mutex_unlock(&inode->i_mutex); + return PTR_ERR(realfile); + } + od->upperfile = realfile; + } else { + /* somebody has beaten us to it */ + if (!IS_ERR(realfile)) + fput(realfile); + realfile = od->upperfile; } - od->upperfile = realfile; + mutex_unlock(&inode->i_mutex); } - mutex_unlock(&inode->i_mutex); } return vfs_fsync_range(realfile, start, end, datasync); From c0913e8f810834e7f4645050bfcb5e1c9f06fd53 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 23 Oct 2014 22:58:56 -0400 Subject: [PATCH 142/236] overlayfs: make ovl_cache_entry->name an array instead of pointer Signed-off-by: Al Viro (cherry picked from commit 68bf8611076a8e4bee8bc8d03ff28bd1e9a9c631) Signed-off-by: Alex Shi --- fs/overlayfs/readdir.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index b7d9fb098840..9c9872be2c72 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -18,13 +18,13 @@ #include "overlayfs.h" struct ovl_cache_entry { - const char *name; unsigned int len; unsigned int type; u64 ino; bool is_whiteout; struct list_head l_node; struct rb_node node; + char name[]; }; struct ovl_dir_cache { @@ -82,13 +82,12 @@ static struct ovl_cache_entry *ovl_cache_entry_new(const char *name, int len, u64 ino, unsigned int d_type) { struct ovl_cache_entry *p; + size_t size = offsetof(struct ovl_cache_entry, name[len + 1]); - p = kmalloc(sizeof(*p) + len + 1, GFP_KERNEL); + p = kmalloc(size, GFP_KERNEL); if (p) { - char *name_copy = (char *) (p + 1); - memcpy(name_copy, name, len); - name_copy[len] = '\0'; - p->name = name_copy; + memcpy(p->name, name, len); + p->name[len] = '\0'; p->len = len; p->type = d_type; p->ino = ino; From 90887f5c1bb98d0d51933e4d2238b5cc888b56cf Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 23 Oct 2014 23:00:53 -0400 Subject: [PATCH 143/236] overlayfs: embed root into overlay_readdir_data no sense having it a pointer - all instances have it pointing to local variable in the same stack frame Signed-off-by: Al Viro (cherry picked from commit 49be4fb9cc3431fc4ebc71c764db848483b2a16c) Signed-off-by: Alex Shi --- fs/overlayfs/readdir.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 9c9872be2c72..a9ee2c1176fe 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -36,7 +36,7 @@ struct ovl_dir_cache { struct ovl_readdir_data { struct dir_context ctx; bool is_merge; - struct rb_root *root; + struct rb_root root; struct list_head *list; struct list_head *middle; int count; @@ -101,7 +101,7 @@ static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, const char *name, int len, u64 ino, unsigned int d_type) { - struct rb_node **newp = &rdd->root->rb_node; + struct rb_node **newp = &rdd->root.rb_node; struct rb_node *parent = NULL; struct ovl_cache_entry *p; @@ -126,7 +126,7 @@ static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, list_add_tail(&p->l_node, rdd->list); rb_link_node(&p->node, parent, newp); - rb_insert_color(&p->node, rdd->root); + rb_insert_color(&p->node, &rdd->root); return 0; } @@ -137,7 +137,7 @@ static int ovl_fill_lower(struct ovl_readdir_data *rdd, { struct ovl_cache_entry *p; - p = ovl_cache_entry_find(rdd->root, name, namelen); + p = ovl_cache_entry_find(&rdd->root, name, namelen); if (p) { list_move_tail(&p->l_node, rdd->middle); } else { @@ -277,12 +277,11 @@ static inline int ovl_dir_read_merged(struct path *upperpath, struct list_head *list) { int err; - struct rb_root root = RB_ROOT; struct list_head middle; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_merge, .list = list, - .root = &root, + .root = RB_ROOT, .is_merge = false, }; From 258e15586ba135f8534ab416c647d2f195e12df1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 23 Oct 2014 23:03:03 -0400 Subject: [PATCH 144/236] overlayfs: embed middle into overlay_readdir_data same story... Signed-off-by: Al Viro (cherry picked from commit db6ec212b53abc29a5bb6ac8c810010fc28d5191) Signed-off-by: Alex Shi --- fs/overlayfs/readdir.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index a9ee2c1176fe..910553f37aca 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -38,7 +38,7 @@ struct ovl_readdir_data { bool is_merge; struct rb_root root; struct list_head *list; - struct list_head *middle; + struct list_head middle; int count; int err; }; @@ -139,13 +139,13 @@ static int ovl_fill_lower(struct ovl_readdir_data *rdd, p = ovl_cache_entry_find(&rdd->root, name, namelen); if (p) { - list_move_tail(&p->l_node, rdd->middle); + list_move_tail(&p->l_node, &rdd->middle); } else { p = ovl_cache_entry_new(name, namelen, ino, d_type); if (p == NULL) rdd->err = -ENOMEM; else - list_add_tail(&p->l_node, rdd->middle); + list_add_tail(&p->l_node, &rdd->middle); } return rdd->err; @@ -277,7 +277,6 @@ static inline int ovl_dir_read_merged(struct path *upperpath, struct list_head *list) { int err; - struct list_head middle; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_merge, .list = list, @@ -301,11 +300,10 @@ static inline int ovl_dir_read_merged(struct path *upperpath, * Insert lowerpath entries before upperpath ones, this allows * offsets to be reasonably constant */ - list_add(&middle, rdd.list); - rdd.middle = &middle; + list_add(&rdd.middle, rdd.list); rdd.is_merge = true; err = ovl_dir_read(lowerpath, &rdd); - list_del(&middle); + list_del(&rdd.middle); } out: return err; From 313adf61d46e2fc1092a03540f33b1f1345b8c44 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 28 Oct 2014 18:27:28 -0400 Subject: [PATCH 145/236] overlayfs: barriers for opening upper-layer directory make sure that a) all stores done by opening struct file don't leak past storing the reference in od->upperfile b) the lockless side has read dependency barrier Signed-off-by: Al Viro (cherry picked from commit d45f00ae43e63eff1b3d79df20610ae1ef645ebd) Signed-off-by: Alex Shi --- fs/overlayfs/readdir.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 910553f37aca..8c8ce9d87ba3 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -454,12 +454,13 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, if (!od->is_upper && ovl_path_type(dentry) == OVL_PATH_MERGE) { struct inode *inode = file_inode(file); - realfile = od->upperfile; + realfile =lockless_dereference(od->upperfile); if (!realfile) { struct path upperpath; ovl_path_upper(dentry, &upperpath); realfile = ovl_path_open(&upperpath, O_RDONLY); + smp_mb__before_spinlock(); mutex_lock(&inode->i_mutex); if (!od->upperfile) { if (IS_ERR(realfile)) { From 2b3b22f12809d5406745005ea0c20c4945dfe789 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 27 Oct 2014 13:48:48 +0100 Subject: [PATCH 146/236] ovl: fix check for cursor ovl_cache_entry.name is now an array not a pointer, so it makes no sense test for it being NULL. Detected by coverity. From: Miklos Szeredi Fixes: 68bf8611076a ("overlayfs: make ovl_cache_entry->name an array instead of +pointer") Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro (cherry picked from commit c2096537d40f026672c4c6adfcd7247ce5799604) Signed-off-by: Alex Shi --- fs/overlayfs/readdir.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 8c8ce9d87ba3..3fbf0d306e12 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -21,9 +21,10 @@ struct ovl_cache_entry { unsigned int len; unsigned int type; u64 ino; - bool is_whiteout; struct list_head l_node; struct rb_node node; + bool is_whiteout; + bool is_cursor; char name[]; }; @@ -251,7 +252,7 @@ static int ovl_dir_mark_whiteouts(struct dentry *dir, mutex_lock(&dir->d_inode->i_mutex); list_for_each_entry(p, rdd->list, l_node) { - if (!p->name) + if (p->is_cursor) continue; if (p->type != DT_CHR) @@ -307,7 +308,6 @@ static inline int ovl_dir_read_merged(struct path *upperpath, } out: return err; - } static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos) @@ -316,7 +316,7 @@ static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos) loff_t off = 0; list_for_each_entry(p, &od->cache->entries, l_node) { - if (!p->name) + if (p->is_cursor) continue; if (off >= pos) break; @@ -389,7 +389,7 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx) p = list_entry(od->cursor.l_node.next, struct ovl_cache_entry, l_node); /* Skip cursors */ - if (p->name) { + if (!p->is_cursor) { if (!p->is_whiteout) { if (!dir_emit(ctx, p->name, p->len, p->ino, p->type)) break; @@ -519,6 +519,7 @@ static int ovl_dir_open(struct inode *inode, struct file *file) od->realfile = realfile; od->is_real = (type != OVL_PATH_MERGE); od->is_upper = (type != OVL_PATH_LOWER); + od->cursor.is_cursor = true; file->private_data = od; return 0; From 9ba0656673b59bcae46ce2a7849005a07d494fac Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 27 Oct 2014 15:42:01 +0100 Subject: [PATCH 147/236] overlayfs: fix lockdep misannotation In an overlay directory that shadows an empty lower directory, say /mnt/a/empty102, do: touch /mnt/a/empty102/x unlink /mnt/a/empty102/x rmdir /mnt/a/empty102 It's actually harmless, but needs another level of nesting between I_MUTEX_CHILD and I_MUTEX_NORMAL. Signed-off-by: Miklos Szeredi Tested-by: David Howells Signed-off-by: Al Viro (cherry picked from commit d1b72cc6d8cb766c802fdc70a5edc2f0ba8a2b57) Signed-off-by: Alex Shi --- fs/namei.c | 2 +- fs/overlayfs/readdir.c | 2 +- include/linux/fs.h | 9 ++++++--- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index a047265e629e..0fbf1504fb63 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2473,7 +2473,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) } mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD); + mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT2); return NULL; } diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 3fbf0d306e12..401f0840f5cc 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -571,7 +571,7 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list) { struct ovl_cache_entry *p; - mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD); list_for_each_entry(p, list, l_node) { struct dentry *dentry; diff --git a/include/linux/fs.h b/include/linux/fs.h index 429af8d8f2a2..1ec29cc21c6f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -641,11 +641,13 @@ static inline int inode_unhashed(struct inode *inode) * 2: child/target * 3: xattr * 4: second non-directory - * The last is for certain operations (such as rename) which lock two + * 5: second parent (when locking independent directories in rename) + * + * I_MUTEX_NONDIR2 is for certain operations (such as rename) which lock two * non-directories at once. * * The locking order between these classes is - * parent -> child -> normal -> xattr -> second non-directory + * parent[2] -> child -> grandchild -> normal -> xattr -> second non-directory */ enum inode_i_mutex_lock_class { @@ -653,7 +655,8 @@ enum inode_i_mutex_lock_class I_MUTEX_PARENT, I_MUTEX_CHILD, I_MUTEX_XATTR, - I_MUTEX_NONDIR2 + I_MUTEX_NONDIR2, + I_MUTEX_PARENT2, }; void lock_two_nondirectories(struct inode *, struct inode*); From a996f76df429a061985a5e7b1904c6574315a818 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 31 Oct 2014 20:02:42 +0100 Subject: [PATCH 148/236] ovl: initialize ->is_cursor Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro (cherry picked from commit 9f2f7d4c8dfcf4617af5de6ea381b91deac3db48) Signed-off-by: Alex Shi --- fs/overlayfs/readdir.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 401f0840f5cc..4e9d7c1fea52 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -93,6 +93,7 @@ static struct ovl_cache_entry *ovl_cache_entry_new(const char *name, int len, p->type = d_type; p->ino = ino; p->is_whiteout = false; + p->is_cursor = false; } return p; From 30d1df13eef6be298b5f783a2b36b82cf1c0f033 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 30 Oct 2014 17:37:34 +0100 Subject: [PATCH 149/236] vfs: make first argument of dir_context.actor typed Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro (cherry picked from commit ac7576f4b1da8c9c6bc1ae026c2b9e86ae617ba5) Signed-off-by: Alex Shi --- arch/alpha/kernel/osf_sys.c | 7 ++++--- arch/parisc/hpux/fs.c | 7 ++++--- .../staging/lustre/lustre/llite/llite_nfs.c | 8 ++++--- fs/afs/dir.c | 9 ++++---- fs/compat.c | 21 ++++++++++++------- fs/ecryptfs/file.c | 6 +++--- fs/exportfs/expfs.c | 5 +++-- fs/fat/dir.c | 5 +++-- fs/gfs2/export.c | 8 ++++--- fs/hppfs/hppfs.c | 5 +++-- fs/nfsd/nfs4recover.c | 5 +++-- fs/nfsd/vfs.c | 12 ++++++----- fs/nfsd/vfs.h | 4 ++-- fs/ocfs2/dir.c | 8 ++++--- fs/ocfs2/journal.c | 8 ++++--- fs/overlayfs/readdir.c | 8 ++++--- fs/readdir.c | 21 +++++++++++-------- fs/reiserfs/xattr.c | 15 +++++++------ include/linux/fs.h | 5 ++++- 19 files changed, 100 insertions(+), 67 deletions(-) diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index 1402fcc11c2c..98f57d34923a 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -104,11 +104,12 @@ struct osf_dirent_callback { }; static int -osf_filldir(void *__buf, const char *name, int namlen, loff_t offset, - u64 ino, unsigned int d_type) +osf_filldir(struct dir_context *ctx, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) { struct osf_dirent __user *dirent; - struct osf_dirent_callback *buf = (struct osf_dirent_callback *) __buf; + struct osf_dirent_callback *buf = + container_of(ctx, struct osf_dirent_callback, ctx); unsigned int reclen = ALIGN(NAME_OFFSET + namlen + 1, sizeof(u32)); unsigned int d_ino; diff --git a/arch/parisc/hpux/fs.c b/arch/parisc/hpux/fs.c index 2bedafea3d94..97a7bf8df348 100644 --- a/arch/parisc/hpux/fs.c +++ b/arch/parisc/hpux/fs.c @@ -56,11 +56,12 @@ struct getdents_callback { #define NAME_OFFSET(de) ((int) ((de)->d_name - (char __user *) (de))) -static int filldir(void * __buf, const char * name, int namlen, loff_t offset, - u64 ino, unsigned d_type) +static int filldir(struct dir_context *ctx, const char *name, int namlen, + loff_t offset, u64 ino, unsigned d_type) { struct hpux_dirent __user * dirent; - struct getdents_callback * buf = (struct getdents_callback *) __buf; + struct getdents_callback *buf = + container_of(ctx, struct getdents_callback, ctx); ino_t d_ino; int reclen = ALIGN(NAME_OFFSET(dirent) + namlen + 1, sizeof(long)); diff --git a/drivers/staging/lustre/lustre/llite/llite_nfs.c b/drivers/staging/lustre/lustre/llite/llite_nfs.c index 1767c741fb72..ed35a88c6dd9 100644 --- a/drivers/staging/lustre/lustre/llite/llite_nfs.c +++ b/drivers/staging/lustre/lustre/llite/llite_nfs.c @@ -205,13 +205,15 @@ static int ll_encode_fh(struct inode *inode, __u32 *fh, int *plen, return LUSTRE_NFS_FID; } -static int ll_nfs_get_name_filldir(void *cookie, const char *name, int namelen, - loff_t hash, u64 ino, unsigned type) +static int ll_nfs_get_name_filldir(struct dir_context *ctx, const char *name, + int namelen, loff_t hash, u64 ino, + unsigned type) { /* It is hack to access lde_fid for comparison with lgd_fid. * So the input 'name' must be part of the 'lu_dirent'. */ struct lu_dirent *lde = container_of0(name, struct lu_dirent, lde_name); - struct ll_getname_data *lgd = cookie; + struct ll_getname_data *lgd = + container_of(ctx, struct ll_getname_data, ctx); struct lu_fid fid; fid_le_to_cpu(&fid, &lde->lde_fid); diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 529300327f45..5479af6de6ce 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -26,7 +26,7 @@ static int afs_readdir(struct file *file, struct dir_context *ctx); static int afs_d_revalidate(struct dentry *dentry, unsigned int flags); static int afs_d_delete(const struct dentry *dentry); static void afs_d_release(struct dentry *dentry); -static int afs_lookup_filldir(void *_cookie, const char *name, int nlen, +static int afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen, loff_t fpos, u64 ino, unsigned dtype); static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl); @@ -391,10 +391,11 @@ static int afs_readdir(struct file *file, struct dir_context *ctx) * - if afs_dir_iterate_block() spots this function, it'll pass the FID * uniquifier through dtype */ -static int afs_lookup_filldir(void *_cookie, const char *name, int nlen, - loff_t fpos, u64 ino, unsigned dtype) +static int afs_lookup_filldir(struct dir_context *ctx, const char *name, + int nlen, loff_t fpos, u64 ino, unsigned dtype) { - struct afs_lookup_cookie *cookie = _cookie; + struct afs_lookup_cookie *cookie = + container_of(ctx, struct afs_lookup_cookie, ctx); _enter("{%s,%u},%s,%u,,%llu,%u", cookie->name.name, cookie->name.len, name, nlen, diff --git a/fs/compat.c b/fs/compat.c index 6af20de2c1a3..14da9b327ffa 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -837,10 +837,12 @@ struct compat_readdir_callback { int result; }; -static int compat_fillonedir(void *__buf, const char *name, int namlen, - loff_t offset, u64 ino, unsigned int d_type) +static int compat_fillonedir(struct dir_context *ctx, const char *name, + int namlen, loff_t offset, u64 ino, + unsigned int d_type) { - struct compat_readdir_callback *buf = __buf; + struct compat_readdir_callback *buf = + container_of(ctx, struct compat_readdir_callback, ctx); struct compat_old_linux_dirent __user *dirent; compat_ulong_t d_ino; @@ -905,11 +907,12 @@ struct compat_getdents_callback { int error; }; -static int compat_filldir(void *__buf, const char *name, int namlen, +static int compat_filldir(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct compat_linux_dirent __user * dirent; - struct compat_getdents_callback *buf = __buf; + struct compat_getdents_callback *buf = + container_of(ctx, struct compat_getdents_callback, ctx); compat_ulong_t d_ino; int reclen = ALIGN(offsetof(struct compat_linux_dirent, d_name) + namlen + 2, sizeof(compat_long_t)); @@ -991,11 +994,13 @@ struct compat_getdents_callback64 { int error; }; -static int compat_filldir64(void * __buf, const char * name, int namlen, loff_t offset, - u64 ino, unsigned int d_type) +static int compat_filldir64(struct dir_context *ctx, const char *name, + int namlen, loff_t offset, u64 ino, + unsigned int d_type) { struct linux_dirent64 __user *dirent; - struct compat_getdents_callback64 *buf = __buf; + struct compat_getdents_callback64 *buf = + container_of(ctx, struct compat_getdents_callback64, ctx); int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1, sizeof(u64)); u64 off; diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 03df50211c48..121a94876d40 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -76,11 +76,11 @@ struct ecryptfs_getdents_callback { /* Inspired by generic filldir in fs/readdir.c */ static int -ecryptfs_filldir(void *dirent, const char *lower_name, int lower_namelen, - loff_t offset, u64 ino, unsigned int d_type) +ecryptfs_filldir(struct dir_context *ctx, const char *lower_name, + int lower_namelen, loff_t offset, u64 ino, unsigned int d_type) { struct ecryptfs_getdents_callback *buf = - (struct ecryptfs_getdents_callback *)dirent; + container_of(ctx, struct ecryptfs_getdents_callback, ctx); size_t name_size; char *name; int rc; diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 48a359dd286e..59d339c417c9 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -241,10 +241,11 @@ struct getdents_callback { * A rather strange filldir function to capture * the name matching the specified inode number. */ -static int filldir_one(void * __buf, const char * name, int len, +static int filldir_one(struct dir_context *ctx, const char *name, int len, loff_t pos, u64 ino, unsigned int d_type) { - struct getdents_callback *buf = __buf; + struct getdents_callback *buf = + container_of(ctx, struct getdents_callback, ctx); int result = 0; buf->sequence++; diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 3963ede84eb0..c5d6bb939d19 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -702,10 +702,11 @@ static int fat_readdir(struct file *file, struct dir_context *ctx) } #define FAT_IOCTL_FILLDIR_FUNC(func, dirent_type) \ -static int func(void *__buf, const char *name, int name_len, \ +static int func(struct dir_context *ctx, const char *name, int name_len, \ loff_t offset, u64 ino, unsigned int d_type) \ { \ - struct fat_ioctl_filldir_callback *buf = __buf; \ + struct fat_ioctl_filldir_callback *buf = \ + container_of(ctx, struct fat_ioctl_filldir_callback, ctx); \ struct dirent_type __user *d1 = buf->dirent; \ struct dirent_type __user *d2 = d1 + 1; \ \ diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index 8b9b3775e2e7..c41d255b6a7b 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -69,10 +69,12 @@ struct get_name_filldir { char *name; }; -static int get_name_filldir(void *opaque, const char *name, int length, - loff_t offset, u64 inum, unsigned int type) +static int get_name_filldir(struct dir_context *ctx, const char *name, + int length, loff_t offset, u64 inum, + unsigned int type) { - struct get_name_filldir *gnfd = opaque; + struct get_name_filldir *gnfd = + container_of(ctx, struct get_name_filldir, ctx); if (inum != gnfd->inum.no_addr) return 0; diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index 4338ff32959d..5f2755117ce7 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -548,10 +548,11 @@ struct hppfs_dirent { struct dentry *dentry; }; -static int hppfs_filldir(void *d, const char *name, int size, +static int hppfs_filldir(struct dir_context *ctx, const char *name, int size, loff_t offset, u64 inode, unsigned int type) { - struct hppfs_dirent *dirent = d; + struct hppfs_dirent *dirent = + container_of(ctx, struct hppfs_dirent, ctx); if (file_removed(dirent->dentry, name)) return 0; diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 9c271f42604a..674a5d529e09 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -244,10 +244,11 @@ struct nfs4_dir_ctx { }; static int -nfsd4_build_namelist(void *arg, const char *name, int namlen, +nfsd4_build_namelist(struct dir_context *__ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { - struct nfs4_dir_ctx *ctx = arg; + struct nfs4_dir_ctx *ctx = + container_of(__ctx, struct nfs4_dir_ctx, ctx); struct name_list *entry; if (namlen != HEXDIR_LEN - 1) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 464f813a2544..3953a20db16d 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1808,10 +1808,12 @@ struct readdir_data { int full; }; -static int nfsd_buffered_filldir(void *__buf, const char *name, int namlen, - loff_t offset, u64 ino, unsigned int d_type) +static int nfsd_buffered_filldir(struct dir_context *ctx, const char *name, + int namlen, loff_t offset, u64 ino, + unsigned int d_type) { - struct readdir_data *buf = __buf; + struct readdir_data *buf = + container_of(ctx, struct readdir_data, ctx); struct buffered_dirent *de = (void *)(buf->dirent + buf->used); unsigned int reclen; @@ -1831,7 +1833,7 @@ static int nfsd_buffered_filldir(void *__buf, const char *name, int namlen, return 0; } -static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func, +static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func, struct readdir_cd *cdp, loff_t *offsetp) { struct buffered_dirent *de; @@ -1915,7 +1917,7 @@ static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func, */ __be32 nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, - struct readdir_cd *cdp, filldir_t func) + struct readdir_cd *cdp, nfsd_filldir_t func) { __be32 err; struct file *file; diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index fbe90bdb2214..ea760b17ecbd 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -36,7 +36,7 @@ /* * Callback function for readdir */ -typedef int (*nfsd_dirop_t)(struct inode *, struct dentry *, int, int); +typedef int (*nfsd_filldir_t)(void *, const char *, int, loff_t, u64, unsigned); /* nfsd/vfs.c */ int nfsd_racache_init(int); @@ -89,7 +89,7 @@ __be32 nfsd_rename(struct svc_rqst *, __be32 nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type, char *name, int len); __be32 nfsd_readdir(struct svc_rqst *, struct svc_fh *, - loff_t *, struct readdir_cd *, filldir_t); + loff_t *, struct readdir_cd *, nfsd_filldir_t); __be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *, struct kstatfs *, int access); diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 91a7e85ac8fd..478e14d7be54 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -2073,10 +2073,12 @@ struct ocfs2_empty_dir_priv { unsigned seen_other; unsigned dx_dir; }; -static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len, - loff_t pos, u64 ino, unsigned type) +static int ocfs2_empty_dir_filldir(struct dir_context *ctx, const char *name, + int name_len, loff_t pos, u64 ino, + unsigned type) { - struct ocfs2_empty_dir_priv *p = priv; + struct ocfs2_empty_dir_priv *p = + container_of(ctx, struct ocfs2_empty_dir_priv, ctx); /* * Check the positions of "." and ".." records to be sure diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 44fc3e530c3d..a1b7dca075dd 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1981,10 +1981,12 @@ struct ocfs2_orphan_filldir_priv { struct ocfs2_super *osb; }; -static int ocfs2_orphan_filldir(void *priv, const char *name, int name_len, - loff_t pos, u64 ino, unsigned type) +static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, + int name_len, loff_t pos, u64 ino, + unsigned type) { - struct ocfs2_orphan_filldir_priv *p = priv; + struct ocfs2_orphan_filldir_priv *p = + container_of(ctx, struct ocfs2_orphan_filldir_priv, ctx); struct inode *iter; if (name_len == 1 && !strncmp(".", name, 1)) diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 4e9d7c1fea52..301f64aa8a45 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -180,10 +180,12 @@ static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry) } } -static int ovl_fill_merge(void *buf, const char *name, int namelen, - loff_t offset, u64 ino, unsigned int d_type) +static int ovl_fill_merge(struct dir_context *ctx, const char *name, + int namelen, loff_t offset, u64 ino, + unsigned int d_type) { - struct ovl_readdir_data *rdd = buf; + struct ovl_readdir_data *rdd = + container_of(ctx, struct ovl_readdir_data, ctx); rdd->count++; if (!rdd->is_merge) diff --git a/fs/readdir.c b/fs/readdir.c index 5b53d995cae6..e21af53ec24e 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -72,10 +72,11 @@ struct readdir_callback { int result; }; -static int fillonedir(void * __buf, const char * name, int namlen, loff_t offset, - u64 ino, unsigned int d_type) +static int fillonedir(struct dir_context *ctx, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) { - struct readdir_callback *buf = (struct readdir_callback *) __buf; + struct readdir_callback *buf = + container_of(ctx, struct readdir_callback, ctx); struct old_linux_dirent __user * dirent; unsigned long d_ino; @@ -146,11 +147,12 @@ struct getdents_callback { int error; }; -static int filldir(void * __buf, const char * name, int namlen, loff_t offset, - u64 ino, unsigned int d_type) +static int filldir(struct dir_context *ctx, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) { struct linux_dirent __user * dirent; - struct getdents_callback * buf = (struct getdents_callback *) __buf; + struct getdents_callback *buf = + container_of(ctx, struct getdents_callback, ctx); unsigned long d_ino; int reclen = ALIGN(offsetof(struct linux_dirent, d_name) + namlen + 2, sizeof(long)); @@ -230,11 +232,12 @@ struct getdents_callback64 { int error; }; -static int filldir64(void * __buf, const char * name, int namlen, loff_t offset, - u64 ino, unsigned int d_type) +static int filldir64(struct dir_context *ctx, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) { struct linux_dirent64 __user *dirent; - struct getdents_callback64 * buf = (struct getdents_callback64 *) __buf; + struct getdents_callback64 *buf = + container_of(ctx, struct getdents_callback64, ctx); int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1, sizeof(u64)); diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 5cdfbd638b5c..d8b7acfd5066 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -177,10 +177,11 @@ struct reiserfs_dentry_buf { }; static int -fill_with_dentries(void *buf, const char *name, int namelen, loff_t offset, - u64 ino, unsigned int d_type) +fill_with_dentries(struct dir_context *ctx, const char *name, int namelen, + loff_t offset, u64 ino, unsigned int d_type) { - struct reiserfs_dentry_buf *dbuf = buf; + struct reiserfs_dentry_buf *dbuf = + container_of(ctx, struct reiserfs_dentry_buf, ctx); struct dentry *dentry; WARN_ON_ONCE(!mutex_is_locked(&dbuf->xadir->d_inode->i_mutex)); @@ -794,10 +795,12 @@ struct listxattr_buf { struct dentry *dentry; }; -static int listxattr_filler(void *buf, const char *name, int namelen, - loff_t offset, u64 ino, unsigned int d_type) +static int listxattr_filler(struct dir_context *ctx, const char *name, + int namelen, loff_t offset, u64 ino, + unsigned int d_type) { - struct listxattr_buf *b = (struct listxattr_buf *)buf; + struct listxattr_buf *b = + container_of(ctx, struct listxattr_buf, ctx); size_t size; if (name[0] != '.' || (namelen != 1 && (name[1] != '.' || namelen != 2))) { diff --git a/include/linux/fs.h b/include/linux/fs.h index 1ec29cc21c6f..e1cee8b12d1a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1530,7 +1530,10 @@ int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags); * This allows the kernel to read directories into kernel space or * to have different dirent layouts depending on the binary type. */ -typedef int (*filldir_t)(void *, const char *, int, loff_t, u64, unsigned); +struct dir_context; +typedef int (*filldir_t)(struct dir_context *, const char *, int, loff_t, u64, + unsigned); + struct dir_context { const filldir_t actor; loff_t pos; From 68a4dee66f6231bb2a5058c58e9e9ebea8149b5c Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 4 Nov 2014 16:11:03 +0100 Subject: [PATCH 150/236] ovl: don't poison cursor ovl_cache_put() can be called from ovl_dir_reset() if the cache needs to be rebuilt. We did list_del() on the cursor, which results in an Oops on the poisoned pointer in ovl_seek_cursor(). Reported-by: Jordi Pujol Palomer Signed-off-by: Miklos Szeredi Tested-by: Jordi Pujol Palomer Signed-off-by: Al Viro (cherry picked from commit 3f822c6264954660babce757fb45792fd3af273e) Signed-off-by: Alex Shi --- fs/overlayfs/readdir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 301f64aa8a45..fdb63deff1a7 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -168,7 +168,7 @@ static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry) { struct ovl_dir_cache *cache = od->cache; - list_del(&od->cursor.l_node); + list_del_init(&od->cursor.l_node); WARN_ON(cache->refcount <= 0); cache->refcount--; if (!cache->refcount) { From 5c5ce110a498fc025c7ca85165ba2f5fa1ddef43 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 20 Nov 2014 16:39:59 +0100 Subject: [PATCH 151/236] ovl: rename filesystem type to "overlay" Some distributions carry an "old" format of overlayfs while mainline has a "new" format. The distros will possibly want to keep the old overlayfs alongside the new for compatibility reasons. To make it possible to differentiate the two versions change the name of the new one from "overlayfs" to "overlay". Signed-off-by: Miklos Szeredi Reported-by: Serge Hallyn Cc: Andy Whitcroft (cherry picked from commit ef94b1864d1ed5be54376404bb23d22ed0481feb) Signed-off-by: Alex Shi --- Documentation/filesystems/overlayfs.txt | 198 ++++++++++++++++++++++++ MAINTAINERS | 7 + fs/Makefile | 2 +- fs/overlayfs/Kconfig | 2 +- fs/overlayfs/Makefile | 4 +- fs/overlayfs/super.c | 6 +- 6 files changed, 212 insertions(+), 7 deletions(-) create mode 100644 Documentation/filesystems/overlayfs.txt diff --git a/Documentation/filesystems/overlayfs.txt b/Documentation/filesystems/overlayfs.txt new file mode 100644 index 000000000000..a27c950ece61 --- /dev/null +++ b/Documentation/filesystems/overlayfs.txt @@ -0,0 +1,198 @@ +Written by: Neil Brown + +Overlay Filesystem +================== + +This document describes a prototype for a new approach to providing +overlay-filesystem functionality in Linux (sometimes referred to as +union-filesystems). An overlay-filesystem tries to present a +filesystem which is the result over overlaying one filesystem on top +of the other. + +The result will inevitably fail to look exactly like a normal +filesystem for various technical reasons. The expectation is that +many use cases will be able to ignore these differences. + +This approach is 'hybrid' because the objects that appear in the +filesystem do not all appear to belong to that filesystem. In many +cases an object accessed in the union will be indistinguishable +from accessing the corresponding object from the original filesystem. +This is most obvious from the 'st_dev' field returned by stat(2). + +While directories will report an st_dev from the overlay-filesystem, +all non-directory objects will report an st_dev from the lower or +upper filesystem that is providing the object. Similarly st_ino will +only be unique when combined with st_dev, and both of these can change +over the lifetime of a non-directory object. Many applications and +tools ignore these values and will not be affected. + +Upper and Lower +--------------- + +An overlay filesystem combines two filesystems - an 'upper' filesystem +and a 'lower' filesystem. When a name exists in both filesystems, the +object in the 'upper' filesystem is visible while the object in the +'lower' filesystem is either hidden or, in the case of directories, +merged with the 'upper' object. + +It would be more correct to refer to an upper and lower 'directory +tree' rather than 'filesystem' as it is quite possible for both +directory trees to be in the same filesystem and there is no +requirement that the root of a filesystem be given for either upper or +lower. + +The lower filesystem can be any filesystem supported by Linux and does +not need to be writable. The lower filesystem can even be another +overlayfs. The upper filesystem will normally be writable and if it +is it must support the creation of trusted.* extended attributes, and +must provide valid d_type in readdir responses, so NFS is not suitable. + +A read-only overlay of two read-only filesystems may use any +filesystem type. + +Directories +----------- + +Overlaying mainly involves directories. If a given name appears in both +upper and lower filesystems and refers to a non-directory in either, +then the lower object is hidden - the name refers only to the upper +object. + +Where both upper and lower objects are directories, a merged directory +is formed. + +At mount time, the two directories given as mount options "lowerdir" and +"upperdir" are combined into a merged directory: + + mount -t overlay overlay -olowerdir=/lower,upperdir=/upper,\ +workdir=/work /merged + +The "workdir" needs to be an empty directory on the same filesystem +as upperdir. + +Then whenever a lookup is requested in such a merged directory, the +lookup is performed in each actual directory and the combined result +is cached in the dentry belonging to the overlay filesystem. If both +actual lookups find directories, both are stored and a merged +directory is created, otherwise only one is stored: the upper if it +exists, else the lower. + +Only the lists of names from directories are merged. Other content +such as metadata and extended attributes are reported for the upper +directory only. These attributes of the lower directory are hidden. + +whiteouts and opaque directories +-------------------------------- + +In order to support rm and rmdir without changing the lower +filesystem, an overlay filesystem needs to record in the upper filesystem +that files have been removed. This is done using whiteouts and opaque +directories (non-directories are always opaque). + +A whiteout is created as a character device with 0/0 device number. +When a whiteout is found in the upper level of a merged directory, any +matching name in the lower level is ignored, and the whiteout itself +is also hidden. + +A directory is made opaque by setting the xattr "trusted.overlay.opaque" +to "y". Where the upper filesystem contains an opaque directory, any +directory in the lower filesystem with the same name is ignored. + +readdir +------- + +When a 'readdir' request is made on a merged directory, the upper and +lower directories are each read and the name lists merged in the +obvious way (upper is read first, then lower - entries that already +exist are not re-added). This merged name list is cached in the +'struct file' and so remains as long as the file is kept open. If the +directory is opened and read by two processes at the same time, they +will each have separate caches. A seekdir to the start of the +directory (offset 0) followed by a readdir will cause the cache to be +discarded and rebuilt. + +This means that changes to the merged directory do not appear while a +directory is being read. This is unlikely to be noticed by many +programs. + +seek offsets are assigned sequentially when the directories are read. +Thus if + - read part of a directory + - remember an offset, and close the directory + - re-open the directory some time later + - seek to the remembered offset + +there may be little correlation between the old and new locations in +the list of filenames, particularly if anything has changed in the +directory. + +Readdir on directories that are not merged is simply handled by the +underlying directory (upper or lower). + + +Non-directories +--------------- + +Objects that are not directories (files, symlinks, device-special +files etc.) are presented either from the upper or lower filesystem as +appropriate. When a file in the lower filesystem is accessed in a way +the requires write-access, such as opening for write access, changing +some metadata etc., the file is first copied from the lower filesystem +to the upper filesystem (copy_up). Note that creating a hard-link +also requires copy_up, though of course creation of a symlink does +not. + +The copy_up may turn out to be unnecessary, for example if the file is +opened for read-write but the data is not modified. + +The copy_up process first makes sure that the containing directory +exists in the upper filesystem - creating it and any parents as +necessary. It then creates the object with the same metadata (owner, +mode, mtime, symlink-target etc.) and then if the object is a file, the +data is copied from the lower to the upper filesystem. Finally any +extended attributes are copied up. + +Once the copy_up is complete, the overlay filesystem simply +provides direct access to the newly created file in the upper +filesystem - future operations on the file are barely noticed by the +overlay filesystem (though an operation on the name of the file such as +rename or unlink will of course be noticed and handled). + + +Non-standard behavior +--------------------- + +The copy_up operation essentially creates a new, identical file and +moves it over to the old name. The new file may be on a different +filesystem, so both st_dev and st_ino of the file may change. + +Any open files referring to this inode will access the old data and +metadata. Similarly any file locks obtained before copy_up will not +apply to the copied up file. + +On a file opened with O_RDONLY fchmod(2), fchown(2), futimesat(2) and +fsetxattr(2) will fail with EROFS. + +If a file with multiple hard links is copied up, then this will +"break" the link. Changes will not be propagated to other names +referring to the same inode. + +Symlinks in /proc/PID/ and /proc/PID/fd which point to a non-directory +object in overlayfs will not contain valid absolute paths, only +relative paths leading up to the filesystem's root. This will be +fixed in the future. + +Some operations are not atomic, for example a crash during copy_up or +rename will leave the filesystem in an inconsistent state. This will +be addressed in the future. + +Changes to underlying filesystems +--------------------------------- + +Offline changes, when the overlay is not mounted, are allowed to either +the upper or the lower trees. + +Changes to the underlying filesystems while part of a mounted overlay +filesystem are not allowed. If the underlying filesystem is changed, +the behavior of the overlay is undefined, though it will not result in +a crash or deadlock. diff --git a/MAINTAINERS b/MAINTAINERS index 900d98eec2fc..976dae0ad2e5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6452,6 +6452,13 @@ F: drivers/scsi/osd/ F: include/scsi/osd_* F: fs/exofs/ +OVERLAY FILESYSTEM +M: Miklos Szeredi +L: linux-fsdevel@vger.kernel.org +S: Supported +F: fs/overlayfs/* +F: Documentation/filesystems/overlayfs.txt + P54 WIRELESS DRIVER M: Christian Lamparter L: linux-wireless@vger.kernel.org diff --git a/fs/Makefile b/fs/Makefile index 5eb9caf136b4..b5954404e02d 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -104,7 +104,7 @@ obj-$(CONFIG_QNX6FS_FS) += qnx6/ obj-$(CONFIG_AUTOFS4_FS) += autofs4/ obj-$(CONFIG_ADFS_FS) += adfs/ obj-$(CONFIG_FUSE_FS) += fuse/ -obj-$(CONFIG_OVERLAYFS_FS) += overlayfs/ +obj-$(CONFIG_OVERLAY_FS) += overlayfs/ obj-$(CONFIG_UDF_FS) += udf/ obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/ obj-$(CONFIG_OMFS_FS) += omfs/ diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index e60125976873..34355818a2e0 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -1,4 +1,4 @@ -config OVERLAYFS_FS +config OVERLAY_FS tristate "Overlay filesystem support" help An overlay filesystem combines two filesystems - an 'upper' filesystem diff --git a/fs/overlayfs/Makefile b/fs/overlayfs/Makefile index 8f91889480d0..900daed3e91d 100644 --- a/fs/overlayfs/Makefile +++ b/fs/overlayfs/Makefile @@ -2,6 +2,6 @@ # Makefile for the overlay filesystem. # -obj-$(CONFIG_OVERLAYFS_FS) += overlayfs.o +obj-$(CONFIG_OVERLAY_FS) += overlay.o -overlayfs-objs := super.o inode.o dir.o readdir.o copy_up.o +overlay-objs := super.o inode.o dir.o readdir.o copy_up.o diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 08b704cebfc4..b92bd1829cf7 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -24,7 +24,7 @@ MODULE_AUTHOR("Miklos Szeredi "); MODULE_DESCRIPTION("Overlay filesystem"); MODULE_LICENSE("GPL"); -#define OVERLAYFS_SUPER_MAGIC 0x794c764f +#define OVERLAYFS_SUPER_MAGIC 0x794c7630 struct ovl_config { char *lowerdir; @@ -776,11 +776,11 @@ static struct dentry *ovl_mount(struct file_system_type *fs_type, int flags, static struct file_system_type ovl_fs_type = { .owner = THIS_MODULE, - .name = "overlayfs", + .name = "overlay", .mount = ovl_mount, .kill_sb = kill_anon_super, }; -MODULE_ALIAS_FS("overlayfs"); +MODULE_ALIAS_FS("overlay"); static int __init ovl_init(void) { From 67b24788bb53d0b36bad84205ab940718003bffc Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 20 Nov 2014 16:39:59 +0100 Subject: [PATCH 152/236] ovl: fix remove/copy-up race ovl_remove_and_whiteout() needs to check if upper dentry exists or not after having locked upper parent directory. Previously we used a "type" value computed before locking the upper parent directory, which is susceptible to racing with copy-up. There's a similar check in ovl_check_empty_and_clear(). This one is not actually racy, since copy-up doesn't change the "emptyness" property of a directory. Add a comment to this effect, and check the existence of upper dentry locally to make the code cleaner. Signed-off-by: Miklos Szeredi (cherry picked from commit a105d685a8483985a01776411de191a726b48132) Signed-off-by: Alex Shi --- fs/overlayfs/dir.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 15cd91ad9940..8ffc4b980f1b 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -284,8 +284,7 @@ out: return ERR_PTR(err); } -static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry, - enum ovl_path_type type) +static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry) { int err; struct dentry *ret = NULL; @@ -294,8 +293,17 @@ static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry, err = ovl_check_empty_dir(dentry, &list); if (err) ret = ERR_PTR(err); - else if (type == OVL_PATH_MERGE) - ret = ovl_clear_empty(dentry, &list); + else { + /* + * If no upperdentry then skip clearing whiteouts. + * + * Can race with copy-up, since we don't hold the upperdir + * mutex. Doesn't matter, since copy-up can't create a + * non-empty directory from an empty one. + */ + if (ovl_dentry_upper(dentry)) + ret = ovl_clear_empty(dentry, &list); + } ovl_cache_free(&list); @@ -487,8 +495,7 @@ out: return err; } -static int ovl_remove_and_whiteout(struct dentry *dentry, - enum ovl_path_type type, bool is_dir) +static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir) { struct dentry *workdir = ovl_workdir(dentry); struct inode *wdir = workdir->d_inode; @@ -500,7 +507,7 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, int err; if (is_dir) { - opaquedir = ovl_check_empty_and_clear(dentry, type); + opaquedir = ovl_check_empty_and_clear(dentry); err = PTR_ERR(opaquedir); if (IS_ERR(opaquedir)) goto out; @@ -515,9 +522,10 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, if (IS_ERR(whiteout)) goto out_unlock; - if (type == OVL_PATH_LOWER) { + upper = ovl_dentry_upper(dentry); + if (!upper) { upper = lookup_one_len(dentry->d_name.name, upperdir, - dentry->d_name.len); + dentry->d_name.len); err = PTR_ERR(upper); if (IS_ERR(upper)) goto kill_whiteout; @@ -529,7 +537,6 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, } else { int flags = 0; - upper = ovl_dentry_upper(dentry); if (opaquedir) upper = opaquedir; err = -ESTALE; @@ -648,7 +655,7 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir) cap_raise(override_cred->cap_effective, CAP_CHOWN); old_cred = override_creds(override_cred); - err = ovl_remove_and_whiteout(dentry, type, is_dir); + err = ovl_remove_and_whiteout(dentry, is_dir); revert_creds(old_cred); put_cred(override_cred); @@ -781,7 +788,7 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, } if (overwrite && (new_type == OVL_PATH_LOWER || new_type == OVL_PATH_MERGE) && new_is_dir) { - opaquedir = ovl_check_empty_and_clear(new, new_type); + opaquedir = ovl_check_empty_and_clear(new); err = PTR_ERR(opaquedir); if (IS_ERR(opaquedir)) { opaquedir = NULL; From ee3679843c9d6dec75e1d260cb6f041aa36ccbed Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 20 Nov 2014 16:40:00 +0100 Subject: [PATCH 153/236] ovl: fix race in private xattr checks Xattr operations can race with copy up. This does not matter as long as we consistently fiter out "trunsted.overlay.opaque" attribute on upper directories. Previously we checked parent against OVL_PATH_MERGE. This is too general, and prone to race with copy-up. I.e. we found the parent to be on the lower layer but ovl_dentry_real() would return the copied-up dentry, possibly with the "opaque" attribute. So instead use ovl_path_real() and decide to filter the attributes based on the actual type of the dentry we'll use. Signed-off-by: Miklos Szeredi (cherry picked from commit 521484639ec19a6f1ed56de6993feb255f5f676c) Signed-off-by: Alex Shi --- fs/overlayfs/inode.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index af2d18c9fcee..07d74b24913b 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -235,26 +235,36 @@ out: return err; } +static bool ovl_need_xattr_filter(struct dentry *dentry, + enum ovl_path_type type) +{ + return type == OVL_PATH_UPPER && S_ISDIR(dentry->d_inode->i_mode); +} + ssize_t ovl_getxattr(struct dentry *dentry, const char *name, void *value, size_t size) { - if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE && - ovl_is_private_xattr(name)) + struct path realpath; + enum ovl_path_type type = ovl_path_real(dentry, &realpath); + + if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name)) return -ENODATA; - return vfs_getxattr(ovl_dentry_real(dentry), name, value, size); + return vfs_getxattr(realpath.dentry, name, value, size); } ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) { + struct path realpath; + enum ovl_path_type type = ovl_path_real(dentry, &realpath); ssize_t res; int off; - res = vfs_listxattr(ovl_dentry_real(dentry), list, size); + res = vfs_listxattr(realpath.dentry, list, size); if (res <= 0 || size == 0) return res; - if (ovl_path_type(dentry->d_parent) != OVL_PATH_MERGE) + if (!ovl_need_xattr_filter(dentry, type)) return res; /* filter out private xattrs */ @@ -279,17 +289,16 @@ int ovl_removexattr(struct dentry *dentry, const char *name) { int err; struct path realpath; - enum ovl_path_type type; + enum ovl_path_type type = ovl_path_real(dentry, &realpath); err = ovl_want_write(dentry); if (err) goto out; - if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE && - ovl_is_private_xattr(name)) + err = -ENODATA; + if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name)) goto out_drop_write; - type = ovl_path_real(dentry, &realpath); if (type == OVL_PATH_LOWER) { err = vfs_getxattr(realpath.dentry, name, NULL, 0); if (err < 0) From 0ca1a2c9b8357ebf224d1714a6a1e3635e8e5862 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 20 Nov 2014 16:40:00 +0100 Subject: [PATCH 154/236] ovl: allow filenames with comma Allow option separator (comma) to be escaped with backslash. Signed-off-by: Miklos Szeredi (cherry picked from commit 91c77947133f7aef851b625701e182d3f99d14a9) Signed-off-by: Alex Shi --- fs/overlayfs/super.c | 48 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index b92bd1829cf7..eee7a62e1c0e 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -462,11 +462,34 @@ static const match_table_t ovl_tokens = { {OPT_ERR, NULL} }; +static char *ovl_next_opt(char **s) +{ + char *sbegin = *s; + char *p; + + if (sbegin == NULL) + return NULL; + + for (p = sbegin; *p; p++) { + if (*p == '\\') { + p++; + if (!*p) + break; + } else if (*p == ',') { + *p = '\0'; + *s = p + 1; + return sbegin; + } + } + *s = NULL; + return sbegin; +} + static int ovl_parse_opt(char *opt, struct ovl_config *config) { char *p; - while ((p = strsep(&opt, ",")) != NULL) { + while ((p = ovl_next_opt(&opt)) != NULL) { int token; substring_t args[MAX_OPT_ARGS]; @@ -554,15 +577,34 @@ out_dput: goto out_unlock; } +static void ovl_unescape(char *s) +{ + char *d = s; + + for (;; s++, d++) { + if (*s == '\\') + s++; + *d = *s; + if (!*s) + break; + } +} + static int ovl_mount_dir(const char *name, struct path *path) { int err; + char *tmp = kstrdup(name, GFP_KERNEL); - err = kern_path(name, LOOKUP_FOLLOW, path); + if (!tmp) + return -ENOMEM; + + ovl_unescape(tmp); + err = kern_path(tmp, LOOKUP_FOLLOW, path); if (err) { - pr_err("overlayfs: failed to resolve '%s': %i\n", name, err); + pr_err("overlayfs: failed to resolve '%s': %i\n", tmp, err); err = -EINVAL; } + kfree(tmp); return err; } From a4180dfef4dc3c948d524dfe0c168cb59317df56 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 20 Nov 2014 16:40:01 +0100 Subject: [PATCH 155/236] ovl: use lockless_dereference() for upperdentry Don't open code lockless_dereference() in ovl_upperdentry_dereference(). Signed-off-by: Miklos Szeredi (cherry picked from commit 71d509280f7e92eb60ae6b7c78c20afafff060c7) Signed-off-by: Alex Shi --- fs/overlayfs/super.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index eee7a62e1c0e..f16d318b71f8 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -84,12 +84,7 @@ enum ovl_path_type ovl_path_type(struct dentry *dentry) static struct dentry *ovl_upperdentry_dereference(struct ovl_entry *oe) { - struct dentry *upperdentry = ACCESS_ONCE(oe->__upperdentry); - /* - * Make sure to order reads to upperdentry wrt ovl_dentry_update() - */ - smp_read_barrier_depends(); - return upperdentry; + return lockless_dereference(oe->__upperdentry); } void ovl_path_upper(struct dentry *dentry, struct path *path) From d3a45e42dd711ce4e2095f14d77865b78c749190 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 20 Nov 2014 16:40:01 +0100 Subject: [PATCH 156/236] ovl: pass dentry into ovl_dir_read_merged() Pass dentry into ovl_dir_read_merged() insted of upperpath and lowerpath. This cleans up callers and paves the way for multi-layer directory reads. Signed-off-by: Miklos Szeredi (cherry picked from commit c9f00fdb9ab3999cb2fb582ad82a5db9e70c82f5) Signed-off-by: Alex Shi --- fs/overlayfs/readdir.c | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index fdb63deff1a7..6d5d9fd06ec0 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -276,11 +276,11 @@ static int ovl_dir_mark_whiteouts(struct dentry *dir, return 0; } -static inline int ovl_dir_read_merged(struct path *upperpath, - struct path *lowerpath, - struct list_head *list) +static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list) { int err; + struct path lowerpath; + struct path upperpath; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_merge, .list = list, @@ -288,25 +288,28 @@ static inline int ovl_dir_read_merged(struct path *upperpath, .is_merge = false, }; - if (upperpath->dentry) { - err = ovl_dir_read(upperpath, &rdd); + ovl_path_lower(dentry, &lowerpath); + ovl_path_upper(dentry, &upperpath); + + if (upperpath.dentry) { + err = ovl_dir_read(&upperpath, &rdd); if (err) goto out; - if (lowerpath->dentry) { - err = ovl_dir_mark_whiteouts(upperpath->dentry, &rdd); + if (lowerpath.dentry) { + err = ovl_dir_mark_whiteouts(upperpath.dentry, &rdd); if (err) goto out; } } - if (lowerpath->dentry) { + if (lowerpath.dentry) { /* * Insert lowerpath entries before upperpath ones, this allows * offsets to be reasonably constant */ list_add(&rdd.middle, rdd.list); rdd.is_merge = true; - err = ovl_dir_read(lowerpath, &rdd); + err = ovl_dir_read(&lowerpath, &rdd); list_del(&rdd.middle); } out: @@ -331,8 +334,6 @@ static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos) static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry) { int res; - struct path lowerpath; - struct path upperpath; struct ovl_dir_cache *cache; cache = ovl_dir_cache(dentry); @@ -349,10 +350,7 @@ static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry) cache->refcount = 1; INIT_LIST_HEAD(&cache->entries); - ovl_path_lower(dentry, &lowerpath); - ovl_path_upper(dentry, &upperpath); - - res = ovl_dir_read_merged(&upperpath, &lowerpath, &cache->entries); + res = ovl_dir_read_merged(dentry, &cache->entries); if (res) { ovl_cache_free(&cache->entries); kfree(cache); @@ -540,14 +538,9 @@ const struct file_operations ovl_dir_operations = { int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list) { int err; - struct path lowerpath; - struct path upperpath; struct ovl_cache_entry *p; - ovl_path_upper(dentry, &upperpath); - ovl_path_lower(dentry, &lowerpath); - - err = ovl_dir_read_merged(&upperpath, &lowerpath, list); + err = ovl_dir_read_merged(dentry, list); if (err) return err; From effb810d716f07d643385ba8206ab981a4b20dd9 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 20 Nov 2014 16:40:02 +0100 Subject: [PATCH 157/236] ovl: ovl_dir_fsync() cleanup Check against !OVL_PATH_LOWER instead of OVL_PATH_MERGE. For a copied up directory the two are currently equivalent. Signed-off-by: Miklos Szeredi (cherry picked from commit 7676895f4736421ebafc48de5078e25ea69e88ee) Signed-off-by: Alex Shi --- fs/overlayfs/readdir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 6d5d9fd06ec0..c0205990a9f5 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -452,10 +452,10 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, /* * Need to check if we started out being a lower dir, but got copied up */ - if (!od->is_upper && ovl_path_type(dentry) == OVL_PATH_MERGE) { + if (!od->is_upper && ovl_path_type(dentry) != OVL_PATH_LOWER) { struct inode *inode = file_inode(file); - realfile =lockless_dereference(od->upperfile); + realfile = lockless_dereference(od->upperfile); if (!realfile) { struct path upperpath; From 8f7458c9c66418c8b9af323ba29c8f023ffe559e Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sat, 13 Dec 2014 00:59:42 +0100 Subject: [PATCH 158/236] ovl: check whiteout while reading directory Don't make a separate pass for checking whiteouts, since we can do it while reading the upper directory. This will make it easier to handle multiple layers. Signed-off-by: Miklos Szeredi (cherry picked from commit 49c21e1cacd74a8c83407c70ad860c994e606e25) Signed-off-by: Alex Shi --- fs/overlayfs/readdir.c | 77 +++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 49 deletions(-) diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index c0205990a9f5..b361719e61d2 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -40,6 +40,7 @@ struct ovl_readdir_data { struct rb_root root; struct list_head *list; struct list_head middle; + struct dentry *dir; int count; int err; }; @@ -126,6 +127,32 @@ static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, if (p == NULL) return -ENOMEM; + if (d_type == DT_CHR) { + struct dentry *dentry; + const struct cred *old_cred; + struct cred *override_cred; + + override_cred = prepare_creds(); + if (!override_cred) { + kfree(p); + return -ENOMEM; + } + + /* + * CAP_DAC_OVERRIDE for lookup + */ + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + old_cred = override_creds(override_cred); + + dentry = lookup_one_len(name, rdd->dir, len); + if (!IS_ERR(dentry)) { + p->is_whiteout = ovl_is_whiteout(dentry); + dput(dentry); + } + revert_creds(old_cred); + put_cred(override_cred); + } + list_add_tail(&p->l_node, rdd->list); rb_link_node(&p->node, parent, newp); rb_insert_color(&p->node, &rdd->root); @@ -233,49 +260,6 @@ static void ovl_dir_reset(struct file *file) od->is_real = false; } -static int ovl_dir_mark_whiteouts(struct dentry *dir, - struct ovl_readdir_data *rdd) -{ - struct ovl_cache_entry *p; - struct dentry *dentry; - const struct cred *old_cred; - struct cred *override_cred; - - override_cred = prepare_creds(); - if (!override_cred) { - ovl_cache_free(rdd->list); - return -ENOMEM; - } - - /* - * CAP_DAC_OVERRIDE for lookup - */ - cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); - old_cred = override_creds(override_cred); - - mutex_lock(&dir->d_inode->i_mutex); - list_for_each_entry(p, rdd->list, l_node) { - if (p->is_cursor) - continue; - - if (p->type != DT_CHR) - continue; - - dentry = lookup_one_len(p->name, dir, p->len); - if (IS_ERR(dentry)) - continue; - - p->is_whiteout = ovl_is_whiteout(dentry); - dput(dentry); - } - mutex_unlock(&dir->d_inode->i_mutex); - - revert_creds(old_cred); - put_cred(override_cred); - - return 0; -} - static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list) { int err; @@ -292,15 +276,10 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list) ovl_path_upper(dentry, &upperpath); if (upperpath.dentry) { + rdd.dir = upperpath.dentry; err = ovl_dir_read(&upperpath, &rdd); if (err) goto out; - - if (lowerpath.dentry) { - err = ovl_dir_mark_whiteouts(upperpath.dentry, &rdd); - if (err) - goto out; - } } if (lowerpath.dentry) { /* From 8688583b8896b61fb799a3e7a6272b8beb87ec9e Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sat, 13 Dec 2014 00:59:42 +0100 Subject: [PATCH 159/236] ovl: make path-type a bitmap OVL_PATH_PURE_UPPER -> __OVL_PATH_UPPER | __OVL_PATH_PURE OVL_PATH_UPPER -> __OVL_PATH_UPPER OVL_PATH_MERGE -> __OVL_PATH_UPPER | __OVL_PATH_MERGE OVL_PATH_LOWER -> 0 Multiple R/O layers will allow __OVL_PATH_MERGE without __OVL_PATH_UPPER. Signed-off-by: Miklos Szeredi (cherry picked from commit 1afaba1ecb5299cdd0f69b5bad98b0185fe71e79) Signed-off-by: Alex Shi --- fs/overlayfs/copy_up.c | 4 ++-- fs/overlayfs/dir.c | 22 +++++++++++----------- fs/overlayfs/inode.c | 9 ++++++--- fs/overlayfs/overlayfs.h | 13 +++++++++---- fs/overlayfs/readdir.c | 10 +++++----- fs/overlayfs/super.c | 19 ++++++++----------- 6 files changed, 41 insertions(+), 36 deletions(-) diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index ea10a8719107..a5bfd60f4f6f 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -385,7 +385,7 @@ int ovl_copy_up(struct dentry *dentry) struct kstat stat; enum ovl_path_type type = ovl_path_type(dentry); - if (type != OVL_PATH_LOWER) + if (OVL_TYPE_UPPER(type)) break; next = dget(dentry); @@ -394,7 +394,7 @@ int ovl_copy_up(struct dentry *dentry) parent = dget_parent(next); type = ovl_path_type(parent); - if (type != OVL_PATH_LOWER) + if (OVL_TYPE_UPPER(type)) break; dput(next); diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 8ffc4b980f1b..ab50bd111feb 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -152,7 +152,7 @@ static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry, * correct link count. nlink=1 seems to pacify 'find' and * other utilities. */ - if (type == OVL_PATH_MERGE) + if (OVL_TYPE_MERGE(type)) stat->nlink = 1; return 0; @@ -630,7 +630,7 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir) goto out_drop_write; type = ovl_path_type(dentry); - if (type == OVL_PATH_PURE_UPPER) { + if (OVL_TYPE_PURE_UPPER(type)) { err = ovl_remove_upper(dentry, is_dir); } else { const struct cred *old_cred; @@ -712,7 +712,7 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, /* Don't copy up directory trees */ old_type = ovl_path_type(old); err = -EXDEV; - if ((old_type == OVL_PATH_LOWER || old_type == OVL_PATH_MERGE) && is_dir) + if (OVL_TYPE_MERGE_OR_LOWER(old_type) && is_dir) goto out; if (new->d_inode) { @@ -725,25 +725,25 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, new_type = ovl_path_type(new); err = -EXDEV; - if (!overwrite && (new_type == OVL_PATH_LOWER || new_type == OVL_PATH_MERGE) && new_is_dir) + if (!overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) goto out; err = 0; - if (new_type == OVL_PATH_LOWER && old_type == OVL_PATH_LOWER) { + if (!OVL_TYPE_UPPER(new_type) && !OVL_TYPE_UPPER(old_type)) { if (ovl_dentry_lower(old)->d_inode == ovl_dentry_lower(new)->d_inode) goto out; } - if (new_type != OVL_PATH_LOWER && old_type != OVL_PATH_LOWER) { + if (OVL_TYPE_UPPER(new_type) && OVL_TYPE_UPPER(old_type)) { if (ovl_dentry_upper(old)->d_inode == ovl_dentry_upper(new)->d_inode) goto out; } } else { if (ovl_dentry_is_opaque(new)) - new_type = OVL_PATH_UPPER; + new_type = __OVL_PATH_UPPER; else - new_type = OVL_PATH_PURE_UPPER; + new_type = __OVL_PATH_UPPER | __OVL_PATH_PURE; } err = ovl_want_write(old); @@ -763,8 +763,8 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, goto out_drop_write; } - old_opaque = old_type != OVL_PATH_PURE_UPPER; - new_opaque = new_type != OVL_PATH_PURE_UPPER; + old_opaque = !OVL_TYPE_PURE_UPPER(old_type); + new_opaque = !OVL_TYPE_PURE_UPPER(new_type); if (old_opaque || new_opaque) { err = -ENOMEM; @@ -787,7 +787,7 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, old_cred = override_creds(override_cred); } - if (overwrite && (new_type == OVL_PATH_LOWER || new_type == OVL_PATH_MERGE) && new_is_dir) { + if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) { opaquedir = ovl_check_empty_and_clear(new); err = PTR_ERR(opaquedir); if (IS_ERR(opaquedir)) { diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 07d74b24913b..48492f1240ad 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -238,7 +238,10 @@ out: static bool ovl_need_xattr_filter(struct dentry *dentry, enum ovl_path_type type) { - return type == OVL_PATH_UPPER && S_ISDIR(dentry->d_inode->i_mode); + if ((type & (__OVL_PATH_PURE | __OVL_PATH_UPPER)) == __OVL_PATH_UPPER) + return S_ISDIR(dentry->d_inode->i_mode); + else + return false; } ssize_t ovl_getxattr(struct dentry *dentry, const char *name, @@ -299,7 +302,7 @@ int ovl_removexattr(struct dentry *dentry, const char *name) if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name)) goto out_drop_write; - if (type == OVL_PATH_LOWER) { + if (!OVL_TYPE_UPPER(type)) { err = vfs_getxattr(realpath.dentry, name, NULL, 0); if (err < 0) goto out_drop_write; @@ -321,7 +324,7 @@ out: static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type, struct dentry *realdentry) { - if (type != OVL_PATH_LOWER) + if (OVL_TYPE_UPPER(type)) return false; if (special_file(realdentry->d_inode->i_mode)) diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 814bed33dd07..d39eaa8f5e47 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -12,12 +12,17 @@ struct ovl_entry; enum ovl_path_type { - OVL_PATH_PURE_UPPER, - OVL_PATH_UPPER, - OVL_PATH_MERGE, - OVL_PATH_LOWER, + __OVL_PATH_PURE = (1 << 0), + __OVL_PATH_UPPER = (1 << 1), + __OVL_PATH_MERGE = (1 << 2), }; +#define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER) +#define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE) +#define OVL_TYPE_PURE_UPPER(type) ((type) & __OVL_PATH_PURE) +#define OVL_TYPE_MERGE_OR_LOWER(type) \ + (OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type)) + extern const char *ovl_opaque_xattr; static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry) diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index b361719e61d2..fb18287e761c 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -255,8 +255,8 @@ static void ovl_dir_reset(struct file *file) ovl_cache_put(od, dentry); od->cache = NULL; } - WARN_ON(!od->is_real && type != OVL_PATH_MERGE); - if (od->is_real && type == OVL_PATH_MERGE) + WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type)); + if (od->is_real && OVL_TYPE_MERGE(type)) od->is_real = false; } @@ -431,7 +431,7 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, /* * Need to check if we started out being a lower dir, but got copied up */ - if (!od->is_upper && ovl_path_type(dentry) != OVL_PATH_LOWER) { + if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) { struct inode *inode = file_inode(file); realfile = lockless_dereference(od->upperfile); @@ -497,8 +497,8 @@ static int ovl_dir_open(struct inode *inode, struct file *file) } INIT_LIST_HEAD(&od->cursor.l_node); od->realfile = realfile; - od->is_real = (type != OVL_PATH_MERGE); - od->is_upper = (type != OVL_PATH_LOWER); + od->is_real = !OVL_TYPE_MERGE(type); + od->is_upper = OVL_TYPE_UPPER(type); od->cursor.is_cursor = true; file->private_data = od; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index f16d318b71f8..821719cc8537 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -64,22 +64,19 @@ const char *ovl_opaque_xattr = "trusted.overlay.opaque"; enum ovl_path_type ovl_path_type(struct dentry *dentry) { struct ovl_entry *oe = dentry->d_fsdata; + enum ovl_path_type type = 0; if (oe->__upperdentry) { + type = __OVL_PATH_UPPER; + if (oe->lowerdentry) { if (S_ISDIR(dentry->d_inode->i_mode)) - return OVL_PATH_MERGE; - else - return OVL_PATH_UPPER; - } else { - if (oe->opaque) - return OVL_PATH_UPPER; - else - return OVL_PATH_PURE_UPPER; + type |= __OVL_PATH_MERGE; + } else if (!oe->opaque) { + type |= __OVL_PATH_PURE; } - } else { - return OVL_PATH_LOWER; } + return type; } static struct dentry *ovl_upperdentry_dereference(struct ovl_entry *oe) @@ -101,7 +98,7 @@ enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path) enum ovl_path_type type = ovl_path_type(dentry); - if (type == OVL_PATH_LOWER) + if (!OVL_TYPE_UPPER(type)) ovl_path_lower(dentry, path); else ovl_path_upper(dentry, path); From 046f00ecd2379495e451658be72234ded7bee154 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sat, 13 Dec 2014 00:59:43 +0100 Subject: [PATCH 160/236] ovl: dont replace opaque dir When removing an empty opaque directory, then it makes no sense to replace it with an exact replica of itself before removal. Signed-off-by: Miklos Szeredi (cherry picked from commit 263b4a0fee43f1239c4d6f3c3a62fb5a20d84f2e) Signed-off-by: Alex Shi --- fs/overlayfs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index ab50bd111feb..dcae3ac5aa76 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -506,7 +506,7 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir) struct dentry *opaquedir = NULL; int err; - if (is_dir) { + if (is_dir && OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) { opaquedir = ovl_check_empty_and_clear(dentry); err = PTR_ERR(opaquedir); if (IS_ERR(opaquedir)) From d669aefff21050073a65d12d8711c7c8bbad65b9 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sat, 13 Dec 2014 00:59:43 +0100 Subject: [PATCH 161/236] ovl: add mutli-layer infrastructure Add multiple lower layers to 'struct ovl_fs' and 'struct ovl_entry'. ovl_entry will have an array of paths, instead of just the dentry. This allows a compact array containing just the layers which exist at current point in the tree (which is expected to be a small number for the majority of dentries). The number of layers is not limited by this infrastructure. Signed-off-by: Miklos Szeredi (cherry picked from commit dd662667e6d3e55b42798a6e6e7f37dddc639460) Signed-off-by: Alex Shi --- fs/overlayfs/super.c | 98 +++++++++++++++++++++++++++++--------------- 1 file changed, 64 insertions(+), 34 deletions(-) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 821719cc8537..460d866b97a2 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -35,7 +35,8 @@ struct ovl_config { /* private information held for overlayfs's superblock */ struct ovl_fs { struct vfsmount *upper_mnt; - struct vfsmount *lower_mnt; + unsigned numlower; + struct vfsmount **lower_mnt; struct dentry *workdir; long lower_namelen; /* pathnames of lower and upper dirs, for show_options */ @@ -47,7 +48,6 @@ struct ovl_dir_cache; /* private information held for every overlayfs dentry */ struct ovl_entry { struct dentry *__upperdentry; - struct dentry *lowerdentry; struct ovl_dir_cache *cache; union { struct { @@ -56,10 +56,16 @@ struct ovl_entry { }; struct rcu_head rcu; }; + unsigned numlower; + struct path lowerstack[]; }; const char *ovl_opaque_xattr = "trusted.overlay.opaque"; +static struct dentry *__ovl_dentry_lower(struct ovl_entry *oe) +{ + return oe->numlower ? oe->lowerstack[0].dentry : NULL; +} enum ovl_path_type ovl_path_type(struct dentry *dentry) { @@ -69,7 +75,7 @@ enum ovl_path_type ovl_path_type(struct dentry *dentry) if (oe->__upperdentry) { type = __OVL_PATH_UPPER; - if (oe->lowerdentry) { + if (oe->numlower) { if (S_ISDIR(dentry->d_inode->i_mode)) type |= __OVL_PATH_MERGE; } else if (!oe->opaque) { @@ -117,7 +123,7 @@ struct dentry *ovl_dentry_lower(struct dentry *dentry) { struct ovl_entry *oe = dentry->d_fsdata; - return oe->lowerdentry; + return __ovl_dentry_lower(oe); } struct dentry *ovl_dentry_real(struct dentry *dentry) @@ -127,7 +133,7 @@ struct dentry *ovl_dentry_real(struct dentry *dentry) realdentry = ovl_upperdentry_dereference(oe); if (!realdentry) - realdentry = oe->lowerdentry; + realdentry = __ovl_dentry_lower(oe); return realdentry; } @@ -140,7 +146,7 @@ struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper) if (realdentry) { *is_upper = true; } else { - realdentry = oe->lowerdentry; + realdentry = __ovl_dentry_lower(oe); *is_upper = false; } return realdentry; @@ -162,11 +168,9 @@ void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache) void ovl_path_lower(struct dentry *dentry, struct path *path) { - struct ovl_fs *ofs = dentry->d_sb->s_fs_info; struct ovl_entry *oe = dentry->d_fsdata; - path->mnt = ofs->lower_mnt; - path->dentry = oe->lowerdentry; + *path = oe->numlower ? oe->lowerstack[0] : (struct path) { NULL, NULL }; } int ovl_want_write(struct dentry *dentry) @@ -258,8 +262,11 @@ static void ovl_dentry_release(struct dentry *dentry) struct ovl_entry *oe = dentry->d_fsdata; if (oe) { + unsigned int i; + dput(oe->__upperdentry); - dput(oe->lowerdentry); + for (i = 0; i < oe->numlower; i++) + dput(oe->lowerstack[i].dentry); kfree_rcu(oe, rcu); } } @@ -268,9 +275,15 @@ static const struct dentry_operations ovl_dentry_operations = { .d_release = ovl_dentry_release, }; -static struct ovl_entry *ovl_alloc_entry(void) +static struct ovl_entry *ovl_alloc_entry(unsigned int numlower) { - return kzalloc(sizeof(struct ovl_entry), GFP_KERNEL); + size_t size = offsetof(struct ovl_entry, lowerstack[numlower]); + struct ovl_entry *oe = kzalloc(size, GFP_KERNEL); + + if (oe) + oe->numlower = numlower; + + return oe; } static inline struct dentry *ovl_lookup_real(struct dentry *dir, @@ -297,19 +310,19 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, { struct ovl_entry *oe; struct dentry *upperdir; - struct dentry *lowerdir; + struct path lowerdir; struct dentry *upperdentry = NULL; struct dentry *lowerdentry = NULL; struct inode *inode = NULL; int err; err = -ENOMEM; - oe = ovl_alloc_entry(); + oe = ovl_alloc_entry(1); if (!oe) goto out; upperdir = ovl_dentry_upper(dentry->d_parent); - lowerdir = ovl_dentry_lower(dentry->d_parent); + ovl_path_lower(dentry->d_parent, &lowerdir); if (upperdir) { upperdentry = ovl_lookup_real(upperdir, &dentry->d_name); @@ -317,7 +330,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (IS_ERR(upperdentry)) goto out_put_dir; - if (lowerdir && upperdentry) { + if (lowerdir.dentry && upperdentry) { if (ovl_is_whiteout(upperdentry)) { dput(upperdentry); upperdentry = NULL; @@ -327,8 +340,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, } } } - if (lowerdir && !oe->opaque) { - lowerdentry = ovl_lookup_real(lowerdir, &dentry->d_name); + if (lowerdir.dentry && !oe->opaque) { + lowerdentry = ovl_lookup_real(lowerdir.dentry, &dentry->d_name); err = PTR_ERR(lowerdentry); if (IS_ERR(lowerdentry)) goto out_dput_upper; @@ -355,8 +368,12 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, } oe->__upperdentry = upperdentry; - oe->lowerdentry = lowerdentry; - + if (lowerdentry) { + oe->lowerstack[0].dentry = lowerdentry; + oe->lowerstack[0].mnt = lowerdir.mnt; + } else { + oe->numlower = 0; + } dentry->d_fsdata = oe; d_add(dentry, inode); @@ -380,10 +397,12 @@ struct file *ovl_path_open(struct path *path, int flags) static void ovl_put_super(struct super_block *sb) { struct ovl_fs *ufs = sb->s_fs_info; + unsigned i; dput(ufs->workdir); mntput(ufs->upper_mnt); - mntput(ufs->lower_mnt); + for (i = 0; i < ufs->numlower; i++) + mntput(ufs->lower_mnt[i]); kfree(ufs->config.lowerdir); kfree(ufs->config.upperdir); @@ -641,6 +660,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) struct ovl_entry *oe; struct ovl_fs *ufs; struct kstatfs statfs; + struct vfsmount *mnt; + unsigned int i; int err; err = -ENOMEM; @@ -661,7 +682,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) } err = -ENOMEM; - oe = ovl_alloc_entry(); + oe = ovl_alloc_entry(1); if (oe == NULL) goto out_free_config; @@ -727,12 +748,24 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) goto out_put_workpath; } - ufs->lower_mnt = clone_private_mount(&lowerpath); - err = PTR_ERR(ufs->lower_mnt); - if (IS_ERR(ufs->lower_mnt)) { - pr_err("overlayfs: failed to clone lowerpath\n"); + ufs->lower_mnt = kcalloc(1, sizeof(struct vfsmount *), GFP_KERNEL); + if (ufs->lower_mnt == NULL) goto out_put_upper_mnt; + + mnt = clone_private_mount(&lowerpath); + err = PTR_ERR(mnt); + if (IS_ERR(mnt)) { + pr_err("overlayfs: failed to clone lowerpath\n"); + goto out_put_lower_mnt; } + /* + * Make lower_mnt R/O. That way fchmod/fchown on lower file + * will fail instead of modifying lower fs. + */ + mnt->mnt_flags |= MNT_READONLY; + + ufs->lower_mnt[0] = mnt; + ufs->numlower = 1; ufs->workdir = ovl_workdir_create(ufs->upper_mnt, workpath.dentry); err = PTR_ERR(ufs->workdir); @@ -742,12 +775,6 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) goto out_put_lower_mnt; } - /* - * Make lower_mnt R/O. That way fchmod/fchown on lower file - * will fail instead of modifying lower fs. - */ - ufs->lower_mnt->mnt_flags |= MNT_READONLY; - /* If the upper fs is r/o, we mark overlayfs r/o too */ if (ufs->upper_mnt->mnt_sb->s_flags & MS_RDONLY) sb->s_flags |= MS_RDONLY; @@ -768,7 +795,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) path_put(&workpath); oe->__upperdentry = upperpath.dentry; - oe->lowerdentry = lowerpath.dentry; + oe->lowerstack[0].dentry = lowerpath.dentry; + oe->lowerstack[0].mnt = ufs->lower_mnt[0]; root_dentry->d_fsdata = oe; @@ -782,7 +810,9 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) out_put_workdir: dput(ufs->workdir); out_put_lower_mnt: - mntput(ufs->lower_mnt); + for (i = 0; i < ufs->numlower; i++) + mntput(ufs->lower_mnt[i]); + kfree(ufs->lower_mnt); out_put_upper_mnt: mntput(ufs->upper_mnt); out_put_workpath: From 8216b4cc31e60690afef76cbe1a310064f8727d7 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sat, 13 Dec 2014 00:59:43 +0100 Subject: [PATCH 162/236] ovl: helper to iterate layers Add helper to iterate through all the layers, starting from the upper layer (if exists) and continuing down through the lower layers. Signed-off-by: Miklos Szeredi (cherry picked from commit 5ef88da56a77bfb3b9631f5e5775f3bff86b6219) Signed-off-by: Alex Shi --- fs/overlayfs/overlayfs.h | 1 + fs/overlayfs/super.c | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index d39eaa8f5e47..d176b679f526 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -135,6 +135,7 @@ void ovl_dentry_version_inc(struct dentry *dentry); void ovl_path_upper(struct dentry *dentry, struct path *path); void ovl_path_lower(struct dentry *dentry, struct path *path); enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path); +int ovl_path_next(int idx, struct dentry *dentry, struct path *path); struct dentry *ovl_dentry_upper(struct dentry *dentry); struct dentry *ovl_dentry_lower(struct dentry *dentry); struct dentry *ovl_dentry_real(struct dentry *dentry); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 460d866b97a2..07e4c576e93e 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -305,6 +305,27 @@ static inline struct dentry *ovl_lookup_real(struct dentry *dir, return dentry; } +/* + * Returns next layer in stack starting from top. + * Returns -1 if this is the last layer. + */ +int ovl_path_next(int idx, struct dentry *dentry, struct path *path) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + BUG_ON(idx < 0); + if (idx == 0) { + ovl_path_upper(dentry, path); + if (path->dentry) + return oe->numlower ? 1 : -1; + idx++; + } + BUG_ON(idx > oe->numlower); + *path = oe->lowerstack[idx - 1]; + + return (idx < oe->numlower) ? idx + 1 : -1; +} + struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { From 3dbcd51370dba076be4dc8c45d2bf19db48201b8 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sat, 13 Dec 2014 00:59:44 +0100 Subject: [PATCH 163/236] ovl: multi-layer readdir If multiple lower layers exist, merge them as well in readdir according to the same rules as merging upper with lower. I.e. take whiteouts and opaque directories into account on all but the lowers layer. Signed-off-by: Miklos Szeredi (cherry picked from commit 9d7459d834c28f55c82f1737f638a6c90e0c0e0f) Signed-off-by: Alex Shi --- fs/overlayfs/readdir.c | 39 +++++++++++++++++++-------------------- fs/overlayfs/super.c | 3 +++ 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index fb18287e761c..5ef051491afc 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -263,35 +263,34 @@ static void ovl_dir_reset(struct file *file) static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list) { int err; - struct path lowerpath; - struct path upperpath; + struct path realpath; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_merge, .list = list, .root = RB_ROOT, .is_merge = false, }; + int idx, next; - ovl_path_lower(dentry, &lowerpath); - ovl_path_upper(dentry, &upperpath); + for (idx = 0; idx != -1; idx = next) { + next = ovl_path_next(idx, dentry, &realpath); - if (upperpath.dentry) { - rdd.dir = upperpath.dentry; - err = ovl_dir_read(&upperpath, &rdd); - if (err) - goto out; + if (next != -1) { + rdd.dir = realpath.dentry; + err = ovl_dir_read(&realpath, &rdd); + if (err) + break; + } else { + /* + * Insert lowest layer entries before upper ones, this + * allows offsets to be reasonably constant + */ + list_add(&rdd.middle, rdd.list); + rdd.is_merge = true; + err = ovl_dir_read(&realpath, &rdd); + list_del(&rdd.middle); + } } - if (lowerpath.dentry) { - /* - * Insert lowerpath entries before upperpath ones, this allows - * offsets to be reasonably constant - */ - list_add(&rdd.middle, rdd.list); - rdd.is_merge = true; - err = ovl_dir_read(&lowerpath, &rdd); - list_del(&rdd.middle); - } -out: return err; } diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 07e4c576e93e..c245043aa1b9 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -81,6 +81,9 @@ enum ovl_path_type ovl_path_type(struct dentry *dentry) } else if (!oe->opaque) { type |= __OVL_PATH_PURE; } + } else { + if (oe->numlower > 1) + type |= __OVL_PATH_MERGE; } return type; } From 1dabc18b8b2115693ec868cb84fda26052a8ee83 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sat, 13 Dec 2014 00:59:44 +0100 Subject: [PATCH 164/236] ovl: multi-layer lookup Look up dentry in all relevant layers. Signed-off-by: Miklos Szeredi (cherry picked from commit 3d3c6b89399a1b5e8a59ffbb8cb2a7797a9ef154) Signed-off-by: Alex Shi --- fs/overlayfs/super.c | 143 ++++++++++++++++++++++++++++--------------- 1 file changed, 94 insertions(+), 49 deletions(-) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index c245043aa1b9..f72b82fdc1e6 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -333,82 +333,127 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct ovl_entry *oe; - struct dentry *upperdir; - struct path lowerdir; - struct dentry *upperdentry = NULL; - struct dentry *lowerdentry = NULL; + struct ovl_entry *poe = dentry->d_parent->d_fsdata; + struct path *stack = NULL; + struct dentry *upperdir, *upperdentry = NULL; + unsigned int ctr = 0; struct inode *inode = NULL; + bool upperopaque = false; + struct dentry *this, *prev = NULL; + unsigned int i; int err; - err = -ENOMEM; - oe = ovl_alloc_entry(1); - if (!oe) - goto out; - - upperdir = ovl_dentry_upper(dentry->d_parent); - ovl_path_lower(dentry->d_parent, &lowerdir); - + upperdir = ovl_upperdentry_dereference(poe); if (upperdir) { - upperdentry = ovl_lookup_real(upperdir, &dentry->d_name); - err = PTR_ERR(upperdentry); - if (IS_ERR(upperdentry)) - goto out_put_dir; + this = ovl_lookup_real(upperdir, &dentry->d_name); + err = PTR_ERR(this); + if (IS_ERR(this)) + goto out; - if (lowerdir.dentry && upperdentry) { - if (ovl_is_whiteout(upperdentry)) { - dput(upperdentry); - upperdentry = NULL; - oe->opaque = true; - } else if (ovl_is_opaquedir(upperdentry)) { - oe->opaque = true; + /* + * If this is not the lowermost layer, check whiteout and opaque + * directory. + */ + if (poe->numlower && this) { + if (ovl_is_whiteout(this)) { + dput(this); + this = NULL; + upperopaque = true; + } else if (ovl_is_opaquedir(this)) { + upperopaque = true; } } - } - if (lowerdir.dentry && !oe->opaque) { - lowerdentry = ovl_lookup_real(lowerdir.dentry, &dentry->d_name); - err = PTR_ERR(lowerdentry); - if (IS_ERR(lowerdentry)) - goto out_dput_upper; + upperdentry = prev = this; } - if (lowerdentry && upperdentry && - (!S_ISDIR(upperdentry->d_inode->i_mode) || - !S_ISDIR(lowerdentry->d_inode->i_mode))) { - dput(lowerdentry); - lowerdentry = NULL; - oe->opaque = true; + if (!upperopaque && poe->numlower) { + err = -ENOMEM; + stack = kcalloc(poe->numlower, sizeof(struct path), GFP_KERNEL); + if (!stack) + goto out_put_upper; } - if (lowerdentry || upperdentry) { + for (i = 0; !upperopaque && i < poe->numlower; i++) { + bool opaque = false; + struct path lowerpath = poe->lowerstack[i]; + + opaque = false; + this = ovl_lookup_real(lowerpath.dentry, &dentry->d_name); + err = PTR_ERR(this); + if (IS_ERR(this)) + goto out_put; + if (!this) + continue; + + /* + * If this is not the lowermost layer, check whiteout and opaque + * directory. + */ + if (i < poe->numlower - 1) { + if (ovl_is_whiteout(this)) { + dput(this); + break; + } else if (ovl_is_opaquedir(this)) { + opaque = true; + } + } + /* + * If this is a non-directory then stop here. + * + * FIXME: check for opaqueness maybe better done in remove code. + */ + if (!S_ISDIR(this->d_inode->i_mode)) { + opaque = true; + } else if (prev && (!S_ISDIR(prev->d_inode->i_mode) || + !S_ISDIR(this->d_inode->i_mode))) { + if (prev == upperdentry) + upperopaque = true; + dput(this); + break; + } + stack[ctr].dentry = this; + stack[ctr].mnt = lowerpath.mnt; + ctr++; + prev = this; + if (opaque) + break; + } + + oe = ovl_alloc_entry(ctr); + err = -ENOMEM; + if (!oe) + goto out_put; + + if (upperdentry || ctr) { struct dentry *realdentry; - realdentry = upperdentry ? upperdentry : lowerdentry; + realdentry = upperdentry ? upperdentry : stack[0].dentry; + err = -ENOMEM; inode = ovl_new_inode(dentry->d_sb, realdentry->d_inode->i_mode, oe); if (!inode) - goto out_dput; + goto out_free_oe; ovl_copyattr(realdentry->d_inode, inode); } + oe->opaque = upperopaque; oe->__upperdentry = upperdentry; - if (lowerdentry) { - oe->lowerstack[0].dentry = lowerdentry; - oe->lowerstack[0].mnt = lowerdir.mnt; - } else { - oe->numlower = 0; - } + memcpy(oe->lowerstack, stack, sizeof(struct path) * ctr); + kfree(stack); dentry->d_fsdata = oe; d_add(dentry, inode); return NULL; -out_dput: - dput(lowerdentry); -out_dput_upper: - dput(upperdentry); -out_put_dir: +out_free_oe: kfree(oe); +out_put: + for (i = 0; i < ctr; i++) + dput(stack[i].dentry); + kfree(stack); +out_put_upper: + dput(upperdentry); out: return ERR_PTR(err); } From 34d7a5b7b3a920178f662fa768bb724eb41be925 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sat, 13 Dec 2014 00:59:45 +0100 Subject: [PATCH 165/236] ovl: check whiteout on lowest layer as well Not checking whiteouts on lowest layer was an optimization (there's nothing to white out there), but it could result in inconsitent behavior when a layer previously used as upper/middle is later used as lowest. Signed-off-by: Miklos Szeredi (cherry picked from commit 3e01cee3b980f96463cb6f378ab05303a99903d9) Signed-off-by: Alex Shi --- fs/overlayfs/readdir.c | 79 +++++++++++++++++++++--------------------- fs/overlayfs/super.c | 29 ++++++---------- 2 files changed, 51 insertions(+), 57 deletions(-) diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 5ef051491afc..2b8315b290a5 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -80,23 +80,50 @@ static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root, return NULL; } -static struct ovl_cache_entry *ovl_cache_entry_new(const char *name, int len, +static struct ovl_cache_entry *ovl_cache_entry_new(struct dentry *dir, + const char *name, int len, u64 ino, unsigned int d_type) { struct ovl_cache_entry *p; size_t size = offsetof(struct ovl_cache_entry, name[len + 1]); p = kmalloc(size, GFP_KERNEL); - if (p) { - memcpy(p->name, name, len); - p->name[len] = '\0'; - p->len = len; - p->type = d_type; - p->ino = ino; - p->is_whiteout = false; - p->is_cursor = false; - } + if (!p) + return NULL; + memcpy(p->name, name, len); + p->name[len] = '\0'; + p->len = len; + p->type = d_type; + p->ino = ino; + p->is_whiteout = false; + p->is_cursor = false; + + if (d_type == DT_CHR) { + struct dentry *dentry; + const struct cred *old_cred; + struct cred *override_cred; + + override_cred = prepare_creds(); + if (!override_cred) { + kfree(p); + return NULL; + } + + /* + * CAP_DAC_OVERRIDE for lookup + */ + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + old_cred = override_creds(override_cred); + + dentry = lookup_one_len(name, dir, len); + if (!IS_ERR(dentry)) { + p->is_whiteout = ovl_is_whiteout(dentry); + dput(dentry); + } + revert_creds(old_cred); + put_cred(override_cred); + } return p; } @@ -123,36 +150,10 @@ static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, return 0; } - p = ovl_cache_entry_new(name, len, ino, d_type); + p = ovl_cache_entry_new(rdd->dir, name, len, ino, d_type); if (p == NULL) return -ENOMEM; - if (d_type == DT_CHR) { - struct dentry *dentry; - const struct cred *old_cred; - struct cred *override_cred; - - override_cred = prepare_creds(); - if (!override_cred) { - kfree(p); - return -ENOMEM; - } - - /* - * CAP_DAC_OVERRIDE for lookup - */ - cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); - old_cred = override_creds(override_cred); - - dentry = lookup_one_len(name, rdd->dir, len); - if (!IS_ERR(dentry)) { - p->is_whiteout = ovl_is_whiteout(dentry); - dput(dentry); - } - revert_creds(old_cred); - put_cred(override_cred); - } - list_add_tail(&p->l_node, rdd->list); rb_link_node(&p->node, parent, newp); rb_insert_color(&p->node, &rdd->root); @@ -170,7 +171,7 @@ static int ovl_fill_lower(struct ovl_readdir_data *rdd, if (p) { list_move_tail(&p->l_node, &rdd->middle); } else { - p = ovl_cache_entry_new(name, namelen, ino, d_type); + p = ovl_cache_entry_new(rdd->dir, name, namelen, ino, d_type); if (p == NULL) rdd->err = -ENOMEM; else @@ -231,6 +232,7 @@ static inline int ovl_dir_read(struct path *realpath, if (IS_ERR(realfile)) return PTR_ERR(realfile); + rdd->dir = realpath->dentry; rdd->ctx.pos = 0; do { rdd->count = 0; @@ -276,7 +278,6 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list) next = ovl_path_next(idx, dentry, &realpath); if (next != -1) { - rdd.dir = realpath.dentry; err = ovl_dir_read(&realpath, &rdd); if (err) break; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index f72b82fdc1e6..5dbc6789fd5f 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -350,16 +350,12 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (IS_ERR(this)) goto out; - /* - * If this is not the lowermost layer, check whiteout and opaque - * directory. - */ - if (poe->numlower && this) { + if (this) { if (ovl_is_whiteout(this)) { dput(this); this = NULL; upperopaque = true; - } else if (ovl_is_opaquedir(this)) { + } else if (poe->numlower && ovl_is_opaquedir(this)) { upperopaque = true; } } @@ -384,19 +380,16 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, goto out_put; if (!this) continue; - - /* - * If this is not the lowermost layer, check whiteout and opaque - * directory. - */ - if (i < poe->numlower - 1) { - if (ovl_is_whiteout(this)) { - dput(this); - break; - } else if (ovl_is_opaquedir(this)) { - opaque = true; - } + if (ovl_is_whiteout(this)) { + dput(this); + break; } + /* + * Only makes sense to check opaque dir if this is not the + * lowermost layer. + */ + if (i < poe->numlower - 1 && ovl_is_opaquedir(this)) + opaque = true; /* * If this is a non-directory then stop here. * From 5dfc4d41f976aa0916abe3298c71e585e78e5491 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sat, 13 Dec 2014 00:59:45 +0100 Subject: [PATCH 166/236] ovl: lookup ENAMETOOLONG on lower means ENOENT "Suppose you have in one of the lower layers a filesystem with ->lookup()-enforced upper limit on name length. Pretty much every local fs has one, but... they are not all equal. 255 characters is the common upper limit, but e.g. jffs2 stops at 254, minixfs upper limit is somewhere from 14 to 60, depending upon version, etc. You are doing a lookup for something that is present in upper layer, but happens to be too long for one of the lower layers. Too bad - ENAMETOOLONG for you..." Reported-by: Al Viro Signed-off-by: Miklos Szeredi (cherry picked from commit 09e10322b71716adf567d453889ef0871cf226b9) Signed-off-by: Alex Shi --- fs/overlayfs/super.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 5dbc6789fd5f..110c968dcb3b 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -376,8 +376,14 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, opaque = false; this = ovl_lookup_real(lowerpath.dentry, &dentry->d_name); err = PTR_ERR(this); - if (IS_ERR(this)) + if (IS_ERR(this)) { + /* + * If it's positive, then treat ENAMETOOLONG as ENOENT. + */ + if (err == -ENAMETOOLONG && (upperdentry || ctr)) + continue; goto out_put; + } if (!this) continue; if (ovl_is_whiteout(this)) { From f3f4d3482d4f4e8edd9ef8a862676004a07c2e43 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sat, 13 Dec 2014 00:59:46 +0100 Subject: [PATCH 167/236] ovl: allow statfs if no upper layer Handle "no upper layer" case in statfs. Signed-off-by: Miklos Szeredi (cherry picked from commit 4ebc581828d5d0fe189ca06cef8b7a63cb4583d5) Signed-off-by: Alex Shi --- fs/overlayfs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 110c968dcb3b..cc7a0f3aa0dd 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -484,7 +484,7 @@ static void ovl_put_super(struct super_block *sb) * @buf: The struct kstatfs to fill in with stats * * Get the filesystem statistics. As writes always target the upper layer - * filesystem pass the statfs to the same filesystem. + * filesystem pass the statfs to the upper filesystem (if it exists) */ static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf) { @@ -493,7 +493,7 @@ static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf) struct path path; int err; - ovl_path_upper(root_dentry, &path); + ovl_path_real(root_dentry, &path); err = vfs_statfs(&path, buf); if (!err) { From ed3dcf4bb43b6d737b85ba134d25d1483fa12c8e Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sat, 13 Dec 2014 00:59:48 +0100 Subject: [PATCH 168/236] ovl: mount: change order of initialization Move allocation of root entry above to where it's needed. Move initializations related to upperdir and workdir near each other. Signed-off-by: Miklos Szeredi (cherry picked from commit 3b7a9a249a93e68b7bb318de40e64d3b68ba1a6d) Signed-off-by: Alex Shi --- fs/overlayfs/super.c | 74 ++++++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 40 deletions(-) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index cc7a0f3aa0dd..a17702833dd0 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -723,7 +723,6 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) struct path lowerpath; struct path upperpath; struct path workpath; - struct inode *root_inode; struct dentry *root_dentry; struct ovl_entry *oe; struct ovl_fs *ufs; @@ -749,54 +748,49 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) goto out_free_config; } - err = -ENOMEM; - oe = ovl_alloc_entry(1); - if (oe == NULL) - goto out_free_config; - err = ovl_mount_dir(ufs->config.upperdir, &upperpath); if (err) - goto out_free_oe; - - err = ovl_mount_dir(ufs->config.lowerdir, &lowerpath); - if (err) - goto out_put_upperpath; + goto out_free_config; err = ovl_mount_dir(ufs->config.workdir, &workpath); if (err) - goto out_put_lowerpath; + goto out_put_upperpath; + + err = ovl_mount_dir(ufs->config.lowerdir, &lowerpath); + if (err) + goto out_put_workpath; err = -EINVAL; if (!S_ISDIR(upperpath.dentry->d_inode->i_mode) || !S_ISDIR(lowerpath.dentry->d_inode->i_mode) || !S_ISDIR(workpath.dentry->d_inode->i_mode)) { pr_err("overlayfs: upperdir or lowerdir or workdir not a directory\n"); - goto out_put_workpath; + goto out_put_lowerpath; } if (upperpath.mnt != workpath.mnt) { pr_err("overlayfs: workdir and upperdir must reside under the same mount\n"); - goto out_put_workpath; + goto out_put_lowerpath; } if (!ovl_workdir_ok(workpath.dentry, upperpath.dentry)) { pr_err("overlayfs: workdir and upperdir must be separate subtrees\n"); - goto out_put_workpath; + goto out_put_lowerpath; } if (!ovl_is_allowed_fs_type(upperpath.dentry)) { pr_err("overlayfs: filesystem of upperdir is not supported\n"); - goto out_put_workpath; + goto out_put_lowerpath; } if (!ovl_is_allowed_fs_type(lowerpath.dentry)) { pr_err("overlayfs: filesystem of lowerdir is not supported\n"); - goto out_put_workpath; + goto out_put_lowerpath; } err = vfs_statfs(&lowerpath, &statfs); if (err) { pr_err("overlayfs: statfs failed on lowerpath\n"); - goto out_put_workpath; + goto out_put_lowerpath; } ufs->lower_namelen = statfs.f_namelen; @@ -806,19 +800,27 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) err = -EINVAL; if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { pr_err("overlayfs: maximum fs stacking depth exceeded\n"); - goto out_put_workpath; + goto out_put_lowerpath; } ufs->upper_mnt = clone_private_mount(&upperpath); err = PTR_ERR(ufs->upper_mnt); if (IS_ERR(ufs->upper_mnt)) { pr_err("overlayfs: failed to clone upperpath\n"); - goto out_put_workpath; + goto out_put_lowerpath; + } + + ufs->workdir = ovl_workdir_create(ufs->upper_mnt, workpath.dentry); + err = PTR_ERR(ufs->workdir); + if (IS_ERR(ufs->workdir)) { + pr_err("overlayfs: failed to create directory %s/%s\n", + ufs->config.workdir, OVL_WORKDIR_NAME); + goto out_put_upper_mnt; } ufs->lower_mnt = kcalloc(1, sizeof(struct vfsmount *), GFP_KERNEL); if (ufs->lower_mnt == NULL) - goto out_put_upper_mnt; + goto out_put_workdir; mnt = clone_private_mount(&lowerpath); err = PTR_ERR(mnt); @@ -835,14 +837,6 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) ufs->lower_mnt[0] = mnt; ufs->numlower = 1; - ufs->workdir = ovl_workdir_create(ufs->upper_mnt, workpath.dentry); - err = PTR_ERR(ufs->workdir); - if (IS_ERR(ufs->workdir)) { - pr_err("overlayfs: failed to create directory %s/%s\n", - ufs->config.workdir, OVL_WORKDIR_NAME); - goto out_put_lower_mnt; - } - /* If the upper fs is r/o, we mark overlayfs r/o too */ if (ufs->upper_mnt->mnt_sb->s_flags & MS_RDONLY) sb->s_flags |= MS_RDONLY; @@ -850,13 +844,13 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_d_op = &ovl_dentry_operations; err = -ENOMEM; - root_inode = ovl_new_inode(sb, S_IFDIR, oe); - if (!root_inode) - goto out_put_workdir; + oe = ovl_alloc_entry(1); + if (!oe) + goto out_put_lower_mnt; - root_dentry = d_make_root(root_inode); + root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, oe)); if (!root_dentry) - goto out_put_workdir; + goto out_free_oe; mntput(upperpath.mnt); mntput(lowerpath.mnt); @@ -875,22 +869,22 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) return 0; -out_put_workdir: - dput(ufs->workdir); +out_free_oe: + kfree(oe); out_put_lower_mnt: for (i = 0; i < ufs->numlower; i++) mntput(ufs->lower_mnt[i]); kfree(ufs->lower_mnt); +out_put_workdir: + dput(ufs->workdir); out_put_upper_mnt: mntput(ufs->upper_mnt); -out_put_workpath: - path_put(&workpath); out_put_lowerpath: path_put(&lowerpath); +out_put_workpath: + path_put(&workpath); out_put_upperpath: path_put(&upperpath); -out_free_oe: - kfree(oe); out_free_config: kfree(ufs->config.lowerdir); kfree(ufs->config.upperdir); From 4db5f1d8cd9d89c8810c8ed10a4e50d1f0debbd1 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sat, 13 Dec 2014 00:59:49 +0100 Subject: [PATCH 169/236] ovl: improve mount helpers Move common checks into ovl_mount_dir() helper. Create helper for looking up lower directories. Signed-off-by: Miklos Szeredi (cherry picked from commit ab508822cab4c84f07373cd6ad107a1fd1362831) Signed-off-by: Alex Shi --- fs/overlayfs/super.c | 135 +++++++++++++++++++++++++------------------ 1 file changed, 78 insertions(+), 57 deletions(-) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index a17702833dd0..592370ff453a 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -669,24 +669,6 @@ static void ovl_unescape(char *s) } } -static int ovl_mount_dir(const char *name, struct path *path) -{ - int err; - char *tmp = kstrdup(name, GFP_KERNEL); - - if (!tmp) - return -ENOMEM; - - ovl_unescape(tmp); - err = kern_path(tmp, LOOKUP_FOLLOW, path); - if (err) { - pr_err("overlayfs: failed to resolve '%s': %i\n", tmp, err); - err = -EINVAL; - } - kfree(tmp); - return err; -} - static bool ovl_is_allowed_fs_type(struct dentry *root) { const struct dentry_operations *dop = root->d_op; @@ -706,6 +688,71 @@ static bool ovl_is_allowed_fs_type(struct dentry *root) return true; } +static int ovl_mount_dir_noesc(const char *name, struct path *path) +{ + int err; + + err = kern_path(name, LOOKUP_FOLLOW, path); + if (err) { + pr_err("overlayfs: failed to resolve '%s': %i\n", name, err); + goto out; + } + err = -EINVAL; + if (!ovl_is_allowed_fs_type(path->dentry)) { + pr_err("overlayfs: filesystem on '%s' not supported\n", name); + goto out_put; + } + if (!S_ISDIR(path->dentry->d_inode->i_mode)) { + pr_err("overlayfs: '%s' not a directory\n", name); + goto out_put; + } + return 0; + +out_put: + path_put(path); +out: + return err; +} + +static int ovl_mount_dir(const char *name, struct path *path) +{ + int err = -ENOMEM; + char *tmp = kstrdup(name, GFP_KERNEL); + + if (tmp) { + ovl_unescape(tmp); + err = ovl_mount_dir_noesc(tmp, path); + kfree(tmp); + } + return err; +} + +static int ovl_lower_dir(const char *name, struct path *path, long *namelen, + int *stack_depth) +{ + int err; + struct kstatfs statfs; + + err = ovl_mount_dir(name, path); + if (err) + goto out; + + err = vfs_statfs(path, &statfs); + if (err) { + pr_err("overlayfs: statfs failed on '%s'\n", name); + goto out_put; + } + *namelen = max(*namelen, statfs.f_namelen); + *stack_depth = max(*stack_depth, path->mnt->mnt_sb->s_stack_depth); + + return 0; + +out_put: + path_put(path); +out: + return err; +} + /* Workdir should not be subdir of upperdir and vice versa */ static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir) { @@ -726,7 +773,6 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) struct dentry *root_dentry; struct ovl_entry *oe; struct ovl_fs *ufs; - struct kstatfs statfs; struct vfsmount *mnt; unsigned int i; int err; @@ -756,48 +802,23 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (err) goto out_put_upperpath; - err = ovl_mount_dir(ufs->config.lowerdir, &lowerpath); + if (upperpath.mnt != workpath.mnt) { + pr_err("overlayfs: workdir and upperdir must reside under the same mount\n"); + goto out_put_workpath; + } + if (!ovl_workdir_ok(workpath.dentry, upperpath.dentry)) { + pr_err("overlayfs: workdir and upperdir must be separate subtrees\n"); + goto out_put_workpath; + } + sb->s_stack_depth = upperpath.mnt->mnt_sb->s_stack_depth; + + err = ovl_lower_dir(ufs->config.lowerdir, &lowerpath, + &ufs->lower_namelen, &sb->s_stack_depth); if (err) goto out_put_workpath; err = -EINVAL; - if (!S_ISDIR(upperpath.dentry->d_inode->i_mode) || - !S_ISDIR(lowerpath.dentry->d_inode->i_mode) || - !S_ISDIR(workpath.dentry->d_inode->i_mode)) { - pr_err("overlayfs: upperdir or lowerdir or workdir not a directory\n"); - goto out_put_lowerpath; - } - - if (upperpath.mnt != workpath.mnt) { - pr_err("overlayfs: workdir and upperdir must reside under the same mount\n"); - goto out_put_lowerpath; - } - if (!ovl_workdir_ok(workpath.dentry, upperpath.dentry)) { - pr_err("overlayfs: workdir and upperdir must be separate subtrees\n"); - goto out_put_lowerpath; - } - - if (!ovl_is_allowed_fs_type(upperpath.dentry)) { - pr_err("overlayfs: filesystem of upperdir is not supported\n"); - goto out_put_lowerpath; - } - - if (!ovl_is_allowed_fs_type(lowerpath.dentry)) { - pr_err("overlayfs: filesystem of lowerdir is not supported\n"); - goto out_put_lowerpath; - } - - err = vfs_statfs(&lowerpath, &statfs); - if (err) { - pr_err("overlayfs: statfs failed on lowerpath\n"); - goto out_put_lowerpath; - } - ufs->lower_namelen = statfs.f_namelen; - - sb->s_stack_depth = max(upperpath.mnt->mnt_sb->s_stack_depth, - lowerpath.mnt->mnt_sb->s_stack_depth) + 1; - - err = -EINVAL; + sb->s_stack_depth++; if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { pr_err("overlayfs: maximum fs stacking depth exceeded\n"); goto out_put_lowerpath; From b153140225155c1de019eefe5aadde93365f7a66 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sat, 13 Dec 2014 00:59:51 +0100 Subject: [PATCH 170/236] ovl: make upperdir optional Make "upperdir=" mount option optional. If "upperdir=" is not given, then the "workdir=" option is also optional (and ignored if given). Signed-off-by: Miklos Szeredi (cherry picked from commit 53a08cb9b8bccfe58f1228c7c27baf34a83da78b) Signed-off-by: Alex Shi --- fs/overlayfs/super.c | 83 +++++++++++++++++++++++++------------------- 1 file changed, 47 insertions(+), 36 deletions(-) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 592370ff453a..35bb0adf10cf 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -516,8 +516,10 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry) struct ovl_fs *ufs = sb->s_fs_info; seq_printf(m, ",lowerdir=%s", ufs->config.lowerdir); - seq_printf(m, ",upperdir=%s", ufs->config.upperdir); - seq_printf(m, ",workdir=%s", ufs->config.workdir); + if (ufs->config.upperdir) { + seq_printf(m, ",upperdir=%s", ufs->config.upperdir); + seq_printf(m, ",workdir=%s", ufs->config.workdir); + } return 0; } @@ -768,8 +770,8 @@ static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir) static int ovl_fill_super(struct super_block *sb, void *data, int silent) { struct path lowerpath; - struct path upperpath; - struct path workpath; + struct path upperpath = { NULL, NULL }; + struct path workpath = { NULL, NULL }; struct dentry *root_dentry; struct ovl_entry *oe; struct ovl_fs *ufs; @@ -786,31 +788,38 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (err) goto out_free_config; - /* FIXME: workdir is not needed for a R/O mount */ err = -EINVAL; - if (!ufs->config.upperdir || !ufs->config.lowerdir || - !ufs->config.workdir) { - pr_err("overlayfs: missing upperdir or lowerdir or workdir\n"); + if (!ufs->config.lowerdir) { + pr_err("overlayfs: missing 'lowerdir'\n"); goto out_free_config; } - err = ovl_mount_dir(ufs->config.upperdir, &upperpath); - if (err) - goto out_free_config; + sb->s_stack_depth = 0; + if (ufs->config.upperdir) { + /* FIXME: workdir is not needed for a R/O mount */ + if (!ufs->config.workdir) { + pr_err("overlayfs: missing 'workdir'\n"); + goto out_free_config; + } - err = ovl_mount_dir(ufs->config.workdir, &workpath); - if (err) - goto out_put_upperpath; + err = ovl_mount_dir(ufs->config.upperdir, &upperpath); + if (err) + goto out_free_config; - if (upperpath.mnt != workpath.mnt) { - pr_err("overlayfs: workdir and upperdir must reside under the same mount\n"); - goto out_put_workpath; + err = ovl_mount_dir(ufs->config.workdir, &workpath); + if (err) + goto out_put_upperpath; + + if (upperpath.mnt != workpath.mnt) { + pr_err("overlayfs: workdir and upperdir must reside under the same mount\n"); + goto out_put_workpath; + } + if (!ovl_workdir_ok(workpath.dentry, upperpath.dentry)) { + pr_err("overlayfs: workdir and upperdir must be separate subtrees\n"); + goto out_put_workpath; + } + sb->s_stack_depth = upperpath.mnt->mnt_sb->s_stack_depth; } - if (!ovl_workdir_ok(workpath.dentry, upperpath.dentry)) { - pr_err("overlayfs: workdir and upperdir must be separate subtrees\n"); - goto out_put_workpath; - } - sb->s_stack_depth = upperpath.mnt->mnt_sb->s_stack_depth; err = ovl_lower_dir(ufs->config.lowerdir, &lowerpath, &ufs->lower_namelen, &sb->s_stack_depth); @@ -824,19 +833,21 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) goto out_put_lowerpath; } - ufs->upper_mnt = clone_private_mount(&upperpath); - err = PTR_ERR(ufs->upper_mnt); - if (IS_ERR(ufs->upper_mnt)) { - pr_err("overlayfs: failed to clone upperpath\n"); - goto out_put_lowerpath; - } + if (ufs->config.upperdir) { + ufs->upper_mnt = clone_private_mount(&upperpath); + err = PTR_ERR(ufs->upper_mnt); + if (IS_ERR(ufs->upper_mnt)) { + pr_err("overlayfs: failed to clone upperpath\n"); + goto out_put_lowerpath; + } - ufs->workdir = ovl_workdir_create(ufs->upper_mnt, workpath.dentry); - err = PTR_ERR(ufs->workdir); - if (IS_ERR(ufs->workdir)) { - pr_err("overlayfs: failed to create directory %s/%s\n", - ufs->config.workdir, OVL_WORKDIR_NAME); - goto out_put_upper_mnt; + ufs->workdir = ovl_workdir_create(ufs->upper_mnt, workpath.dentry); + err = PTR_ERR(ufs->workdir); + if (IS_ERR(ufs->workdir)) { + pr_err("overlayfs: failed to create directory %s/%s\n", + ufs->config.workdir, OVL_WORKDIR_NAME); + goto out_put_upper_mnt; + } } ufs->lower_mnt = kcalloc(1, sizeof(struct vfsmount *), GFP_KERNEL); @@ -858,8 +869,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) ufs->lower_mnt[0] = mnt; ufs->numlower = 1; - /* If the upper fs is r/o, we mark overlayfs r/o too */ - if (ufs->upper_mnt->mnt_sb->s_flags & MS_RDONLY) + /* If the upper fs is r/o or nonexistent, we mark overlayfs r/o too */ + if (!ufs->upper_mnt || (ufs->upper_mnt->mnt_sb->s_flags & MS_RDONLY)) sb->s_flags |= MS_RDONLY; sb->s_d_op = &ovl_dentry_operations; From cb19636311dcb8206f668af9cbb68e9bafaeea14 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sat, 13 Dec 2014 00:59:52 +0100 Subject: [PATCH 171/236] ovl: support multiple lower layers Allow "lowerdir=" option to contain multiple lower directories separated by a colon (e.g. "lowerdir=/bin:/usr/bin"). Colon characters in filenames can be escaped with a backslash. Signed-off-by: Miklos Szeredi (cherry picked from commit a78d9f0d5d5ca9054703376c7c23c901807ddd87) Signed-off-by: Alex Shi --- Documentation/filesystems/overlayfs.txt | 12 +++ fs/overlayfs/super.c | 110 ++++++++++++++++++------ 2 files changed, 95 insertions(+), 27 deletions(-) diff --git a/Documentation/filesystems/overlayfs.txt b/Documentation/filesystems/overlayfs.txt index a27c950ece61..b37092886dcc 100644 --- a/Documentation/filesystems/overlayfs.txt +++ b/Documentation/filesystems/overlayfs.txt @@ -159,6 +159,18 @@ overlay filesystem (though an operation on the name of the file such as rename or unlink will of course be noticed and handled). +Multiple lower layers +--------------------- + +Multiple lower layers can now be given using the the colon (":") as a +separator character between the directory names. For example: + + mount -t overlay overlay -olowerdir=/lower1:/lower2:/lower3 /merged + +As the example shows, "upperdir=" and "workdir=" may be omitted. In that case +the overlay will be read-only. + + Non-standard behavior --------------------- diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 35bb0adf10cf..5c495a17a5a3 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -60,6 +60,8 @@ struct ovl_entry { struct path lowerstack[]; }; +#define OVL_MAX_STACK 500 + const char *ovl_opaque_xattr = "trusted.overlay.opaque"; static struct dentry *__ovl_dentry_lower(struct ovl_entry *oe) @@ -692,8 +694,12 @@ static bool ovl_is_allowed_fs_type(struct dentry *root) static int ovl_mount_dir_noesc(const char *name, struct path *path) { - int err; + int err = -EINVAL; + if (!*name) { + pr_err("overlayfs: empty lowerdir\n"); + goto out; + } err = kern_path(name, LOOKUP_FOLLOW, path); if (err) { pr_err("overlayfs: failed to resolve '%s': %i\n", name, err); @@ -735,7 +741,7 @@ static int ovl_lower_dir(const char *name, struct path *path, long *namelen, int err; struct kstatfs statfs; - err = ovl_mount_dir(name, path); + err = ovl_mount_dir_noesc(name, path); if (err) goto out; @@ -767,15 +773,38 @@ static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir) return ok; } +static unsigned int ovl_split_lowerdirs(char *str) +{ + unsigned int ctr = 1; + char *s, *d; + + for (s = d = str;; s++, d++) { + if (*s == '\\') { + s++; + } else if (*s == ':') { + *d = '\0'; + ctr++; + continue; + } + *d = *s; + if (!*s) + break; + } + return ctr; +} + static int ovl_fill_super(struct super_block *sb, void *data, int silent) { - struct path lowerpath; struct path upperpath = { NULL, NULL }; struct path workpath = { NULL, NULL }; struct dentry *root_dentry; struct ovl_entry *oe; struct ovl_fs *ufs; - struct vfsmount *mnt; + struct path *stack = NULL; + char *lowertmp; + char *lower; + unsigned int numlower; + unsigned int stacklen = 0; unsigned int i; int err; @@ -820,12 +849,30 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) } sb->s_stack_depth = upperpath.mnt->mnt_sb->s_stack_depth; } - - err = ovl_lower_dir(ufs->config.lowerdir, &lowerpath, - &ufs->lower_namelen, &sb->s_stack_depth); - if (err) + err = -ENOMEM; + lowertmp = kstrdup(ufs->config.lowerdir, GFP_KERNEL); + if (!lowertmp) goto out_put_workpath; + err = -EINVAL; + stacklen = ovl_split_lowerdirs(lowertmp); + if (stacklen > OVL_MAX_STACK) + goto out_free_lowertmp; + + stack = kcalloc(stacklen, sizeof(struct path), GFP_KERNEL); + if (!stack) + goto out_free_lowertmp; + + lower = lowertmp; + for (numlower = 0; numlower < stacklen; numlower++) { + err = ovl_lower_dir(lower, &stack[numlower], + &ufs->lower_namelen, &sb->s_stack_depth); + if (err) + goto out_put_lowerpath; + + lower = strchr(lower, '\0') + 1; + } + err = -EINVAL; sb->s_stack_depth++; if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { @@ -850,24 +897,25 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) } } - ufs->lower_mnt = kcalloc(1, sizeof(struct vfsmount *), GFP_KERNEL); + ufs->lower_mnt = kcalloc(numlower, sizeof(struct vfsmount *), GFP_KERNEL); if (ufs->lower_mnt == NULL) goto out_put_workdir; + for (i = 0; i < numlower; i++) { + struct vfsmount *mnt = clone_private_mount(&stack[i]); - mnt = clone_private_mount(&lowerpath); - err = PTR_ERR(mnt); - if (IS_ERR(mnt)) { - pr_err("overlayfs: failed to clone lowerpath\n"); - goto out_put_lower_mnt; + if (IS_ERR(mnt)) { + pr_err("overlayfs: failed to clone lowerpath\n"); + goto out_put_lower_mnt; + } + /* + * Make lower_mnt R/O. That way fchmod/fchown on lower file + * will fail instead of modifying lower fs. + */ + mnt->mnt_flags |= MNT_READONLY; + + ufs->lower_mnt[ufs->numlower] = mnt; + ufs->numlower++; } - /* - * Make lower_mnt R/O. That way fchmod/fchown on lower file - * will fail instead of modifying lower fs. - */ - mnt->mnt_flags |= MNT_READONLY; - - ufs->lower_mnt[0] = mnt; - ufs->numlower = 1; /* If the upper fs is r/o or nonexistent, we mark overlayfs r/o too */ if (!ufs->upper_mnt || (ufs->upper_mnt->mnt_sb->s_flags & MS_RDONLY)) @@ -876,7 +924,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_d_op = &ovl_dentry_operations; err = -ENOMEM; - oe = ovl_alloc_entry(1); + oe = ovl_alloc_entry(numlower); if (!oe) goto out_put_lower_mnt; @@ -885,12 +933,16 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) goto out_free_oe; mntput(upperpath.mnt); - mntput(lowerpath.mnt); + for (i = 0; i < numlower; i++) + mntput(stack[i].mnt); path_put(&workpath); + kfree(lowertmp); oe->__upperdentry = upperpath.dentry; - oe->lowerstack[0].dentry = lowerpath.dentry; - oe->lowerstack[0].mnt = ufs->lower_mnt[0]; + for (i = 0; i < numlower; i++) { + oe->lowerstack[i].dentry = stack[i].dentry; + oe->lowerstack[i].mnt = ufs->lower_mnt[i]; + } root_dentry->d_fsdata = oe; @@ -912,7 +964,11 @@ out_put_workdir: out_put_upper_mnt: mntput(ufs->upper_mnt); out_put_lowerpath: - path_put(&lowerpath); + for (i = 0; i < numlower; i++) + path_put(&stack[i]); + kfree(stack); +out_free_lowertmp: + kfree(lowertmp); out_put_workpath: path_put(&workpath); out_put_upperpath: From 96149453dee0b67c9ecb1032c0ed0e4aa31fb914 Mon Sep 17 00:00:00 2001 From: hujianyang Date: Wed, 26 Nov 2014 16:16:59 +0800 Subject: [PATCH 172/236] ovl: Cleanup redundant blank lines This patch removes redundant blanks lines in overlayfs. Signed-off-by: hujianyang Signed-off-by: Miklos Szeredi (cherry picked from commit 1ba38725a351f91769918b132c17fb7fcaf6c2f5) Signed-off-by: Alex Shi --- fs/overlayfs/copy_up.c | 1 - fs/overlayfs/inode.c | 1 - fs/overlayfs/super.c | 1 - 3 files changed, 3 deletions(-) diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index a5bfd60f4f6f..24f640441bd9 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -191,7 +191,6 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat) ovl_set_timestamps(upperdentry, stat); return err; - } static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 48492f1240ad..5ac1236afb15 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -433,5 +433,4 @@ struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, } return inode; - } diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 5c495a17a5a3..e9ce4a9e8749 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -106,7 +106,6 @@ void ovl_path_upper(struct dentry *dentry, struct path *path) enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path) { - enum ovl_path_type type = ovl_path_type(dentry); if (!OVL_TYPE_UPPER(type)) From 4fb1e9e207167f999c57954716a51942d0d61674 Mon Sep 17 00:00:00 2001 From: hujianyang Date: Mon, 24 Nov 2014 18:25:21 +0800 Subject: [PATCH 173/236] ovl: Use macros to present ovl_xattr This patch adds two macros: OVL_XATTR_PRE_NAME and OVL_XATTR_PRE_LEN to present ovl_xattr name prefix and its length. Also, a new macro OVL_XATTR_OPAQUE is introduced to replace old *ovl_opaque_xattr*. Fix the length of "trusted.overlay." to *16*. Signed-off-by: hujianyang Signed-off-by: Miklos Szeredi (cherry picked from commit cead89bb08c0f64e23886f1c18df9bb98e97c55c) Signed-off-by: Alex Shi --- fs/overlayfs/dir.c | 4 ++-- fs/overlayfs/inode.c | 2 +- fs/overlayfs/overlayfs.h | 4 +++- fs/overlayfs/super.c | 4 +--- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index dcae3ac5aa76..0dc4c33a0a1b 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -118,14 +118,14 @@ int ovl_create_real(struct inode *dir, struct dentry *newdentry, static int ovl_set_opaque(struct dentry *upperdentry) { - return ovl_do_setxattr(upperdentry, ovl_opaque_xattr, "y", 1, 0); + return ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0); } static void ovl_remove_opaque(struct dentry *upperdentry) { int err; - err = ovl_do_removexattr(upperdentry, ovl_opaque_xattr); + err = ovl_do_removexattr(upperdentry, OVL_XATTR_OPAQUE); if (err) { pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n", upperdentry->d_name.name, err); diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 5ac1236afb15..04f124884687 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -205,7 +205,7 @@ static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz) static bool ovl_is_private_xattr(const char *name) { - return strncmp(name, "trusted.overlay.", 14) == 0; + return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0; } int ovl_setxattr(struct dentry *dentry, const char *name, diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index d176b679f526..17ac5afc9ffb 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -23,7 +23,9 @@ enum ovl_path_type { #define OVL_TYPE_MERGE_OR_LOWER(type) \ (OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type)) -extern const char *ovl_opaque_xattr; +#define OVL_XATTR_PRE_NAME "trusted.overlay." +#define OVL_XATTR_PRE_LEN 16 +#define OVL_XATTR_OPAQUE OVL_XATTR_PRE_NAME"opaque" static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry) { diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index e9ce4a9e8749..84f3144e1b33 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -62,8 +62,6 @@ struct ovl_entry { #define OVL_MAX_STACK 500 -const char *ovl_opaque_xattr = "trusted.overlay.opaque"; - static struct dentry *__ovl_dentry_lower(struct ovl_entry *oe) { return oe->numlower ? oe->lowerstack[0].dentry : NULL; @@ -254,7 +252,7 @@ static bool ovl_is_opaquedir(struct dentry *dentry) if (!S_ISDIR(inode->i_mode) || !inode->i_op->getxattr) return false; - res = inode->i_op->getxattr(dentry, ovl_opaque_xattr, &val, 1); + res = inode->i_op->getxattr(dentry, OVL_XATTR_OPAQUE, &val, 1); if (res == 1 && val == 'y') return true; From 248653d3bb85e5b87899b47e56dc0cf86ee24f89 Mon Sep 17 00:00:00 2001 From: hujianyang Date: Tue, 6 Jan 2015 12:52:13 +0800 Subject: [PATCH 174/236] ovl: Fix kernel panic while mounting overlayfs The function ovl_fill_super() in recently multi-layer support version will incorrectly return 0 at error handling path and then cause kernel panic. This failure can be reproduced by mounting a overlayfs with upperdir and workdir in different mounts. And also, If the memory allocation of *lower_mnt* fail, this function may return an zero either. This patch fix this problem by setting *err* to proper error number before jumping to error handling path. Signed-off-by: hujianyang Signed-off-by: Miklos Szeredi (cherry picked from commit 2f83fd8c2849a388082f30d755a75c1e67c4643b) Signed-off-by: Alex Shi --- fs/overlayfs/super.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 84f3144e1b33..6ca8ea8cb9b2 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -836,6 +836,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (err) goto out_put_upperpath; + err = -EINVAL; if (upperpath.mnt != workpath.mnt) { pr_err("overlayfs: workdir and upperdir must reside under the same mount\n"); goto out_put_workpath; @@ -894,12 +895,14 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) } } + err = -ENOMEM; ufs->lower_mnt = kcalloc(numlower, sizeof(struct vfsmount *), GFP_KERNEL); if (ufs->lower_mnt == NULL) goto out_put_workdir; for (i = 0; i < numlower; i++) { struct vfsmount *mnt = clone_private_mount(&stack[i]); + err = PTR_ERR(mnt); if (IS_ERR(mnt)) { pr_err("overlayfs: failed to clone lowerpath\n"); goto out_put_lower_mnt; From ef2578e619a9107cb007644f38701d6774a46c5f Mon Sep 17 00:00:00 2001 From: hujianyang Date: Tue, 6 Jan 2015 16:10:01 +0800 Subject: [PATCH 175/236] ovl: Fix opaque regression in ovl_lookup Current multi-layer support overlayfs has a regression in .lookup(). If there is a directory in upperdir and a regular file has same name in lowerdir in a merged directory, lower file is hidden and upper directory is set to opaque in former case. But it is changed in present code. In lowerdir lookup path, if a found inode is not directory, the type checking of previous inode is missing. This inode will be copied to the lowerstack of ovl_entry directly. That will lead to several wrong conditions, for example, the reading of the directory in upperdir may return an error like: ls: reading directory .: Not a directory This patch makes the lowerdir lookup path check the opaque for non-directory file too. Signed-off-by: hujianyang Signed-off-by: Miklos Szeredi (cherry picked from commit a425c037f3dd8a56469158ab5f37beb46402d958) Signed-off-by: Alex Shi --- fs/overlayfs/super.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 6ca8ea8cb9b2..9e94f4acef1a 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -372,7 +372,6 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, bool opaque = false; struct path lowerpath = poe->lowerstack[i]; - opaque = false; this = ovl_lookup_real(lowerpath.dentry, &dentry->d_name); err = PTR_ERR(this); if (IS_ERR(this)) { @@ -395,20 +394,24 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, */ if (i < poe->numlower - 1 && ovl_is_opaquedir(this)) opaque = true; - /* - * If this is a non-directory then stop here. - * - * FIXME: check for opaqueness maybe better done in remove code. - */ - if (!S_ISDIR(this->d_inode->i_mode)) { - opaque = true; - } else if (prev && (!S_ISDIR(prev->d_inode->i_mode) || - !S_ISDIR(this->d_inode->i_mode))) { + + if (prev && (!S_ISDIR(prev->d_inode->i_mode) || + !S_ISDIR(this->d_inode->i_mode))) { + /* + * FIXME: check for upper-opaqueness maybe better done + * in remove code. + */ if (prev == upperdentry) upperopaque = true; dput(this); break; } + /* + * If this is a non-directory then stop here. + */ + if (!S_ISDIR(this->d_inode->i_mode)) + opaque = true; + stack[ctr].dentry = this; stack[ctr].mnt = lowerpath.mnt; ctr++; From 1676d3ec23db0db95de768672df3355c92589a0e Mon Sep 17 00:00:00 2001 From: Seunghun Lee Date: Sat, 3 Jan 2015 02:26:49 +0900 Subject: [PATCH 176/236] ovl: Prevent rw remount when it should be ro mount Overlayfs should be mounted read-only when upper-fs is read-only or nonexistent. But now it can be remounted read-write and this can cause kernel panic. So we should prevent read-write remount when the above situation happens. Signed-off-by: Seunghun Lee Signed-off-by: Miklos Szeredi (cherry picked from commit 3cdf6fe91041b3afd6761f76254f7b6cbe8020fc) Signed-off-by: Alex Shi --- fs/overlayfs/super.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 9e94f4acef1a..b90952f528b1 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -525,10 +525,22 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry) return 0; } +static int ovl_remount(struct super_block *sb, int *flags, char *data) +{ + struct ovl_fs *ufs = sb->s_fs_info; + + if (!(*flags & MS_RDONLY) && + (!ufs->upper_mnt || (ufs->upper_mnt->mnt_sb->s_flags & MS_RDONLY))) + return -EROFS; + + return 0; +} + static const struct super_operations ovl_super_operations = { .put_super = ovl_put_super, .statfs = ovl_statfs, .show_options = ovl_show_options, + .remount_fs = ovl_remount, }; enum { From add5fbd4b119cd897c130cc2282c4040dc004b88 Mon Sep 17 00:00:00 2001 From: hujianyang Date: Thu, 11 Dec 2014 10:30:18 +0800 Subject: [PATCH 177/236] ovl: discard independent cursor in readdir() Since the ovl_dir_cache is stable during a directory reading, the cursor of struct ovl_dir_file don't need to be an independent entry in the list of a merged directory. This patch changes *cursor* to a pointer which points to the entry in the ovl_dir_cache. After this, we don't need to check *is_cursor* either. Signed-off-by: hujianyang Signed-off-by: Miklos Szeredi (cherry picked from commit 4330397e4e8a662f36d101659e2a59ce32e76ff4) Signed-off-by: Alex Shi --- fs/overlayfs/readdir.c | 38 ++++++++++++++------------------------ 1 file changed, 14 insertions(+), 24 deletions(-) diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 2b8315b290a5..907870e81a72 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -24,7 +24,6 @@ struct ovl_cache_entry { struct list_head l_node; struct rb_node node; bool is_whiteout; - bool is_cursor; char name[]; }; @@ -49,7 +48,7 @@ struct ovl_dir_file { bool is_real; bool is_upper; struct ovl_dir_cache *cache; - struct ovl_cache_entry cursor; + struct list_head *cursor; struct file *realfile; struct file *upperfile; }; @@ -97,7 +96,6 @@ static struct ovl_cache_entry *ovl_cache_entry_new(struct dentry *dir, p->type = d_type; p->ino = ino; p->is_whiteout = false; - p->is_cursor = false; if (d_type == DT_CHR) { struct dentry *dentry; @@ -196,7 +194,6 @@ static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry) { struct ovl_dir_cache *cache = od->cache; - list_del_init(&od->cursor.l_node); WARN_ON(cache->refcount <= 0); cache->refcount--; if (!cache->refcount) { @@ -256,6 +253,7 @@ static void ovl_dir_reset(struct file *file) if (cache && ovl_dentry_version_get(dentry) != cache->version) { ovl_cache_put(od, dentry); od->cache = NULL; + od->cursor = NULL; } WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type)); if (od->is_real && OVL_TYPE_MERGE(type)) @@ -297,17 +295,16 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list) static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos) { - struct ovl_cache_entry *p; + struct list_head *p; loff_t off = 0; - list_for_each_entry(p, &od->cache->entries, l_node) { - if (p->is_cursor) - continue; + list_for_each(p, &od->cache->entries) { if (off >= pos) break; off++; } - list_move_tail(&od->cursor.l_node, &p->l_node); + /* Cursor is safe since the cache is stable */ + od->cursor = p; } static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry) @@ -346,6 +343,7 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx) { struct ovl_dir_file *od = file->private_data; struct dentry *dentry = file->f_path.dentry; + struct ovl_cache_entry *p; if (!ctx->pos) ovl_dir_reset(file); @@ -364,19 +362,13 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx) ovl_seek_cursor(od, ctx->pos); } - while (od->cursor.l_node.next != &od->cache->entries) { - struct ovl_cache_entry *p; - - p = list_entry(od->cursor.l_node.next, struct ovl_cache_entry, l_node); - /* Skip cursors */ - if (!p->is_cursor) { - if (!p->is_whiteout) { - if (!dir_emit(ctx, p->name, p->len, p->ino, p->type)) - break; - } - ctx->pos++; - } - list_move(&od->cursor.l_node, &p->l_node); + while (od->cursor != &od->cache->entries) { + p = list_entry(od->cursor, struct ovl_cache_entry, l_node); + if (!p->is_whiteout) + if (!dir_emit(ctx, p->name, p->len, p->ino, p->type)) + break; + od->cursor = p->l_node.next; + ctx->pos++; } return 0; } @@ -495,11 +487,9 @@ static int ovl_dir_open(struct inode *inode, struct file *file) kfree(od); return PTR_ERR(realfile); } - INIT_LIST_HEAD(&od->cursor.l_node); od->realfile = realfile; od->is_real = !OVL_TYPE_MERGE(type); od->is_upper = OVL_TYPE_UPPER(type); - od->cursor.is_cursor = true; file->private_data = od; return 0; From 088805e56746eafd55a13e2dc7fb37d046cfdfdb Mon Sep 17 00:00:00 2001 From: hujianyang Date: Thu, 15 Jan 2015 13:17:36 +0800 Subject: [PATCH 178/236] ovl: print error message for invalid mount options Overlayfs should print an error message if an incorrect mount option is caught like other filesystems. After this patch, improper option input could be clearly known. Reported-by: Fabian Sturm Signed-off-by: hujianyang Signed-off-by: Miklos Szeredi (cherry picked from commit bead55ef775f6e25a8d286c0d47030580f577bec) Signed-off-by: Alex Shi --- fs/overlayfs/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index b90952f528b1..ab3c8cb8e52d 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -615,6 +615,7 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) break; default: + pr_err("overlayfs: unrecognized mount option \"%s\" or missing value\n", p); return -EINVAL; } } From 0d8e803097108c61b26175178e62465a7d36c889 Mon Sep 17 00:00:00 2001 From: hujianyang Date: Thu, 15 Jan 2015 13:19:21 +0800 Subject: [PATCH 179/236] ovl: check lowerdir amount for non-upper mount Recently multi-lower layer mount support allow upperdir and workdir to be omitted, then cause overlayfs can be mount with only one lowerdir directory. This action make no sense and have potential risk. This patch check the total number of lower directories to prevent mounting overlayfs with only one directory. Also, an error message is added to indicate lower directories exceed OVL_MAX_STACK limit. Signed-off-by: hujianyang Signed-off-by: Miklos Szeredi (cherry picked from commit 6be4506e34cf6075a1307b646e0a6c46c1c9010d) Signed-off-by: Alex Shi --- fs/overlayfs/super.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index ab3c8cb8e52d..edbb3ebcdaad 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -870,8 +870,14 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) err = -EINVAL; stacklen = ovl_split_lowerdirs(lowertmp); - if (stacklen > OVL_MAX_STACK) + if (stacklen > OVL_MAX_STACK) { + pr_err("overlayfs: too many lower directries, limit is %d\n", + OVL_MAX_STACK); goto out_free_lowertmp; + } else if (!ufs->config.upperdir && stacklen == 1) { + pr_err("overlayfs: at least 2 lowerdir are needed while upperdir nonexistent\n"); + goto out_free_lowertmp; + } stack = kcalloc(stacklen, sizeof(struct path), GFP_KERNEL); if (!stack) From d7170c004481f9fbe303a391774e0fcb238f0e78 Mon Sep 17 00:00:00 2001 From: hujianyang Date: Thu, 15 Jan 2015 13:20:57 +0800 Subject: [PATCH 180/236] ovl: upper fs should not be R/O After importing multi-lower layer support, users could mount a r/o partition as the left most lowerdir instead of using it as upperdir. And a r/o upperdir may cause an error like overlayfs: failed to create directory ./workdir/work during mount. This patch check the *s_flags* of upper fs and return an error if it is a r/o partition. The checking of *upper_mnt->mnt_sb->s_flags* can be removed now. This patch also remove /* FIXME: workdir is not needed for a R/O mount */ from ovl_fill_super() because: 1) for upper fs r/o case Setting a r/o partition as upper is prevented, no need to care about workdir in this case. 2) for "mount overlay -o ro" with a r/w upper fs case Users could remount overlayfs to r/w in this case, so workdir should not be omitted. Signed-off-by: hujianyang Signed-off-by: Miklos Szeredi (cherry picked from commit 71cbad7e694ee81233b3be3a38b81c3d5872cc6f) Signed-off-by: Alex Shi --- fs/overlayfs/super.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index edbb3ebcdaad..5f0d1993e6e3 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -529,8 +529,7 @@ static int ovl_remount(struct super_block *sb, int *flags, char *data) { struct ovl_fs *ufs = sb->s_fs_info; - if (!(*flags & MS_RDONLY) && - (!ufs->upper_mnt || (ufs->upper_mnt->mnt_sb->s_flags & MS_RDONLY))) + if (!(*flags & MS_RDONLY) && !ufs->upper_mnt) return -EROFS; return 0; @@ -619,6 +618,15 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) return -EINVAL; } } + + /* Workdir is useless in non-upper mount */ + if (!config->upperdir && config->workdir) { + pr_info("overlayfs: option \"workdir=%s\" is useless in a non-upper mount, ignore\n", + config->workdir); + kfree(config->workdir); + config->workdir = NULL; + } + return 0; } @@ -838,7 +846,6 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_stack_depth = 0; if (ufs->config.upperdir) { - /* FIXME: workdir is not needed for a R/O mount */ if (!ufs->config.workdir) { pr_err("overlayfs: missing 'workdir'\n"); goto out_free_config; @@ -848,6 +855,13 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (err) goto out_free_config; + /* Upper fs should not be r/o */ + if (upperpath.mnt->mnt_sb->s_flags & MS_RDONLY) { + pr_err("overlayfs: upper fs is r/o, try multi-lower layers mount\n"); + err = -EINVAL; + goto out_put_upperpath; + } + err = ovl_mount_dir(ufs->config.workdir, &workpath); if (err) goto out_put_upperpath; @@ -939,8 +953,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) ufs->numlower++; } - /* If the upper fs is r/o or nonexistent, we mark overlayfs r/o too */ - if (!ufs->upper_mnt || (ufs->upper_mnt->mnt_sb->s_flags & MS_RDONLY)) + /* If the upper fs is nonexistent, we mark overlayfs r/o too */ + if (!ufs->upper_mnt) sb->s_flags |= MS_RDONLY; sb->s_d_op = &ovl_dentry_operations; From 59b077027a431f849a07bd330f0de14c31376743 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sat, 13 Dec 2014 00:59:53 +0100 Subject: [PATCH 181/236] ovl: add testsuite to docs Reported-by: Sedat Dilek Signed-off-by: Miklos Szeredi (cherry picked from commit 2b7a8f36f092a7855f6438cd42d6990394f450fa) Signed-off-by: Alex Shi --- Documentation/filesystems/overlayfs.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Documentation/filesystems/overlayfs.txt b/Documentation/filesystems/overlayfs.txt index b37092886dcc..006ea480506d 100644 --- a/Documentation/filesystems/overlayfs.txt +++ b/Documentation/filesystems/overlayfs.txt @@ -208,3 +208,15 @@ Changes to the underlying filesystems while part of a mounted overlay filesystem are not allowed. If the underlying filesystem is changed, the behavior of the overlay is undefined, though it will not result in a crash or deadlock. + +Testsuite +--------- + +There's testsuite developed by David Howells at: + + git://git.infradead.org/users/dhowells/unionmount-testsuite.git + +Run as root: + + # cd unionmount-testsuite + # ./run --ov From 6fb17a412207343151aac1fde91935a542339deb Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 8 Jan 2015 15:09:15 +0100 Subject: [PATCH 182/236] ovl: document lower layer ordering Reported-by: Fabian Sturm Signed-off-by: Miklos Szeredi (cherry picked from commit 6d900f5a33393067e370736d39798f814f5e25cc) Signed-off-by: Alex Shi --- Documentation/filesystems/overlayfs.txt | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Documentation/filesystems/overlayfs.txt b/Documentation/filesystems/overlayfs.txt index 006ea480506d..6db0e5d1da07 100644 --- a/Documentation/filesystems/overlayfs.txt +++ b/Documentation/filesystems/overlayfs.txt @@ -167,8 +167,12 @@ separator character between the directory names. For example: mount -t overlay overlay -olowerdir=/lower1:/lower2:/lower3 /merged -As the example shows, "upperdir=" and "workdir=" may be omitted. In that case -the overlay will be read-only. +As the example shows, "upperdir=" and "workdir=" may be omitted. In +that case the overlay will be read-only. + +The specified lower directories will be stacked beginning from the +rightmost one and going left. In the above example lower1 will be the +top, lower2 the middle and lower3 the bottom layer. Non-standard behavior From f858097eb8433b0c28ceaca8e992b338fb392772 Mon Sep 17 00:00:00 2001 From: Hui Wang Date: Thu, 26 Mar 2015 17:14:55 +0800 Subject: [PATCH 183/236] ALSA: hda - Add one more node in the EAPD supporting candidate list commit af95b41426e0b58279f8ff0ebe420df49a4e96b8 upstream. We have a HP machine which use the codec node 0x17 connecting the internal speaker, and from the node capability, we saw the EAPD, if we don't set the EAPD on for this node, the internal speaker can't output any sound. BugLink: https://bugs.launchpad.net/bugs/1436745 Signed-off-by: Hui Wang Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_realtek.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 910f2dbe1b24..94615072e373 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -271,7 +271,7 @@ static void alc_auto_setup_eapd(struct hda_codec *codec, bool on) { /* We currently only handle front, HP */ static hda_nid_t pins[] = { - 0x0f, 0x10, 0x14, 0x15, 0 + 0x0f, 0x10, 0x14, 0x15, 0x17, 0 }; hda_nid_t *p; for (p = pins; *p; p++) From 6a8bc76409acd4bc64250f43f974d4e6463a39fa Mon Sep 17 00:00:00 2001 From: "Dmitry M. Fedin" Date: Thu, 9 Apr 2015 17:37:03 +0300 Subject: [PATCH 184/236] ALSA: usb - Creative USB X-Fi Pro SB1095 volume knob support commit 3dc8523fa7412e731441c01fb33f003eb3cfece1 upstream. Adds an entry for Creative USB X-Fi to the rc_config array in mixer_quirks.c to allow use of volume knob on the device. Adds support for newer X-Fi Pro card, known as "Model No. SB1095" with USB ID "041e:3237" Signed-off-by: Dmitry M. Fedin Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/usb/mixer_quirks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/usb/mixer_quirks.c b/sound/usb/mixer_quirks.c index 5a723df670b4..a82ec53b8fb3 100644 --- a/sound/usb/mixer_quirks.c +++ b/sound/usb/mixer_quirks.c @@ -178,6 +178,7 @@ static const struct rc_config { { USB_ID(0x041e, 0x3040), 2, 2, 6, 6, 2, 0x6e91 }, /* Live! 24-bit */ { USB_ID(0x041e, 0x3042), 0, 1, 1, 1, 1, 0x000d }, /* Usb X-Fi S51 */ { USB_ID(0x041e, 0x30df), 0, 1, 1, 1, 1, 0x000d }, /* Usb X-Fi S51 Pro */ + { USB_ID(0x041e, 0x3237), 0, 1, 1, 1, 1, 0x000d }, /* Usb X-Fi S51 Pro */ { USB_ID(0x041e, 0x3048), 2, 2, 6, 6, 2, 0x6e91 }, /* Toshiba SB0500 */ }; From 328927c7aaf8d7743ccd1f0df514155ff46af694 Mon Sep 17 00:00:00 2001 From: Kailang Yang Date: Wed, 8 Apr 2015 16:34:00 +0800 Subject: [PATCH 185/236] ALSA: hda/realtek - Make more stable to get pin sense for ALC283 commit a59d7199f62b8336570972dcc288321d0ec999fe upstream. Pin sense will active when power pin is wake up. Power pin will not wake up immediately during resume state. Add some delay to wait for power pin activated. Signed-off-by: Kailang Yang Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_realtek.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 94615072e373..82b8a87b8614 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -2885,6 +2885,8 @@ static void alc283_init(struct hda_codec *codec) if (!hp_pin) return; + + msleep(30); hp_pin_sense = snd_hda_jack_detect(codec, hp_pin); /* Index 0x43 Direct Drive HP AMP LPM Control 1 */ From 613adacd18f72858ccf92aed9ce0735b14dc25c5 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 8 Apr 2015 20:47:55 +0200 Subject: [PATCH 186/236] ALSA: hda - Fix headphone pin config for Lifebook T731 commit cc7016ab1a22fb26f388c2fb2b692b89897cbc3e upstream. Some BIOS version of Fujitsu Lifebook T731 seems to set up the headphone pin (0x21) without the assoc number 0x0f while it's set only to the output on the docking port (0x1a). With the recent commit [03ad6a8c93b6: ALSA: hda - Fix "PCM" name being used on one DAC when there are two DACs], this resulted in the weird mixer element mapping where the headphone on the laptop is assigned as a shared volume with the speaker and the docking port is assigned as an individual headphone. This patch improves the situation by correcting the headphone pin config to the more appropriate value. Reported-and-tested-by: Taylor Smock Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_realtek.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 82b8a87b8614..ca26373ebe70 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -3953,6 +3953,7 @@ enum { ALC269_FIXUP_QUANTA_MUTE, ALC269_FIXUP_LIFEBOOK, ALC269_FIXUP_LIFEBOOK_EXTMIC, + ALC269_FIXUP_LIFEBOOK_HP_PIN, ALC269_FIXUP_AMIC, ALC269_FIXUP_DMIC, ALC269VB_FIXUP_AMIC, @@ -4087,6 +4088,13 @@ static const struct hda_fixup alc269_fixups[] = { { } }, }, + [ALC269_FIXUP_LIFEBOOK_HP_PIN] = { + .type = HDA_FIXUP_PINS, + .v.pins = (const struct hda_pintbl[]) { + { 0x21, 0x0221102f }, /* HP out */ + { } + }, + }, [ALC269_FIXUP_AMIC] = { .type = HDA_FIXUP_PINS, .v.pins = (const struct hda_pintbl[]) { @@ -4540,6 +4548,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x104d, 0x9084, "Sony VAIO", ALC275_FIXUP_SONY_HWEQ), SND_PCI_QUIRK_VENDOR(0x104d, "Sony VAIO", ALC269_FIXUP_SONY_VAIO), SND_PCI_QUIRK(0x10cf, 0x1475, "Lifebook", ALC269_FIXUP_LIFEBOOK), + SND_PCI_QUIRK(0x10cf, 0x15dc, "Lifebook T731", ALC269_FIXUP_LIFEBOOK_HP_PIN), SND_PCI_QUIRK(0x10cf, 0x1845, "Lifebook U904", ALC269_FIXUP_LIFEBOOK_EXTMIC), SND_PCI_QUIRK(0x17aa, 0x20f2, "Thinkpad SL410/510", ALC269_FIXUP_SKU_IGNORE), SND_PCI_QUIRK(0x17aa, 0x215e, "Thinkpad L512", ALC269_FIXUP_SKU_IGNORE), From 2a4188fdc3846b89f2ee85fcd64ca708d9fa96be Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 26 Feb 2015 09:55:03 +0100 Subject: [PATCH 187/236] PCI/AER: Avoid info leak in __print_tlp_header() commit a1b7f2f6367944d445c6853035830a35c6343939 upstream. Commit fab4c256a58b ("PCI/AER: Add a TLP header print helper") introduced the helper function __print_tlp_header(), but contrary to the intention, the behaviour did change: Since we're taking the address of the parameter t, the first 4 or 8 bytes printed will be the value of the pointer t itself, and the remaining 12 or 8 bytes will be who-knows-what (something from the stack). We want to show the values of the four members of the struct aer_header_log_regs; that can be done without ugly and error-prone casts. On little-endian this should produce the same output as originally intended, and since no-one has complained about getting garbage output so far, I think big-endian should be ok too. Fixes: fab4c256a58b ("PCI/AER: Add a TLP header print helper") Signed-off-by: Rasmus Villemoes Signed-off-by: Bjorn Helgaas Acked-by: Borislav Petkov Signed-off-by: Greg Kroah-Hartman --- drivers/pci/pcie/aer/aerdrv_errprint.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/pci/pcie/aer/aerdrv_errprint.c b/drivers/pci/pcie/aer/aerdrv_errprint.c index 34ff7026440c..5d3b45640181 100644 --- a/drivers/pci/pcie/aer/aerdrv_errprint.c +++ b/drivers/pci/pcie/aer/aerdrv_errprint.c @@ -127,16 +127,8 @@ static const char *aer_agent_string[] = { static void __print_tlp_header(struct pci_dev *dev, struct aer_header_log_regs *t) { - unsigned char *tlp = (unsigned char *)&t; - - dev_err(&dev->dev, " TLP Header:" - " %02x%02x%02x%02x %02x%02x%02x%02x" - " %02x%02x%02x%02x %02x%02x%02x%02x\n", - *(tlp + 3), *(tlp + 2), *(tlp + 1), *tlp, - *(tlp + 7), *(tlp + 6), *(tlp + 5), *(tlp + 4), - *(tlp + 11), *(tlp + 10), *(tlp + 9), - *(tlp + 8), *(tlp + 15), *(tlp + 14), - *(tlp + 13), *(tlp + 12)); + dev_err(&dev->dev, " TLP Header: %08x %08x %08x %08x\n", + t->dw0, t->dw1, t->dw2, t->dw3); } static void __aer_print_error(struct pci_dev *dev, From 4188b15dbacf9d21b3308d3c5d798b20a459e9a7 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Thu, 26 Mar 2015 09:25:44 +0530 Subject: [PATCH 188/236] ARC: SA_SIGINFO ucontext regs off-by-one commit 6914e1e3f63caa829431160f0f7093292daef2d5 upstream. The regfile provided to SA_SIGINFO signal handler as ucontext was off by one due to pt_regs gutter cleanups in 2013. Before handling signal, user pt_regs are copied onto user_regs_struct and copied back later. Both structs are binary compatible. This was all fine until commit 2fa919045b72 (ARC: pt_regs update #2) which removed the empty stack slot at top of pt_regs (corresponding to first pad) and made the corresponding fixup in struct user_regs_struct (the pad in there was moved out of @scratch - not removed altogether as it is part of ptrace ABI) struct user_regs_struct { + long pad; struct { - long pad; long bta, lp_start, lp_end,.... } scratch; ... } This meant that now user_regs_struct was off by 1 reg w.r.t pt_regs and signal code needs to user_regs_struct.scratch to reflect it as pt_regs, which is what this commit does. This problem was hidden for 2 years, because both save/restore, despite using wrong location, were using the same location. Only an interim inspection (reproducer below) exposed the issue. void handle_segv(int signo, siginfo_t *info, void *context) { ucontext_t *uc = context; struct user_regs_struct *regs = &(uc->uc_mcontext.regs); printf("regs %x %x\n", <=== prints 7 8 (vs. 8 9) regs->scratch.r8, regs->scratch.r9); } int main() { struct sigaction sa; sa.sa_sigaction = handle_segv; sa.sa_flags = SA_SIGINFO; sigemptyset(&sa.sa_mask); sigaction(SIGSEGV, &sa, NULL); asm volatile( "mov r7, 7 \n" "mov r8, 8 \n" "mov r9, 9 \n" "mov r10, 10 \n" :::"r7","r8","r9","r10"); *((unsigned int*)0x10) = 0; } Fixes: 2fa919045b72ec892e "ARC: pt_regs update #2: Remove unused gutter at start of pt_regs" Signed-off-by: Vineet Gupta Signed-off-by: Greg Kroah-Hartman --- arch/arc/kernel/signal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arc/kernel/signal.c b/arch/arc/kernel/signal.c index 7e95e1a86510..d68b410595c8 100644 --- a/arch/arc/kernel/signal.c +++ b/arch/arc/kernel/signal.c @@ -67,7 +67,7 @@ stash_usr_regs(struct rt_sigframe __user *sf, struct pt_regs *regs, sigset_t *set) { int err; - err = __copy_to_user(&(sf->uc.uc_mcontext.regs), regs, + err = __copy_to_user(&(sf->uc.uc_mcontext.regs.scratch), regs, sizeof(sf->uc.uc_mcontext.regs.scratch)); err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(sigset_t)); @@ -83,7 +83,7 @@ static int restore_usr_regs(struct pt_regs *regs, struct rt_sigframe __user *sf) if (!err) set_current_blocked(&set); - err |= __copy_from_user(regs, &(sf->uc.uc_mcontext.regs), + err |= __copy_from_user(regs, &(sf->uc.uc_mcontext.regs.scratch), sizeof(sf->uc.uc_mcontext.regs.scratch)); return err; From 93aad6489861adb737296b48c9f4484ab7b04b7c Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 23 Mar 2015 18:01:35 -0700 Subject: [PATCH 189/236] selinux: fix sel_write_enforce broken return value commit 6436a123a147db51a0b06024a8350f4c230e73ff upstream. Return a negative error value like the rest of the entries in this function. Signed-off-by: Joe Perches Acked-by: Stephen Smalley [PM: tweaked subject line] Signed-off-by: Paul Moore Signed-off-by: Greg Kroah-Hartman --- security/selinux/selinuxfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index d60c0ee66387..6c4cbd97a673 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -152,7 +152,7 @@ static ssize_t sel_write_enforce(struct file *file, const char __user *buf, goto out; /* No partial writes. */ - length = EINVAL; + length = -EINVAL; if (*ppos != 0) goto out; From 79ea2bfc7d9cac1949bec2cb09b622becb7d54b6 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 15 Apr 2015 19:00:32 +0100 Subject: [PATCH 190/236] tcp: Fix crash in TCP Fast Open Commit 355a901e6cf1 ("tcp: make connect() mem charging friendly") changed tcp_send_syn_data() to perform an open-coded copy of the 'syn' skb rather than using skb_copy_expand(). The open-coded copy does not cover the skb_shared_info::gso_segs field, so in the new skb it is left set to 0. When this commit was backported into stable branches between 3.10.y and 3.16.7-ckty inclusive, it triggered the BUG() in tcp_transmit_skb(). Since Linux 3.18 the GSO segment count is kept in the tcp_skb_cb::tcp_gso_segs field and tcp_send_syn_data() does copy the tcp_skb_cb structure to the new skb, so mainline and newer stable branches are not affected. Set skb_shared_info::gso_segs to the correct value of 1. Signed-off-by: Ben Hutchings Acked-by: Eric Dumazet Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_output.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index d0c310801479..96f64e59d70c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2933,6 +2933,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) goto fallback; syn_data->ip_summed = CHECKSUM_PARTIAL; memcpy(syn_data->cb, syn->cb, sizeof(syn->cb)); + skb_shinfo(syn_data)->gso_segs = 1; if (unlikely(memcpy_fromiovecend(skb_put(syn_data, space), fo->data->msg_iov, 0, space))) { kfree_skb(syn_data); From 5ab0f4da798ac88c2f320806b872212cabbc7f5d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 2 Jan 2015 19:12:57 +0100 Subject: [PATCH 191/236] btrfs: simplify insert_orphan_item commit 9c4f61f01d269815bb7c37be3ede59c5587747c6 upstream. We can search and add the orphan item in one go, btrfs_insert_orphan_item will find out if the item already exists. Signed-off-by: David Sterba Cc: Chris Mason Cc: Roman Mamedov Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/tree-log.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index a7f32bfdd5e7..ec8b6542c8bd 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1235,21 +1235,13 @@ out: } static int insert_orphan_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 offset) + struct btrfs_root *root, u64 ino) { int ret; - struct btrfs_path *path; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - ret = btrfs_find_item(root, path, BTRFS_ORPHAN_OBJECTID, - offset, BTRFS_ORPHAN_ITEM_KEY, NULL); - if (ret > 0) - ret = btrfs_insert_orphan_item(trans, root, offset); - - btrfs_free_path(path); + ret = btrfs_insert_orphan_item(trans, root, ino); + if (ret == -EEXIST) + ret = 0; return ret; } From 485d37770f43e476566ec83c8059c81989a86301 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Sun, 14 Sep 2014 16:47:52 +0300 Subject: [PATCH 192/236] IB/core: Avoid leakage from kernel to user space commit 377b513485fd885dea1083a9a5430df65b35e048 upstream. Clear the reserved field of struct ib_uverbs_async_event_desc which is copied to user space. Signed-off-by: Eli Cohen Reviewed-by: Yann Droneaud Signed-off-by: Roland Dreier Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/core/uverbs_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 08219fb3338b..7a515c867674 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -476,6 +476,7 @@ static void ib_uverbs_async_handler(struct ib_uverbs_file *file, entry->desc.async.element = element; entry->desc.async.event_type = event; + entry->desc.async.reserved = 0; entry->counter = counter; list_add_tail(&entry->list, &file->async_file->event_list); From f40fc98d0b046ba153442b3bfd7887437f2ebd30 Mon Sep 17 00:00:00 2001 From: Shachar Raindel Date: Wed, 18 Mar 2015 17:39:08 +0000 Subject: [PATCH 193/236] IB/uverbs: Prevent integer overflow in ib_umem_get address arithmetic commit 8494057ab5e40df590ef6ef7d66324d3ae33356b upstream. Properly verify that the resulting page aligned end address is larger than both the start address and the length of the memory area requested. Both the start and length arguments for ib_umem_get are controlled by the user. A misbehaving user can provide values which will cause an integer overflow when calculating the page aligned end address. This overflow can cause also miscalculation of the number of pages mapped, and additional logic issues. Addresses: CVE-2014-8159 Signed-off-by: Shachar Raindel Signed-off-by: Jack Morgenstein Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/core/umem.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index a84112322071..055ebebc07dd 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -94,6 +94,14 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, if (dmasync) dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs); + /* + * If the combination of the addr and size requested for this memory + * region causes an integer overflow, return error. + */ + if ((PAGE_ALIGN(addr + size) <= size) || + (PAGE_ALIGN(addr + size) <= addr)) + return ERR_PTR(-EINVAL); + if (!can_do_mlock()) return ERR_PTR(-EPERM); From 98163132369d94dbd1d6268d7f549342fa6c4e7a Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Mon, 16 Mar 2015 09:08:07 +0200 Subject: [PATCH 194/236] iwlwifi: dvm: run INIT firmware again upon .start() commit 9c8928f5176766bec79f272bd47b7124e11cccbd upstream. The assumption before this patch was that we don't need to run again the INIT firmware after the system booted. The INIT firmware runs calibrations which impact the physical layer's behavior. Users reported that it may be helpful to run these calibrations again every time the interface is brought up. The penatly is minimal, since the calibrations run fast. This fixes: https://bugzilla.kernel.org/show_bug.cgi?id=94341 Signed-off-by: Emmanuel Grumbach Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/iwlwifi/dvm/dev.h | 1 - drivers/net/wireless/iwlwifi/dvm/ucode.c | 5 ----- 2 files changed, 6 deletions(-) diff --git a/drivers/net/wireless/iwlwifi/dvm/dev.h b/drivers/net/wireless/iwlwifi/dvm/dev.h index 3441f70d0ff9..6e8cdb8a0cc5 100644 --- a/drivers/net/wireless/iwlwifi/dvm/dev.h +++ b/drivers/net/wireless/iwlwifi/dvm/dev.h @@ -708,7 +708,6 @@ struct iwl_priv { unsigned long reload_jiffies; int reload_count; bool ucode_loaded; - bool init_ucode_run; /* Don't run init uCode again */ u8 plcp_delta_threshold; diff --git a/drivers/net/wireless/iwlwifi/dvm/ucode.c b/drivers/net/wireless/iwlwifi/dvm/ucode.c index cf03ef5619d9..8b2dedc30159 100644 --- a/drivers/net/wireless/iwlwifi/dvm/ucode.c +++ b/drivers/net/wireless/iwlwifi/dvm/ucode.c @@ -418,9 +418,6 @@ int iwl_run_init_ucode(struct iwl_priv *priv) if (!priv->fw->img[IWL_UCODE_INIT].sec[0].len) return 0; - if (priv->init_ucode_run) - return 0; - iwl_init_notification_wait(&priv->notif_wait, &calib_wait, calib_complete, ARRAY_SIZE(calib_complete), iwlagn_wait_calib, priv); @@ -440,8 +437,6 @@ int iwl_run_init_ucode(struct iwl_priv *priv) */ ret = iwl_wait_notification(&priv->notif_wait, &calib_wait, UCODE_CALIB_TIMEOUT); - if (!ret) - priv->init_ucode_run = true; goto out; From 95d7ade78a9dc90b71396d1ddb9da403e03418f2 Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Tue, 27 Jan 2015 18:08:22 +0530 Subject: [PATCH 195/236] nbd: fix possible memory leak commit ff6b8090e26ef7649ef0cc6b42389141ef48b0cf upstream. we have already allocated memory for nbd_dev, but we were not releasing that memory and just returning the error value. Signed-off-by: Sudip Mukherjee Acked-by: Paul Clements Signed-off-by: Markus Pargmann Signed-off-by: Greg Kroah-Hartman --- drivers/block/nbd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 55298db36b2d..d18093681af2 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -814,10 +814,6 @@ static int __init nbd_init(void) return -EINVAL; } - nbd_dev = kcalloc(nbds_max, sizeof(*nbd_dev), GFP_KERNEL); - if (!nbd_dev) - return -ENOMEM; - part_shift = 0; if (max_part > 0) { part_shift = fls(max_part); @@ -839,6 +835,10 @@ static int __init nbd_init(void) if (nbds_max > 1UL << (MINORBITS - part_shift)) return -EINVAL; + nbd_dev = kcalloc(nbds_max, sizeof(*nbd_dev), GFP_KERNEL); + if (!nbd_dev) + return -ENOMEM; + for (i = 0; i < nbds_max; i++) { struct gendisk *disk = alloc_disk(1 << part_shift); if (!disk) From 4c605b1d706cff4c4c43e1d0712d19478ef880ab Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Wed, 25 Mar 2015 15:55:20 -0700 Subject: [PATCH 196/236] mm/memory hotplug: postpone the reset of obsolete pgdat commit b0dc3a342af36f95a68fe229b8f0f73552c5ca08 upstream. Qiu Xishi reported the following BUG when testing hot-add/hot-remove node under stress condition: BUG: unable to handle kernel paging request at 0000000000025f60 IP: next_online_pgdat+0x1/0x50 PGD 0 Oops: 0000 [#1] SMP ACPI: Device does not support D3cold Modules linked in: fuse nls_iso8859_1 nls_cp437 vfat fat loop dm_mod coretemp mperf crc32c_intel ghash_clmulni_intel aesni_intel ablk_helper cryptd lrw gf128mul glue_helper aes_x86_64 pcspkr microcode igb dca i2c_algo_bit ipv6 megaraid_sas iTCO_wdt i2c_i801 i2c_core iTCO_vendor_support tg3 sg hwmon ptp lpc_ich pps_core mfd_core acpi_pad rtc_cmos button ext3 jbd mbcache sd_mod crc_t10dif scsi_dh_alua scsi_dh_rdac scsi_dh_hp_sw scsi_dh_emc scsi_dh ahci libahci libata scsi_mod [last unloaded: rasf] CPU: 23 PID: 238 Comm: kworker/23:1 Tainted: G O 3.10.15-5885-euler0302 #1 Hardware name: HUAWEI TECHNOLOGIES CO.,LTD. Huawei N1/Huawei N1, BIOS V100R001 03/02/2015 Workqueue: events vmstat_update task: ffffa800d32c0000 ti: ffffa800d32ae000 task.ti: ffffa800d32ae000 RIP: 0010: next_online_pgdat+0x1/0x50 RSP: 0018:ffffa800d32afce8 EFLAGS: 00010286 RAX: 0000000000001440 RBX: ffffffff81da53b8 RCX: 0000000000000082 RDX: 0000000000000000 RSI: 0000000000000082 RDI: 0000000000000000 RBP: ffffa800d32afd28 R08: ffffffff81c93bfc R09: ffffffff81cbdc96 R10: 00000000000040ec R11: 00000000000000a0 R12: ffffa800fffb3440 R13: ffffa800d32afd38 R14: 0000000000000017 R15: ffffa800e6616800 FS: 0000000000000000(0000) GS:ffffa800e6600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000025f60 CR3: 0000000001a0b000 CR4: 00000000001407e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: refresh_cpu_vm_stats+0xd0/0x140 vmstat_update+0x11/0x50 process_one_work+0x194/0x3d0 worker_thread+0x12b/0x410 kthread+0xc6/0xd0 ret_from_fork+0x7c/0xb0 The cause is the "memset(pgdat, 0, sizeof(*pgdat))" at the end of try_offline_node, which will reset all the content of pgdat to 0, as the pgdat is accessed lock-free, so that the users still using the pgdat will panic, such as the vmstat_update routine. process A: offline node XX: vmstat_updat() refresh_cpu_vm_stats() for_each_populated_zone() find online node XX cond_resched() offline cpu and memory, then try_offline_node() node_set_offline(nid), and memset(pgdat, 0, sizeof(*pgdat)) zone = next_zone(zone) pg_data_t *pgdat = zone->zone_pgdat; // here pgdat is NULL now next_online_pgdat(pgdat) next_online_node(pgdat->node_id); // NULL pointer access So the solution here is postponing the reset of obsolete pgdat from try_offline_node() to hotadd_new_pgdat(), and just resetting pgdat->nr_zones and pgdat->classzone_idx to be 0 rather than the memset 0 to avoid breaking pointer information in pgdat. Signed-off-by: Gu Zheng Reported-by: Xishi Qiu Suggested-by: KAMEZAWA Hiroyuki Cc: David Rientjes Cc: Yasuaki Ishimatsu Cc: Taku Izumi Cc: Tang Chen Cc: Xie XiuQi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/memory_hotplug.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index f6f23833de44..c8e450132813 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1016,6 +1016,10 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) return NULL; arch_refresh_nodedata(nid, pgdat); + } else { + /* Reset the nr_zones and classzone_idx to 0 before reuse */ + pgdat->nr_zones = 0; + pgdat->classzone_idx = 0; } /* we can use NODE_DATA(nid) from here */ @@ -1863,15 +1867,6 @@ void try_offline_node(int nid) if (is_vmalloc_addr(zone->wait_table)) vfree(zone->wait_table); } - - /* - * Since there is no way to guarentee the address of pgdat/zone is not - * on stack of any kernel threads or used by other kernel objects - * without reference counting or other symchronizing method, do not - * reset node_data and free pgdat here. Just reset it to 0 and reuse - * the memory when the node is online again. - */ - memset(pgdat, 0, sizeof(*pgdat)); } EXPORT_SYMBOL(try_offline_node); From da715490285d40afb34148891a2b720043633ad8 Mon Sep 17 00:00:00 2001 From: Brian Silverman Date: Wed, 18 Feb 2015 16:23:56 -0800 Subject: [PATCH 197/236] sched: Fix RLIMIT_RTTIME when PI-boosting to RT commit 746db9443ea57fd9c059f62c4bfbf41cf224fe13 upstream. When non-realtime tasks get priority-inheritance boosted to a realtime scheduling class, RLIMIT_RTTIME starts to apply to them. However, the counter used for checking this (the same one used for SCHED_RR timeslices) was not getting reset. This meant that tasks running with a non-realtime scheduling class which are repeatedly boosted to a realtime one, but never block while they are running realtime, eventually hit the timeout without ever running for a time over the limit. This patch resets the realtime timeslice counter when un-PI-boosting from an RT to a non-RT scheduling class. I have some test code with two threads and a shared PTHREAD_PRIO_INHERIT mutex which induces priority boosting and spins while boosted that gets killed by a SIGXCPU on non-fixed kernels but doesn't with this patch applied. It happens much faster with a CONFIG_PREEMPT_RT kernel, and does happen eventually with PREEMPT_VOLUNTARY kernels. Signed-off-by: Brian Silverman Signed-off-by: Peter Zijlstra (Intel) Cc: austin@peloton-tech.com Link: http://lkml.kernel.org/r/1424305436-6716-1-git-send-email-brian@peloton-tech.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- kernel/sched/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9a3f3c4e1f5a..5e973efc036e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2980,6 +2980,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio) } else { if (dl_prio(oldprio)) p->dl.dl_boosted = 0; + if (rt_prio(oldprio)) + p->rt.timeout = 0; p->sched_class = &fair_sched_class; } From d8b274f40cb5cf5ef5cd18f9830a60233011b631 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 4 Mar 2015 10:37:43 -0500 Subject: [PATCH 198/236] writeback: add missing INITIAL_JIFFIES init in global_update_bandwidth() commit 7d70e15480c0450d2bfafaad338a32e884fc215e upstream. global_update_bandwidth() uses static variable update_time as the timestamp for the last update but forgets to initialize it to INITIALIZE_JIFFIES. This means that global_dirty_limit will be 5 mins into the future on 32bit and some large amount jiffies into the past on 64bit. This isn't critical as the only effect is that global_dirty_limit won't be updated for the first 5 mins after booting on 32bit machines, especially given the auxiliary nature of global_dirty_limit's role - protecting against global dirty threshold's sudden dips; however, it does lead to unintended suboptimal behavior. Fix it. Fixes: c42843f2f0bb ("writeback: introduce smoothed global dirty limit") Signed-off-by: Tejun Heo Acked-by: Jan Kara Cc: Wu Fengguang Cc: Jens Axboe Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- mm/page-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 9f45f87a5859..d3653325a255 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -943,7 +943,7 @@ static void global_update_bandwidth(unsigned long thresh, unsigned long now) { static DEFINE_SPINLOCK(dirty_lock); - static unsigned long update_time; + static unsigned long update_time = INITIAL_JIFFIES; /* * check locklessly first to optimize away locking for the most time From fbcec5461485e43b7a3324aa06b5a22e1b872e41 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 23 Mar 2015 00:18:48 -0400 Subject: [PATCH 199/236] writeback: fix possible underflow in write bandwidth calculation commit c72efb658f7c8b27ca3d0efb5cfd5ded9fcac89e upstream. From 1ebf33901ecc75d9496862dceb1ef0377980587c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 23 Mar 2015 00:08:19 -0400 2f800fbd777b ("writeback: fix dirtied pages accounting on redirty") introduced account_page_redirty() which reverts stat updates for a redirtied page, making BDI_DIRTIED no longer monotonically increasing. bdi_update_write_bandwidth() uses the delta in BDI_DIRTIED as the basis for bandwidth calculation. While unlikely, since the above patch, the newer value may be lower than the recorded past value and underflow the bandwidth calculation leading to a wild result. Fix it by subtracing min of the old and new values when calculating delta. AFAIK, there hasn't been any report of it happening but the resulting erratic behavior would be non-critical and temporary, so it's possible that the issue is happening without being reported. The risk of the fix is very low, so tagged for -stable. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Fixes: 2f800fbd777b ("writeback: fix dirtied pages accounting on redirty") Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- mm/page-writeback.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d3653325a255..51d8d15f48d7 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -878,8 +878,11 @@ static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, * bw * elapsed + write_bandwidth * (period - elapsed) * write_bandwidth = --------------------------------------------------- * period + * + * @written may have decreased due to account_page_redirty(). + * Avoid underflowing @bw calculation. */ - bw = written - bdi->written_stamp; + bw = written - min(written, bdi->written_stamp); bw *= HZ; if (unlikely(elapsed > period)) { do_div(bw, elapsed); From 76a1444b9961eca2697f4b121a40e76b70e87a82 Mon Sep 17 00:00:00 2001 From: David Miller Date: Wed, 18 Mar 2015 23:18:40 -0400 Subject: [PATCH 200/236] radeon: Do not directly dereference pointers to BIOS area. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit f2c9e560b406f2f6b14b345c7da33467dee9cdf2 upstream. Use readb() and memcpy_fromio() accessors instead. Reviewed-by: Christian König Signed-off-by: David S. Miller Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/radeon/radeon_bios.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_bios.c b/drivers/gpu/drm/radeon/radeon_bios.c index 9ab30976287d..c43335ce8778 100644 --- a/drivers/gpu/drm/radeon/radeon_bios.c +++ b/drivers/gpu/drm/radeon/radeon_bios.c @@ -76,7 +76,7 @@ static bool igp_read_bios_from_vram(struct radeon_device *rdev) static bool radeon_read_bios(struct radeon_device *rdev) { - uint8_t __iomem *bios; + uint8_t __iomem *bios, val1, val2; size_t size; rdev->bios = NULL; @@ -86,15 +86,19 @@ static bool radeon_read_bios(struct radeon_device *rdev) return false; } - if (size == 0 || bios[0] != 0x55 || bios[1] != 0xaa) { + val1 = readb(&bios[0]); + val2 = readb(&bios[1]); + + if (size == 0 || val1 != 0x55 || val2 != 0xaa) { pci_unmap_rom(rdev->pdev, bios); return false; } - rdev->bios = kmemdup(bios, size, GFP_KERNEL); + rdev->bios = kzalloc(size, GFP_KERNEL); if (rdev->bios == NULL) { pci_unmap_rom(rdev->pdev, bios); return false; } + memcpy_fromio(rdev->bios, bios, size); pci_unmap_rom(rdev->pdev, bios); return true; } From fd4ce47308e2987035ddc3293757914b2dcff2a4 Mon Sep 17 00:00:00 2001 From: Doug Goldstein Date: Sun, 15 Mar 2015 21:56:04 -0500 Subject: [PATCH 201/236] USB: ftdi_sio: Added custom PID for Synapse Wireless product commit 4899c054a90439477b24da8977db8d738376fe90 upstream. Synapse Wireless uses the FTDI VID with a custom PID of 0x9090 for their SNAP Stick 200 product. Signed-off-by: Doug Goldstein Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/ftdi_sio.c | 1 + drivers/usb/serial/ftdi_sio_ids.h | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c index 923500595357..c738b5fca8db 100644 --- a/drivers/usb/serial/ftdi_sio.c +++ b/drivers/usb/serial/ftdi_sio.c @@ -617,6 +617,7 @@ static const struct usb_device_id id_table_combined[] = { .driver_info = (kernel_ulong_t)&ftdi_jtag_quirk }, { USB_DEVICE(FTDI_VID, FTDI_NT_ORIONLXM_PID), .driver_info = (kernel_ulong_t)&ftdi_jtag_quirk }, + { USB_DEVICE(FTDI_VID, FTDI_SYNAPSE_SS200_PID) }, /* * ELV devices: */ diff --git a/drivers/usb/serial/ftdi_sio_ids.h b/drivers/usb/serial/ftdi_sio_ids.h index 56b1b55c4751..4e4f46f3c89c 100644 --- a/drivers/usb/serial/ftdi_sio_ids.h +++ b/drivers/usb/serial/ftdi_sio_ids.h @@ -561,6 +561,12 @@ */ #define FTDI_NT_ORIONLXM_PID 0x7c90 /* OrionLXm Substation Automation Platform */ +/* + * Synapse Wireless product ids (FTDI_VID) + * http://www.synapse-wireless.com + */ +#define FTDI_SYNAPSE_SS200_PID 0x9090 /* SS200 - SNAP Stick 200 */ + /********************************/ /** third-party VID/PID combos **/ From 244f310987d1764dfdf890b43c13b883dada745d Mon Sep 17 00:00:00 2001 From: Doug Goldstein Date: Mon, 23 Mar 2015 20:34:48 -0500 Subject: [PATCH 202/236] USB: ftdi_sio: Use jtag quirk for SNAP Connect E10 commit b229a0f840f774d29d8fedbf5deb344ca36b7f1a upstream. This patch uses the existing CALAO Systems ftdi_8u2232c_probe in order to avoid attaching a TTY to the JTAG port as this board is based on the CALAO Systems reference design and needs the same fix up. Signed-off-by: Doug Goldstein [johan: clean up probe logic ] Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/ftdi_sio.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c index c738b5fca8db..2d858f81ab33 100644 --- a/drivers/usb/serial/ftdi_sio.c +++ b/drivers/usb/serial/ftdi_sio.c @@ -1902,8 +1902,12 @@ static int ftdi_8u2232c_probe(struct usb_serial *serial) { struct usb_device *udev = serial->dev; - if ((udev->manufacturer && !strcmp(udev->manufacturer, "CALAO Systems")) || - (udev->product && !strcmp(udev->product, "BeagleBone/XDS100V2"))) + if (udev->manufacturer && !strcmp(udev->manufacturer, "CALAO Systems")) + return ftdi_jtag_probe(serial); + + if (udev->product && + (!strcmp(udev->product, "BeagleBone/XDS100V2") || + !strcmp(udev->product, "SNAP Connect E10"))) return ftdi_jtag_probe(serial); return 0; From 7d4f2fa495097c0992d7a2cad197295f60179c41 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 4 Mar 2015 10:31:47 +0100 Subject: [PATCH 203/236] Defer processing of REQ_PREEMPT requests for blocked devices commit bba0bdd7ad4713d82338bcd9b72d57e9335a664b upstream. SCSI transport drivers and SCSI LLDs block a SCSI device if the transport layer is not operational. This means that in this state no requests should be processed, even if the REQ_PREEMPT flag has been set. This patch avoids that a rescan shortly after a cable pull sporadically triggers the following kernel oops: BUG: unable to handle kernel paging request at ffffc9001a6bc084 IP: [] mlx4_ib_post_send+0xd2/0xb30 [mlx4_ib] Process rescan-scsi-bus (pid: 9241, threadinfo ffff88053484a000, task ffff880534aae100) Call Trace: [] srp_post_send+0x65/0x70 [ib_srp] [] srp_queuecommand+0x1cf/0x3e0 [ib_srp] [] scsi_dispatch_cmd+0x101/0x280 [scsi_mod] [] scsi_request_fn+0x411/0x4d0 [scsi_mod] [] __blk_run_queue+0x27/0x30 [] blk_execute_rq_nowait+0x82/0x110 [] blk_execute_rq+0x62/0xf0 [] scsi_execute+0xe8/0x190 [scsi_mod] [] scsi_execute_req+0xa3/0x130 [scsi_mod] [] scsi_probe_lun+0x17a/0x450 [scsi_mod] [] scsi_probe_and_add_lun+0x156/0x480 [scsi_mod] [] __scsi_scan_target+0xdf/0x1f0 [scsi_mod] [] scsi_scan_host_selected+0x183/0x1c0 [scsi_mod] [] scsi_scan+0xdb/0xe0 [scsi_mod] [] store_scan+0x13/0x20 [scsi_mod] [] sysfs_write_file+0xcb/0x160 [] vfs_write+0xce/0x140 [] sys_write+0x53/0xa0 [] system_call_fastpath+0x16/0x1b [<00007f611c9d9300>] 0x7f611c9d92ff Reported-by: Max Gurtuvoy Signed-off-by: Bart Van Assche Reviewed-by: Mike Christie Signed-off-by: James Bottomley Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/scsi_lib.c | 4 +++- include/linux/blk_types.h | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 64e487a8bf59..719bd8257520 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1258,9 +1258,11 @@ int scsi_prep_state_check(struct scsi_device *sdev, struct request *req) "rejecting I/O to dead device\n"); ret = BLKPREP_KILL; break; - case SDEV_QUIESCE: case SDEV_BLOCK: case SDEV_CREATED_BLOCK: + ret = BLKPREP_DEFER; + break; + case SDEV_QUIESCE: /* * If the devices is blocked we defer normal commands. */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index bbc3a6c88fce..33fd7ff53a77 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -180,7 +180,9 @@ enum rq_flag_bits { __REQ_ELVPRIV, /* elevator private data attached */ __REQ_FAILED, /* set if the request failed */ __REQ_QUIET, /* don't worry about errors */ - __REQ_PREEMPT, /* set for "ide_preempt" requests */ + __REQ_PREEMPT, /* set for "ide_preempt" requests and also + for requests for which the SCSI "quiesce" + state must be ignored. */ __REQ_ALLOCED, /* request came from our alloc pool */ __REQ_COPY_USER, /* contains copies of user pages */ __REQ_FLUSH_SEQ, /* request for flush sequence */ From 1d780d7aa25a0f0081144b5a63db23e16e9ba320 Mon Sep 17 00:00:00 2001 From: Viorel Suman Date: Wed, 18 Feb 2015 20:05:21 +0200 Subject: [PATCH 204/236] iio: inv_mpu6050: Clear timestamps fifo while resetting hardware fifo commit 4dac0a8eefd55bb1f157d1a5a084531334a2d74c upstream. A hardware fifo reset always imply an invalidation of the existing timestamps, so we'll clear timestamps fifo on successfull hardware fifo reset. Signed-off-by: Viorel Suman Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/imu/inv_mpu6050/inv_mpu_ring.c | 25 ++++++++++++---------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/drivers/iio/imu/inv_mpu6050/inv_mpu_ring.c b/drivers/iio/imu/inv_mpu6050/inv_mpu_ring.c index 429517117eff..30fce6723e61 100644 --- a/drivers/iio/imu/inv_mpu6050/inv_mpu_ring.c +++ b/drivers/iio/imu/inv_mpu6050/inv_mpu_ring.c @@ -25,6 +25,16 @@ #include #include "inv_mpu_iio.h" +static void inv_clear_kfifo(struct inv_mpu6050_state *st) +{ + unsigned long flags; + + /* take the spin lock sem to avoid interrupt kick in */ + spin_lock_irqsave(&st->time_stamp_lock, flags); + kfifo_reset(&st->timestamps); + spin_unlock_irqrestore(&st->time_stamp_lock, flags); +} + int inv_reset_fifo(struct iio_dev *indio_dev) { int result; @@ -51,6 +61,10 @@ int inv_reset_fifo(struct iio_dev *indio_dev) INV_MPU6050_BIT_FIFO_RST); if (result) goto reset_fifo_fail; + + /* clear timestamps fifo */ + inv_clear_kfifo(st); + /* enable interrupt */ if (st->chip_config.accl_fifo_enable || st->chip_config.gyro_fifo_enable) { @@ -84,16 +98,6 @@ reset_fifo_fail: return result; } -static void inv_clear_kfifo(struct inv_mpu6050_state *st) -{ - unsigned long flags; - - /* take the spin lock sem to avoid interrupt kick in */ - spin_lock_irqsave(&st->time_stamp_lock, flags); - kfifo_reset(&st->timestamps); - spin_unlock_irqrestore(&st->time_stamp_lock, flags); -} - /** * inv_mpu6050_irq_handler() - Cache a timestamp at each data ready interrupt. */ @@ -185,7 +189,6 @@ end_session: flush_fifo: /* Flush HW and SW FIFOs. */ inv_reset_fifo(indio_dev); - inv_clear_kfifo(st); mutex_unlock(&indio_dev->mlock); iio_trigger_notify_done(indio_dev->trig); From aef709320362aa4c86e96e1e48fb6c3aa357d17f Mon Sep 17 00:00:00 2001 From: Darshana Padmadas Date: Sat, 28 Mar 2015 12:07:14 +0530 Subject: [PATCH 205/236] iio: imu: Use iio_trigger_get for indio_dev->trig assignment commit 4ce7ca89d6e8eae9e201cd0e972ba323f33e2fb4 upstream. This patch uses iio_trigger_get to increment the reference count of trigger device, to avoid incorrect assignment. Can result in a null pointer dereference during removal if the trigger has been changed before removal. This patch refers to a similar situation encountered through the following discussion: http://www.spinics.net/lists/linux-iio/msg13669.html Signed-off-by: Darshana Padmadas Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/imu/adis_trigger.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/imu/adis_trigger.c b/drivers/iio/imu/adis_trigger.c index e0017c22bb9c..f53e9a803a0e 100644 --- a/drivers/iio/imu/adis_trigger.c +++ b/drivers/iio/imu/adis_trigger.c @@ -60,7 +60,7 @@ int adis_probe_trigger(struct adis *adis, struct iio_dev *indio_dev) iio_trigger_set_drvdata(adis->trig, adis); ret = iio_trigger_register(adis->trig); - indio_dev->trig = adis->trig; + indio_dev->trig = iio_trigger_get(adis->trig); if (ret) goto error_free_irq; From 8b4104be9c087876bd89b47ad29f2dc136655465 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Fri, 27 Mar 2015 13:35:52 +0200 Subject: [PATCH 206/236] dmaengine: omap-dma: Fix memory leak when terminating running transfer commit 02d88b735f5a60f04dbf6d051b76e1877a0d0844 upstream. In omap_dma_start_desc the vdesc->node is removed from the virt-dma framework managed lists (to be precise from the desc_issued list). If a terminate_all comes before the transfer finishes the omap_desc will not be freed up because it is not in any of the lists and we stopped the DMA channel so the transfer will not going to complete. There is no special sequence for leaking memory when using cyclic (audio) transfer: with every start and stop of a cyclic transfer the driver leaks struct omap_desc worth of memory. Free up the allocated memory directly in omap_dma_terminate_all() since the framework will not going to do that for us. Signed-off-by: Peter Ujfalusi CC: Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/omap-dma.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/dma/omap-dma.c b/drivers/dma/omap-dma.c index 362e7c49f2e1..12f82942e347 100644 --- a/drivers/dma/omap-dma.c +++ b/drivers/dma/omap-dma.c @@ -487,6 +487,7 @@ static int omap_dma_terminate_all(struct omap_chan *c) * c->desc is NULL and exit.) */ if (c->desc) { + omap_dma_desc_free(&c->desc->vd); c->desc = NULL; /* Avoid stopping the dma twice */ if (!c->paused) From 3a5845d8b9b6a68b9a1d50abd4739418fc17c855 Mon Sep 17 00:00:00 2001 From: Stefan Lippers-Hollmann Date: Mon, 30 Mar 2015 22:44:27 +0200 Subject: [PATCH 207/236] x86/reboot: Add ASRock Q1900DC-ITX mainboard reboot quirk commit 80313b3078fcd2ca51970880d90757f05879a193 upstream. The ASRock Q1900DC-ITX mainboard (Baytrail-D) hangs randomly in both BIOS and UEFI mode while rebooting unless reboot=pci is used. Add a quirk to reboot via the pci method. The problem is very intermittent and hard to debug, it might succeed rebooting just fine 40 times in a row - but fails half a dozen times the next day. It seems to be slightly less common in BIOS CSM mode than native UEFI (with the CSM disabled), but it does happen in either mode. Since I've started testing this patch in late january, rebooting has been 100% reliable. Most of the time it already hangs during POST, but occasionally it might even make it through the bootloader and the kernel might even start booting, but then hangs before the mode switch. The same symptoms occur with grub-efi, gummiboot and grub-pc, just as well as (at least) kernel 3.16-3.19 and 4.0-rc6 (I haven't tried older kernels than 3.16). Upgrading to the most current mainboard firmware of the ASRock Q1900DC-ITX, version 1.20, does not improve the situation. ( Searching the web seems to suggest that other Bay Trail-D mainboards might be affected as well. ) -- Signed-off-by: Stefan Lippers-Hollmann Cc: Matt Fleming Link: http://lkml.kernel.org/r/20150330224427.0fb58e42@mir Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/reboot.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index c752cb43e52f..a6aa91f77654 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -181,6 +181,16 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { }, }, + /* ASRock */ + { /* Handle problems with rebooting on ASRock Q1900DC-ITX */ + .callback = set_pci_reboot, + .ident = "ASRock Q1900DC-ITX", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASRock"), + DMI_MATCH(DMI_BOARD_NAME, "Q1900DC-ITX"), + }, + }, + /* ASUS */ { /* Handle problems with rebooting on ASUS P4S800 */ .callback = set_bios_reboot, From 92fe878fdabf6b29d1550f40dc65eaa249c19e57 Mon Sep 17 00:00:00 2001 From: Andreas Werner Date: Sun, 22 Mar 2015 17:35:52 +0100 Subject: [PATCH 208/236] can: flexcan: Deferred on Regulator return EPROBE_DEFER commit 555828ef45f825d6ee06559f0304163550eed380 upstream. Return EPROBE_DEFER if Regulator returns EPROBE_DEFER If the Flexcan driver is built into kernel and a regulator is used to enable the CAN transceiver, the Flexcan driver may not use the regulator. When initializing the Flexcan device with a regulator defined in the device tree, but not initialized, the regulator subsystem returns EPROBE_DEFER, hence the Flexcan init fails. The solution for this is to return EPROBE_DEFER if regulator is not initialized and wait until the regulator is initialized. Signed-off-by: Andreas Werner Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- drivers/net/can/flexcan.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/net/can/flexcan.c b/drivers/net/can/flexcan.c index 61376abdab39..dbd8d21a8f10 100644 --- a/drivers/net/can/flexcan.c +++ b/drivers/net/can/flexcan.c @@ -1095,12 +1095,19 @@ static int flexcan_probe(struct platform_device *pdev) const struct flexcan_devtype_data *devtype_data; struct net_device *dev; struct flexcan_priv *priv; + struct regulator *reg_xceiver; struct resource *mem; struct clk *clk_ipg = NULL, *clk_per = NULL; void __iomem *base; int err, irq; u32 clock_freq = 0; + reg_xceiver = devm_regulator_get(&pdev->dev, "xceiver"); + if (PTR_ERR(reg_xceiver) == -EPROBE_DEFER) + return -EPROBE_DEFER; + else if (IS_ERR(reg_xceiver)) + reg_xceiver = NULL; + if (pdev->dev.of_node) of_property_read_u32(pdev->dev.of_node, "clock-frequency", &clock_freq); @@ -1162,9 +1169,7 @@ static int flexcan_probe(struct platform_device *pdev) priv->pdata = dev_get_platdata(&pdev->dev); priv->devtype_data = devtype_data; - priv->reg_xceiver = devm_regulator_get(&pdev->dev, "xceiver"); - if (IS_ERR(priv->reg_xceiver)) - priv->reg_xceiver = NULL; + priv->reg_xceiver = reg_xceiver; netif_napi_add(dev, &priv->napi, flexcan_poll, FLEXCAN_NAPI_WEIGHT); From 6ac0e0a446a0d07b83bd8bc09b439b7785a9f089 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 31 Mar 2015 20:15:09 +0200 Subject: [PATCH 209/236] cpuidle: remove state_count field from struct cpuidle_device commit d75e4af14e228bbe3f86e29bcecb8e6be98d4e04 upstream. Thomas Schlichter reports the following issue on his Samsung NC20: "The C-states C1 and C2 to the OS when connected to AC, and additionally provides the C3 C-state when disconnected from AC. However, the number of C-states shown in sysfs is fixed to the number of C-states present at boot. If I boot with AC connected, I always only see the C-states up to C2 even if I disconnect AC. The reason is commit 130a5f692425 (ACPI / cpuidle: remove dev->state_count setting). It removes the update of dev->state_count, but sysfs uses exactly this variable to show the C-states. The fix is to use drv->state_count in sysfs. As this is currently the last user of dev->state_count, this variable can be completely removed." Remove dev->state_count as per the above. Reported-by: Thomas Schlichter Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Kyungmin Park Acked-by: Daniel Lezcano [ rjw: Changelog ] Signed-off-by: Rafael J. Wysocki Signed-off-by: Greg Kroah-Hartman --- drivers/cpuidle/cpuidle.c | 3 --- drivers/cpuidle/sysfs.c | 5 +++-- include/linux/cpuidle.h | 1 - 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index a55e68f2cfc8..e3d2052e7552 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -252,9 +252,6 @@ int cpuidle_enable_device(struct cpuidle_device *dev) if (!dev->registered) return -EINVAL; - if (!dev->state_count) - dev->state_count = drv->state_count; - ret = cpuidle_add_device_sysfs(dev); if (ret) return ret; diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c index e918b6d0caf7..dcaae4c8bc08 100644 --- a/drivers/cpuidle/sysfs.c +++ b/drivers/cpuidle/sysfs.c @@ -398,7 +398,7 @@ static int cpuidle_add_state_sysfs(struct cpuidle_device *device) struct cpuidle_driver *drv = cpuidle_get_cpu_driver(device); /* state statistics */ - for (i = 0; i < device->state_count; i++) { + for (i = 0; i < drv->state_count; i++) { kobj = kzalloc(sizeof(struct cpuidle_state_kobj), GFP_KERNEL); if (!kobj) goto error_state; @@ -430,9 +430,10 @@ error_state: */ static void cpuidle_remove_state_sysfs(struct cpuidle_device *device) { + struct cpuidle_driver *drv = cpuidle_get_cpu_driver(device); int i; - for (i = 0; i < device->state_count; i++) + for (i = 0; i < drv->state_count; i++) cpuidle_free_state_kobj(device, i); } diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 50fcbb0ac4e7..d1338170b6b9 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -69,7 +69,6 @@ struct cpuidle_device { unsigned int cpu; int last_residency; - int state_count; struct cpuidle_state_usage states_usage[CPUIDLE_STATE_MAX]; struct cpuidle_state_kobj *kobjs[CPUIDLE_STATE_MAX]; struct cpuidle_driver_kobj *kobj_driver; From c1d9552b16f62bd4683df89905bb61e14d0b6f20 Mon Sep 17 00:00:00 2001 From: Thomas Schlichter Date: Tue, 31 Mar 2015 20:24:39 +0200 Subject: [PATCH 210/236] cpuidle: ACPI: do not overwrite name and description of C0 commit c7e8bdf5872c5a8f5a6494e16fe839c38a0d3d3d upstream. Fix a bug that leads to showing the name and description of C-state C0 as "" in sysfs after the ACPI C-states changed (e.g. after AC->DC or DC->AC transition). The function poll_idle_init() in drivers/cpuidle/driver.c initializes the state 0 during cpuidle_register_driver(), so we better do not overwrite it again with '\0' during acpi_processor_cst_has_changed(). Signed-off-by: Thomas Schlichter Reviewed-by: Bartlomiej Zolnierkiewicz Signed-off-by: Rafael J. Wysocki Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/processor_idle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 17f9ec501972..fd8496a92b45 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -962,7 +962,7 @@ static int acpi_processor_setup_cpuidle_states(struct acpi_processor *pr) return -EINVAL; drv->safe_state_index = -1; - for (i = 0; i < CPUIDLE_STATE_MAX; i++) { + for (i = CPUIDLE_DRIVER_STATE_START; i < CPUIDLE_STATE_MAX; i++) { drv->states[i].name[0] = '\0'; drv->states[i].desc[0] = '\0'; } From adedb712076cdd762ddcf1f97f94d866bb3cc4a7 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 23 Mar 2015 18:27:41 +0200 Subject: [PATCH 211/236] usb: xhci: handle Config Error Change (CEC) in xhci driver commit 9425183d177aa4a2f09d01a74925124f0778b595 upstream. Linux xHCI driver doesn't report and handle port cofig error change. If Port Configure Error for root hub port occurs, CEC bit in PORTSC would be set by xHC and remains 1. This happends when the root port fails to configure its link partner, e.g. the port fails to exchange port capabilities information using Port Capability LMPs. Then the Port Status Change Events will be blocked until all status change bits(CEC is one of the change bits) are cleared('0') (refer to xHCI spec 4.19.2). Otherwise, the port status change event for this root port will not be generated anymore, then root port would look like dead for user and can't be recovered until a Host Controller Reset(HCRST). This patch is to check CEC bit in PORTSC in xhci_get_port_status() and set a Config Error in the return status if CEC is set. This will cause a ClearPortFeature request, where CEC bit is cleared in xhci_clear_port_change_bit(). [The commit log is based on initial Marvell patch posted at http://marc.info/?l=linux-kernel&m=142323612321434&w=2] Reported-by: Gregory CLEMENT Signed-off-by: Lu Baolu Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-hub.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c index 93fe089cd51a..b9e16abb0fab 100644 --- a/drivers/usb/host/xhci-hub.c +++ b/drivers/usb/host/xhci-hub.c @@ -383,6 +383,10 @@ static void xhci_clear_port_change_bit(struct xhci_hcd *xhci, u16 wValue, status = PORT_PLC; port_change_bit = "link state"; break; + case USB_PORT_FEAT_C_PORT_CONFIG_ERROR: + status = PORT_CEC; + port_change_bit = "config error"; + break; default: /* Should never happen */ return; @@ -584,6 +588,8 @@ static u32 xhci_get_port_status(struct usb_hcd *hcd, status |= USB_PORT_STAT_C_LINK_STATE << 16; if ((raw_port_status & PORT_WRC)) status |= USB_PORT_STAT_C_BH_RESET << 16; + if ((raw_port_status & PORT_CEC)) + status |= USB_PORT_STAT_C_CONFIG_ERROR << 16; } if (hcd->speed != HCD_USB3) { @@ -999,6 +1005,7 @@ int xhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, case USB_PORT_FEAT_C_OVER_CURRENT: case USB_PORT_FEAT_C_ENABLE: case USB_PORT_FEAT_C_PORT_LINK_STATE: + case USB_PORT_FEAT_C_PORT_CONFIG_ERROR: xhci_clear_port_change_bit(xhci, wValue, wIndex, port_array[wIndex], temp); break; @@ -1063,7 +1070,7 @@ int xhci_hub_status_data(struct usb_hcd *hcd, char *buf) */ status = bus_state->resuming_ports; - mask = PORT_CSC | PORT_PEC | PORT_OCC | PORT_PLC | PORT_WRC; + mask = PORT_CSC | PORT_PEC | PORT_OCC | PORT_PLC | PORT_WRC | PORT_CEC; spin_lock_irqsave(&xhci->lock, flags); /* For each port, did anything change? If so, set that bit in buf. */ From c1f3c3bf135c3f43c4a0db789645e9b0886241fc Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 23 Mar 2015 18:27:42 +0200 Subject: [PATCH 212/236] usb: xhci: apply XHCI_AVOID_BEI quirk to all Intel xHCI controllers commit 227a4fd801c8a9fa2c4700ab98ec1aec06e3b44d upstream. When a device with an isochronous endpoint is plugged into the Intel xHCI host controller, and the driver submits multiple frames per URB, the xHCI driver will set the Block Event Interrupt (BEI) flag on all but the last TD for the URB. This causes the host controller to place an event on the event ring, but not send an interrupt. When the last TD for the URB completes, BEI is cleared, and we get an interrupt for the whole URB. However, under Intel xHCI host controllers, if the event ring is full of events from transfers with BEI set, an "Event Ring is Full" event will be posted to the last entry of the event ring, but no interrupt is generated. Host will cease all transfer and command executions and wait until software completes handling the pending events in the event ring. That means xHC stops, but event of "event ring is full" is not notified. As the result, the xHC looks like dead to user. This patch is to apply XHCI_AVOID_BEI quirk to Intel xHC devices. And it should be backported to kernels as old as 3.0, that contains the commit 69e848c2090a ("Intel xhci: Support EHCI/xHCI port switching."). Signed-off-by: Lu Baolu Tested-by: Alistair Grant Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c index 73c43e5e231b..eb3399f4c1ed 100644 --- a/drivers/usb/host/xhci-pci.c +++ b/drivers/usb/host/xhci-pci.c @@ -108,6 +108,7 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci) if (pdev->vendor == PCI_VENDOR_ID_INTEL) { xhci->quirks |= XHCI_LPM_SUPPORT; xhci->quirks |= XHCI_INTEL_HOST; + xhci->quirks |= XHCI_AVOID_BEI; } if (pdev->vendor == PCI_VENDOR_ID_INTEL && pdev->device == PCI_DEVICE_ID_INTEL_PANTHERPOINT_XHCI) { @@ -123,7 +124,6 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci) * PPT chipsets. */ xhci->quirks |= XHCI_SPURIOUS_REBOOT; - xhci->quirks |= XHCI_AVOID_BEI; } if (pdev->vendor == PCI_VENDOR_ID_INTEL && (pdev->device == PCI_DEVICE_ID_INTEL_LYNXPOINT_XHCI || From 9185aae67a63b81187f28225137595208895b5c1 Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Fri, 13 Mar 2015 14:51:51 +0100 Subject: [PATCH 213/236] tty: serial: fsl_lpuart: clear receive flag on FIFO flush commit 8e4934c6d6c659e22b1b746af4196683e77ce6ca upstream. When the receiver was enabled during startup, a character could have been in the FIFO when the UART get initially used. The driver configures the (receive) watermark level, and flushes the FIFO. However, the receive flag (RDRF) could still be set at that stage (as mentioned in the register description of UARTx_RWFIFO). This leads to an interrupt which won't be handled properly in interrupt mode: The receive interrupt function lpuart_rxint checks the FIFO count, which is 0 at that point (due to the flush during initialization). The problem does not manifest when using DMA to receive characters. Fix this situation by explicitly read the status register, which leads to clearing of the RDRF flag. Due to the flush just after the status flag read, a explicit data read is not to required. Signed-off-by: Stefan Agner Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/fsl_lpuart.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/tty/serial/fsl_lpuart.c b/drivers/tty/serial/fsl_lpuart.c index 175f123f4f09..501c465feb59 100644 --- a/drivers/tty/serial/fsl_lpuart.c +++ b/drivers/tty/serial/fsl_lpuart.c @@ -362,6 +362,9 @@ static void lpuart_setup_watermark(struct lpuart_port *sport) writeb(val | UARTPFIFO_TXFE | UARTPFIFO_RXFE, sport->port.membase + UARTPFIFO); + /* explicitly clear RDRF */ + readb(sport->port.membase + UARTSR1); + /* flush Tx and Rx FIFO */ writeb(UARTCFIFO_TXFLUSH | UARTCFIFO_RXFLUSH, sport->port.membase + UARTCFIFO); From 4a673d7acafe13f7ff23dd292ff185623517bf2c Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Fri, 16 Jan 2015 15:05:39 -0500 Subject: [PATCH 214/236] n_tty: Fix read buffer overwrite when no newline commit fb5ef9e7da39968fec6d6f37f20a23d23740c75e upstream. In canon mode, the read buffer head will advance over the buffer tail if the input > 4095 bytes without receiving a line termination char. Discard additional input until a line termination is received. Before evaluating for overflow, the 'room' value is normalized for I_PARMRK and 1 byte is reserved for line termination (even in !icanon mode, in case the mode is switched). The following table shows the transform: actual buffer | 'room' value before overflow calc space avail | !I_PARMRK | I_PARMRK -------------------------------------------------- 0 | -1 | -1 1 | 0 | 0 2 | 1 | 0 3 | 2 | 0 4+ | 3 | 1 When !icanon or when icanon and the read buffer contains newlines, normalized 'room' values of -1 and 0 are clamped to 0, and 'overflow' is 0, so read_head is not adjusted and the input i/o loop exits (setting no_room if called from flush_to_ldisc()). No input is discarded since the reader does have input available to read which ensures forward progress. When icanon and the read buffer does not contain newlines and the normalized 'room' value is 0, then overflow and room are reset to 1, so that the i/o loop will process the next input char normally (except for parity errors which are ignored). Thus, erasures, signalling chars, 7-bit mode, etc. will continue to be handled properly. If the input char processed was not a line termination char, then the canon_head index will not have advanced, so the normalized 'room' value will now be -1 and 'overflow' will be set, which indicates the read_head can safely be reset, effectively erasing the last char processed. If the input char processed was a line termination, then the canon_head index will have advanced, so 'overflow' is cleared to 0, the read_head is not reset, and 'room' is cleared to 0, which exits the i/o loop (because the reader now have input available to read which ensures forward progress). Note that it is possible for a line termination to be received, and for the reader to copy the line to the user buffer before the input i/o loop is ready to process the next input char. This is why the i/o loop recomputes the room/overflow state with every input char while handling overflow. Finally, if the input data was processed without receiving a line termination (so that overflow is still set), the pty driver must receive a write wakeup. A pty writer may be waiting to write more data in n_tty_write() but without unthrottling here that wakeup will not arrive, and forward progress will halt. (Normally, the pty writer is woken when the reader reads data out of the buffer and more space become available). Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman (backported from commit fb5ef9e7da39968fec6d6f37f20a23d23740c75e) Signed-off-by: Joseph Salisbury --- drivers/tty/n_tty.c | 106 +++++++++++++++++++++++++++++++++----------- 1 file changed, 79 insertions(+), 27 deletions(-) diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c index 850e232d086e..8ab46ad40f28 100644 --- a/drivers/tty/n_tty.c +++ b/drivers/tty/n_tty.c @@ -247,8 +247,6 @@ static void n_tty_write_wakeup(struct tty_struct *tty) static void n_tty_check_throttle(struct tty_struct *tty) { - if (tty->driver->type == TTY_DRIVER_TYPE_PTY) - return; /* * Check the remaining room for the input canonicalization * mode. We don't want to throttle the driver if we're in @@ -1512,23 +1510,6 @@ n_tty_receive_char_lnext(struct tty_struct *tty, unsigned char c, char flag) n_tty_receive_char_flagged(tty, c, flag); } -/** - * n_tty_receive_buf - data receive - * @tty: terminal device - * @cp: buffer - * @fp: flag buffer - * @count: characters - * - * Called by the terminal driver when a block of characters has - * been received. This function must be called from soft contexts - * not from interrupt context. The driver is responsible for making - * calls one at a time and in order (or using flush_to_ldisc) - * - * n_tty_receive_buf()/producer path: - * claims non-exclusive termios_rwsem - * publishes read_head and canon_head - */ - static void n_tty_receive_buf_real_raw(struct tty_struct *tty, const unsigned char *cp, char *fp, int count) @@ -1684,24 +1665,85 @@ static void __receive_buf(struct tty_struct *tty, const unsigned char *cp, } } +/** + * n_tty_receive_buf_common - process input + * @tty: device to receive input + * @cp: input chars + * @fp: flags for each char (if NULL, all chars are TTY_NORMAL) + * @count: number of input chars in @cp + * + * Called by the terminal driver when a block of characters has + * been received. This function must be called from soft contexts + * not from interrupt context. The driver is responsible for making + * calls one at a time and in order (or using flush_to_ldisc) + * + * Returns the # of input chars from @cp which were processed. + * + * In canonical mode, the maximum line length is 4096 chars (including + * the line termination char); lines longer than 4096 chars are + * truncated. After 4095 chars, input data is still processed but + * not stored. Overflow processing ensures the tty can always + * receive more input until at least one line can be read. + * + * In non-canonical mode, the read buffer will only accept 4095 chars; + * this provides the necessary space for a newline char if the input + * mode is switched to canonical. + * + * Note it is possible for the read buffer to _contain_ 4096 chars + * in non-canonical mode: the read buffer could already contain the + * maximum canon line of 4096 chars when the mode is switched to + * non-canonical. + * + * n_tty_receive_buf()/producer path: + * claims non-exclusive termios_rwsem + * publishes commit_head or canon_head + */ static int n_tty_receive_buf_common(struct tty_struct *tty, const unsigned char *cp, char *fp, int count, int flow) { struct n_tty_data *ldata = tty->disc_data; - int room, n, rcvd = 0; + int room, n, rcvd = 0, overflow; down_read(&tty->termios_rwsem); while (1) { - room = receive_room(tty); + /* + * When PARMRK is set, each input char may take up to 3 chars + * in the read buf; reduce the buffer space avail by 3x + * + * If we are doing input canonicalization, and there are no + * pending newlines, let characters through without limit, so + * that erase characters will be handled. Other excess + * characters will be beeped. + * + * paired with store in *_copy_from_read_buf() -- guarantees + * the consumer has loaded the data in read_buf up to the new + * read_tail (so this producer will not overwrite unread data) + */ + size_t tail = ldata->read_tail; + + room = N_TTY_BUF_SIZE - (ldata->read_head - tail); + if (I_PARMRK(tty)) + room = (room + 2) / 3; + room--; + if (room <= 0) { + overflow = ldata->icanon && ldata->canon_head == tail; + if (overflow && room < 0) + ldata->read_head--; + room = overflow; + ldata->no_room = flow && !room; + } else + overflow = 0; + n = min(count, room); - if (!n) { - if (flow && !room) - ldata->no_room = 1; + if (!n) break; - } - __receive_buf(tty, cp, fp, n); + + /* ignore parity errors if handling overflow */ + if (!overflow || !fp || *fp != TTY_PARITY) + __receive_buf(tty, cp, fp, n); + cp += n; if (fp) fp += n; @@ -1710,7 +1752,17 @@ n_tty_receive_buf_common(struct tty_struct *tty, const unsigned char *cp, } tty->receive_room = room; - n_tty_check_throttle(tty); + + /* Unthrottle if handling overflow on pty */ + if (tty->driver->type == TTY_DRIVER_TYPE_PTY) { + if (overflow) { + tty_set_flow_change(tty, TTY_UNTHROTTLE_SAFE); + tty_unthrottle_safe(tty); + __tty_set_flow_change(tty, 0); + } + } else + n_tty_check_throttle(tty); + up_read(&tty->termios_rwsem); return rcvd; From 8b397872340adb46737dc2002ade846834d32cb1 Mon Sep 17 00:00:00 2001 From: Sachin Prabhu Date: Wed, 4 Feb 2015 13:10:26 +0000 Subject: [PATCH 215/236] cifs: smb2_clone_range() - exit on unhandled error commit 2477bc58d49edb1c0baf59df7dc093dce682af2b upstream. While attempting to clone a file on a samba server, we receive a STATUS_INVALID_DEVICE_REQUEST. This is mapped to -EOPNOTSUPP which isn't handled in smb2_clone_range(). We end up looping in the while loop making same call to the samba server over and over again. The proposed fix is to exit and return the error value when encountered with an unhandled error. Signed-off-by: Sachin Prabhu Signed-off-by: Steve French Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/smb2ops.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 34a17d425be6..30f3eb5bc022 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -630,7 +630,8 @@ smb2_clone_range(const unsigned int xid, /* No need to change MaxChunks since already set to 1 */ chunk_sizes_updated = true; - } + } else + goto cchunk_out; } cchunk_out: From db6312ab779380004590b29612ce84af981ca374 Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Fri, 13 Mar 2015 14:20:29 +0100 Subject: [PATCH 216/236] cifs: fix use-after-free bug in find_writable_file commit e1e9bda22d7ddf88515e8fe401887e313922823e upstream. Under intermittent network outages, find_writable_file() is susceptible to the following race condition, which results in a user-after-free in the cifs_writepages code-path: Thread 1 Thread 2 ======== ======== inv_file = NULL refind = 0 spin_lock(&cifs_file_list_lock) // invalidHandle found on openFileList inv_file = open_file // inv_file->count currently 1 cifsFileInfo_get(inv_file) // inv_file->count = 2 spin_unlock(&cifs_file_list_lock); cifs_reopen_file() cifs_close() // fails (rc != 0) ->cifsFileInfo_put() spin_lock(&cifs_file_list_lock) // inv_file->count = 1 spin_unlock(&cifs_file_list_lock) spin_lock(&cifs_file_list_lock); list_move_tail(&inv_file->flist, &cifs_inode->openFileList); spin_unlock(&cifs_file_list_lock); cifsFileInfo_put(inv_file); ->spin_lock(&cifs_file_list_lock) // inv_file->count = 0 list_del(&cifs_file->flist); // cleanup!! kfree(cifs_file); spin_unlock(&cifs_file_list_lock); spin_lock(&cifs_file_list_lock); ++refind; // refind = 1 goto refind_writable; At this point we loop back through with an invalid inv_file pointer and a refind value of 1. On second pass, inv_file is not overwritten on openFileList traversal, and is subsequently dereferenced. Signed-off-by: David Disseldorp Reviewed-by: Jeff Layton Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 0218a9b23b38..40ddb6e93912 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1821,6 +1821,7 @@ refind_writable: cifsFileInfo_put(inv_file); spin_lock(&cifs_file_list_lock); ++refind; + inv_file = NULL; goto refind_writable; } } From c9fda3f26cd849651cd4592d910a211979e3bff6 Mon Sep 17 00:00:00 2001 From: John Soni Jose Date: Thu, 12 Feb 2015 06:45:47 +0530 Subject: [PATCH 217/236] be2iscsi: Fix kernel panic when device initialization fails commit 2e7cee027b26cbe7e6685a7a14bd2850bfe55d33 upstream. Kernel panic was happening as iscsi_host_remove() was called on a host which was not yet added. Signed-off-by: John Soni Jose Reviewed-by: Mike Christie Signed-off-by: James Bottomley Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/be2iscsi/be_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/be2iscsi/be_main.c b/drivers/scsi/be2iscsi/be_main.c index 953bd0bfdf0d..19ddd43a00cf 100644 --- a/drivers/scsi/be2iscsi/be_main.c +++ b/drivers/scsi/be2iscsi/be_main.c @@ -5684,9 +5684,9 @@ free_port: hba_free: if (phba->msix_enabled) pci_disable_msix(phba->pcidev); - iscsi_host_remove(phba->shost); pci_dev_put(phba->pcidev); iscsi_host_free(phba->shost); + pci_set_drvdata(pcidev, NULL); disable_pci: pci_disable_device(pcidev); return ret; From c82bd1171466bac9d9ab5c33896008bbdaf0aeb1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 8 Apr 2015 17:00:32 -0400 Subject: [PATCH 218/236] ocfs2: _really_ sync the right range commit 64b4e2526d1cf6e6a4db6213d6e2b6e6ab59479a upstream. "ocfs2 syncs the wrong range" had been broken; prior to it the code was doing the wrong thing in case of O_APPEND, all right, but _after_ it we were syncing the wrong range in 100% cases. *ppos, aka iocb->ki_pos is incremented prior to that point, so we are always doing sync on the area _after_ the one we'd written to. Spotted by Joseph Qi back in January; unfortunately, I'd missed his mail back then ;-/ Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/ocfs2/file.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 51632c40e896..7fe30f655aa5 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2391,10 +2391,14 @@ out_dio: /* buffered aio wouldn't have proper lock coverage today */ BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); + if (unlikely(written <= 0)) + goto no_sync; + if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) || ((file->f_flags & O_DIRECT) && !direct_io)) { - ret = filemap_fdatawrite_range(file->f_mapping, *ppos, - *ppos + count - 1); + ret = filemap_fdatawrite_range(file->f_mapping, + iocb->ki_pos - written, + iocb->ki_pos - 1); if (ret < 0) written = ret; @@ -2407,10 +2411,12 @@ out_dio: } if (!ret) - ret = filemap_fdatawait_range(file->f_mapping, *ppos, - *ppos + count - 1); + ret = filemap_fdatawait_range(file->f_mapping, + iocb->ki_pos - written, + iocb->ki_pos - 1); } +no_sync: /* * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io * function pointer which is called when o_direct io completes so that From 6642619b3a74771f7ace0f0a653dde628815eaeb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 6 Apr 2015 17:57:44 -0400 Subject: [PATCH 219/236] ioctx_alloc(): fix vma (and file) leak on failure commit deeb8525f9bcea60f5e86521880c1161de7a5829 upstream. If we fail past the aio_setup_ring(), we need to destroy the mapping. We don't need to care about anybody having found ctx, or added requests to it, since the last failure exit is exactly the failure to make ctx visible to lookups. Reproducer (based on one by Joe Mario ): void count(char *p) { char s[80]; printf("%s: ", p); fflush(stdout); sprintf(s, "/bin/cat /proc/%d/maps|/bin/fgrep -c '/[aio] (deleted)'", getpid()); system(s); } int main() { io_context_t *ctx; int created, limit, i, destroyed; FILE *f; count("before"); if ((f = fopen("/proc/sys/fs/aio-max-nr", "r")) == NULL) perror("opening aio-max-nr"); else if (fscanf(f, "%d", &limit) != 1) fprintf(stderr, "can't parse aio-max-nr\n"); else if ((ctx = calloc(limit, sizeof(io_context_t))) == NULL) perror("allocating aio_context_t array"); else { for (i = 0, created = 0; i < limit; i++) { if (io_setup(1000, ctx + created) == 0) created++; } for (i = 0, destroyed = 0; i < created; i++) if (io_destroy(ctx[i]) == 0) destroyed++; printf("created %d, failed %d, destroyed %d\n", created, limit - created, destroyed); count("after"); } } Found-by: Joe Mario Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/aio.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/aio.c b/fs/aio.c index 2f7e8c2e3e76..3241659491b1 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -719,6 +719,9 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) err_cleanup: aio_nr_sub(ctx->max_reqs); err_ctx: + atomic_set(&ctx->dead, 1); + if (ctx->mmap_size) + vm_munmap(ctx->mmap_base, ctx->mmap_size); aio_free_ring(ctx); err: mutex_unlock(&ctx->ring_lock); From 07d3fa2cdb860b7b17a7d8ce1c2d53841c91aa71 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Fri, 10 Apr 2015 02:47:27 -0500 Subject: [PATCH 220/236] iscsi target: fix oops when adding reject pdu commit b815fc12d4dd2b5586184fb4f867caff05a810d4 upstream. This fixes a oops due to a double list add when adding a reject PDU for iscsit_allocate_iovecs allocation failures. The cmd has already been added to the conn_cmd_list in iscsit_setup_scsi_cmd, so this has us call iscsit_reject_cmd. Note that for ERL0 the reject PDU is not actually sent, so this patch is not completely tested. Just verified we do not oops. The problem is the add reject functions return -1 which is returned all the way up to iscsi_target_rx_thread which for ERL0 will drop the connection. Signed-off-by: Mike Christie Signed-off-by: Nicholas Bellinger Signed-off-by: Greg Kroah-Hartman --- drivers/target/iscsi/iscsi_target.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c index e168a63e77ea..b61c555a5a8f 100644 --- a/drivers/target/iscsi/iscsi_target.c +++ b/drivers/target/iscsi/iscsi_target.c @@ -1165,7 +1165,7 @@ iscsit_handle_scsi_cmd(struct iscsi_conn *conn, struct iscsi_cmd *cmd, * traditional iSCSI block I/O. */ if (iscsit_allocate_iovecs(cmd) < 0) { - return iscsit_add_reject_cmd(cmd, + return iscsit_reject_cmd(cmd, ISCSI_REASON_BOOKMARK_NO_RESOURCES, buf); } immed_data = cmd->immediate_data; From 60b9edff89a168839693a793d5847b01f3d2aefe Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Wed, 10 Dec 2014 12:35:34 -0300 Subject: [PATCH 221/236] sh_veu: v4l2_dev wasn't set commit ab3120300be067a2d41a027c41db0b2c662ab200 upstream. The v4l2_dev field of struct video_device must be set correctly. This was never done for this driver, so no video nodes were created anymore. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/platform/sh_veu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/media/platform/sh_veu.c b/drivers/media/platform/sh_veu.c index 744e43b480bc..f698e322a1cd 100644 --- a/drivers/media/platform/sh_veu.c +++ b/drivers/media/platform/sh_veu.c @@ -1183,6 +1183,7 @@ static int sh_veu_probe(struct platform_device *pdev) } *vdev = sh_veu_videodev; + vdev->v4l2_dev = &veu->v4l2_dev; spin_lock_init(&veu->lock); mutex_init(&veu->fop_lock); vdev->lock = &veu->fop_lock; From 3f9fe6e3fbdf1d3c6ffc874ec93f1eb59eaaee77 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 4 Mar 2015 05:55:21 -0800 Subject: [PATCH 222/236] media: s5p-mfc: fix mmap support for 64bit arch commit 05b676ab42f624425d5f6519276e506b812fa058 upstream. TASK_SIZE is depends on the systems architecture (32 or 64 bits) and it should not be used for defining offset boundary for mmaping buffers for CAPTURE and OUTPUT queues. This patch fixes support for MMAP calls on the CAPTURE queue on 64bit architectures (like ARM64). Signed-off-by: Marek Szyprowski Signed-off-by: Kamil Debski Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/platform/s5p-mfc/s5p_mfc_common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/platform/s5p-mfc/s5p_mfc_common.h b/drivers/media/platform/s5p-mfc/s5p_mfc_common.h index f723f1f2f578..ab851278d9d0 100644 --- a/drivers/media/platform/s5p-mfc/s5p_mfc_common.h +++ b/drivers/media/platform/s5p-mfc/s5p_mfc_common.h @@ -30,7 +30,7 @@ /* Offset base used to differentiate between CAPTURE and OUTPUT * while mmaping */ -#define DST_QUEUE_OFF_BASE (TASK_SIZE / 2) +#define DST_QUEUE_OFF_BASE (1 << 30) #define MFC_BANK1_ALLOC_CTX 0 #define MFC_BANK2_ALLOC_CTX 1 From 2a9eb844a4c126b7e975a086f99f35699e88c011 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Tue, 3 Feb 2015 08:55:58 -0500 Subject: [PATCH 223/236] net: rds: use correct size for max unacked packets and bytes commit db27ebb111e9f69efece08e4cb6a34ff980f8896 upstream. Max unacked packets/bytes is an int while sizeof(long) was used in the sysctl table. This means that when they were getting read we'd also leak kernel memory to userspace along with the timeout values. Signed-off-by: Sasha Levin Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/rds/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c index b5cb2aa08f33..35773ad6d23d 100644 --- a/net/rds/sysctl.c +++ b/net/rds/sysctl.c @@ -71,14 +71,14 @@ static struct ctl_table rds_sysctl_rds_table[] = { { .procname = "max_unacked_packets", .data = &rds_sysctl_max_unacked_packets, - .maxlen = sizeof(unsigned long), + .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "max_unacked_bytes", .data = &rds_sysctl_max_unacked_bytes, - .maxlen = sizeof(unsigned long), + .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, From ecca64226ce2960280921e09ae33e90f82b5c408 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Fri, 23 Jan 2015 20:47:00 -0500 Subject: [PATCH 224/236] net: llc: use correct size for sysctl timeout entries commit 6b8d9117ccb4f81b1244aafa7bc70ef8fa45fc49 upstream. The timeout entries are sizeof(int) rather than sizeof(long), which means that when they were getting read we'd also leak kernel memory to userspace along with the timeout values. Signed-off-by: Sasha Levin Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/llc/sysctl_net_llc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/llc/sysctl_net_llc.c b/net/llc/sysctl_net_llc.c index 612a5ddaf93b..799bafc2af39 100644 --- a/net/llc/sysctl_net_llc.c +++ b/net/llc/sysctl_net_llc.c @@ -18,28 +18,28 @@ static struct ctl_table llc2_timeout_table[] = { { .procname = "ack", .data = &sysctl_llc2_ack_timeout, - .maxlen = sizeof(long), + .maxlen = sizeof(sysctl_llc2_ack_timeout), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "busy", .data = &sysctl_llc2_busy_timeout, - .maxlen = sizeof(long), + .maxlen = sizeof(sysctl_llc2_busy_timeout), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "p", .data = &sysctl_llc2_p_timeout, - .maxlen = sizeof(long), + .maxlen = sizeof(sysctl_llc2_p_timeout), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "rej", .data = &sysctl_llc2_rej_timeout, - .maxlen = sizeof(long), + .maxlen = sizeof(sysctl_llc2_rej_timeout), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, From 01c343844ceb1d4a1dc25763b633f73e05fe93e7 Mon Sep 17 00:00:00 2001 From: Majd Dibbiny Date: Wed, 18 Mar 2015 16:51:37 +0200 Subject: [PATCH 225/236] IB/mlx4: Saturate RoCE port PMA counters in case of overflow commit 61a3855bb726cbb062ef02a31a832dea455456e0 upstream. For RoCE ports, we set the u32 PMA values based on u64 HCA counters. In case of overflow, according to the IB spec, we have to saturate a counter to its max value, do that. Fixes: c37791349cc7 ('IB/mlx4: Support PMA counters for IBoE') Signed-off-by: Majd Dibbiny Signed-off-by: Eran Ben Elisha Signed-off-by: Hadar Hen Zion Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/hw/mlx4/mad.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index f2a3f48107e7..2592ab5f21b1 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -64,6 +64,14 @@ enum { #define GUID_TBL_BLK_NUM_ENTRIES 8 #define GUID_TBL_BLK_SIZE (GUID_TBL_ENTRY_SIZE * GUID_TBL_BLK_NUM_ENTRIES) +/* Counters should be saturate once they reach their maximum value */ +#define ASSIGN_32BIT_COUNTER(counter, value) do {\ + if ((value) > U32_MAX) \ + counter = cpu_to_be32(U32_MAX); \ + else \ + counter = cpu_to_be32(value); \ +} while (0) + struct mlx4_mad_rcv_buf { struct ib_grh grh; u8 payload[256]; @@ -730,10 +738,14 @@ static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, static void edit_counter(struct mlx4_counter *cnt, struct ib_pma_portcounters *pma_cnt) { - pma_cnt->port_xmit_data = cpu_to_be32((be64_to_cpu(cnt->tx_bytes)>>2)); - pma_cnt->port_rcv_data = cpu_to_be32((be64_to_cpu(cnt->rx_bytes)>>2)); - pma_cnt->port_xmit_packets = cpu_to_be32(be64_to_cpu(cnt->tx_frames)); - pma_cnt->port_rcv_packets = cpu_to_be32(be64_to_cpu(cnt->rx_frames)); + ASSIGN_32BIT_COUNTER(pma_cnt->port_xmit_data, + (be64_to_cpu(cnt->tx_bytes) >> 2)); + ASSIGN_32BIT_COUNTER(pma_cnt->port_rcv_data, + (be64_to_cpu(cnt->rx_bytes) >> 2)); + ASSIGN_32BIT_COUNTER(pma_cnt->port_xmit_packets, + be64_to_cpu(cnt->tx_frames)); + ASSIGN_32BIT_COUNTER(pma_cnt->port_rcv_packets, + be64_to_cpu(cnt->rx_frames)); } static int iboe_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, From 5c43c539346972b9a1d5f38dc62db5a39997488d Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 19 Apr 2015 10:11:58 +0200 Subject: [PATCH 226/236] Linux 3.14.39 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index f09e19d2cc4f..b40845e11b84 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 3 PATCHLEVEL = 14 -SUBLEVEL = 38 +SUBLEVEL = 39 EXTRAVERSION = NAME = Remembering Coco From 77dfdd746390b9230ac5420f13dd363bcdac6e98 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 10 Oct 2013 17:10:30 +0400 Subject: [PATCH 227/236] fuse: Connection bit for enabling writeback Off (0) by default. Will be used in the next patches and will be turned on at the very end. Signed-off-by: Maxim Patlasov Signed-off-by: Pavel Emelyanov Signed-off-by: Miklos Szeredi (cherry picked from commit d5cd66c58edf10a7ee786659994595fd43995aab) Signed-off-by: Alex Shi --- fs/fuse/fuse_i.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 2da5db2c8bdb..374a8be014fd 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -480,6 +480,9 @@ struct fuse_conn { /** Set if bdi is valid */ unsigned bdi_initialized:1; + /** write-back cache policy (default is write-through) */ + unsigned writeback_cache:1; + /* * The following bitfields are only for optimization purposes * and hence races in setting them will not cause malfunction From 4a40d2a3f50fd5721e3c8896e077d3f62f5b73be Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 10 Oct 2013 17:10:04 +0400 Subject: [PATCH 228/236] fuse: Linking file to inode helper When writeback is ON every writeable file should be in per-inode write list, not only mmap-ed ones. Thus introduce a helper for this linkage. Signed-off-by: Maxim Patlasov Signed-off-by: Pavel Emelyanov Signed-off-by: Miklos Szeredi (cherry picked from commit 650b22b941fa03590c4a3671e79ec2c96ea59e9a) Signed-off-by: Alex Shi --- fs/fuse/file.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index d8a60270581c..f0183869252f 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -188,6 +188,22 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, } EXPORT_SYMBOL_GPL(fuse_do_open); +static void fuse_link_write_file(struct file *file) +{ + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_file *ff = file->private_data; + /* + * file may be written through mmap, so chain it onto the + * inodes's write_file list + */ + spin_lock(&fc->lock); + if (list_empty(&ff->write_entry)) + list_add(&ff->write_entry, &fi->write_files); + spin_unlock(&fc->lock); +} + void fuse_finish_open(struct inode *inode, struct file *file) { struct fuse_file *ff = file->private_data; @@ -1942,20 +1958,9 @@ static const struct vm_operations_struct fuse_file_vm_ops = { static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) { - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) { - struct inode *inode = file_inode(file); - struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_inode *fi = get_fuse_inode(inode); - struct fuse_file *ff = file->private_data; - /* - * file may be written through mmap, so chain it onto the - * inodes's write_file list - */ - spin_lock(&fc->lock); - if (list_empty(&ff->write_entry)) - list_add(&ff->write_entry, &fi->write_files); - spin_unlock(&fc->lock); - } + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) + fuse_link_write_file(file); + file_accessed(file); vma->vm_ops = &fuse_file_vm_ops; return 0; From fb2667e92663bb09a3d7e79de4103b7ce296b589 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 10 Oct 2013 17:10:16 +0400 Subject: [PATCH 229/236] fuse: Prepare to handle short reads A helper which gets called when read reports less bytes than was requested. See patch "trust kernel i_size only" for details. Signed-off-by: Maxim Patlasov Signed-off-by: Pavel Emelyanov Signed-off-by: Miklos Szeredi (cherry picked from commit a92adc824ed5feaa2d4f7029f21170f574987aee) Signed-off-by: Alex Shi --- fs/fuse/file.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f0183869252f..4646157eb98c 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -671,6 +671,15 @@ static void fuse_read_update_size(struct inode *inode, loff_t size, spin_unlock(&fc->lock); } +static void fuse_short_read(struct fuse_req *req, struct inode *inode, + u64 attr_ver) +{ + size_t num_read = req->out.args[0].size; + + loff_t pos = page_offset(req->pages[0]) + num_read; + fuse_read_update_size(inode, pos, attr_ver); +} + static int fuse_readpage(struct file *file, struct page *page) { struct fuse_io_priv io = { .async = 0, .file = file }; @@ -708,18 +717,18 @@ static int fuse_readpage(struct file *file, struct page *page) req->page_descs[0].length = count; num_read = fuse_send_read(req, &io, pos, count, NULL); err = req->out.h.error; - fuse_put_request(fc, req); if (!err) { /* * Short read means EOF. If file size is larger, truncate it */ if (num_read < count) - fuse_read_update_size(inode, pos + num_read, attr_ver); + fuse_short_read(req, inode, attr_ver); SetPageUptodate(page); } + fuse_put_request(fc, req); fuse_invalidate_atime(inode); out: unlock_page(page); @@ -742,13 +751,9 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) /* * Short read means EOF. If file size is larger, truncate it */ - if (!req->out.h.error && num_read < count) { - loff_t pos; + if (!req->out.h.error && num_read < count) + fuse_short_read(req, inode, req->misc.read.attr_ver); - pos = page_offset(req->pages[0]) + num_read; - fuse_read_update_size(inode, pos, - req->misc.read.attr_ver); - } fuse_invalidate_atime(inode); } From 628bfe8bf63df0e83632598115cb91c7c52a115a Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 10 Oct 2013 17:10:46 +0400 Subject: [PATCH 230/236] fuse: Trust kernel i_size only Make fuse think that when writeback is on the inode's i_size is always up-to-date and not update it with the value received from the userspace. This is done because the page cache code may update i_size without letting the FS know. This assumption implies fixing the previously introduced short-read helper -- when a short read occurs the 'hole' is filled with zeroes. fuse_file_fallocate() is also fixed because now we should keep i_size up to date, so it must be updated if FUSE_FALLOCATE request succeeded. Signed-off-by: Maxim V. Patlasov Signed-off-by: Miklos Szeredi (cherry picked from commit 8373200b124d03de7fa2e99be56de8642e604e9e) Signed-off-by: Alex Shi --- fs/fuse/dir.c | 13 +++++++++++-- fs/fuse/file.c | 21 +++++++++++++++++++-- fs/fuse/inode.c | 11 +++++++++-- 3 files changed, 39 insertions(+), 6 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index d53a7d91eb07..6fc83391e2e3 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -870,6 +870,11 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, struct kstat *stat) { unsigned int blkbits; + struct fuse_conn *fc = get_fuse_conn(inode); + + /* see the comment in fuse_change_attributes() */ + if (fc->writeback_cache && S_ISREG(inode->i_mode)) + attr->size = i_size_read(inode); stat->dev = inode->i_sb->s_dev; stat->ino = attr->ino; @@ -1611,6 +1616,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, struct fuse_setattr_in inarg; struct fuse_attr_out outarg; bool is_truncate = false; + bool is_wb = fc->writeback_cache; loff_t oldsize; int err; @@ -1682,7 +1688,9 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, fuse_change_attributes_common(inode, &outarg.attr, attr_timeout(&outarg)); oldsize = inode->i_size; - i_size_write(inode, outarg.attr.size); + /* see the comment in fuse_change_attributes() */ + if (!is_wb || is_truncate || !S_ISREG(inode->i_mode)) + i_size_write(inode, outarg.attr.size); if (is_truncate) { /* NOTE: this may release/reacquire fc->lock */ @@ -1694,7 +1702,8 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, * Only call invalidate_inode_pages2() after removing * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock. */ - if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { + if ((is_truncate || !is_wb) && + S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { truncate_pagecache(inode, outarg.attr.size); invalidate_inode_pages2(inode->i_mapping); } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 4646157eb98c..5b863c08d0b9 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -675,9 +675,26 @@ static void fuse_short_read(struct fuse_req *req, struct inode *inode, u64 attr_ver) { size_t num_read = req->out.args[0].size; + struct fuse_conn *fc = get_fuse_conn(inode); - loff_t pos = page_offset(req->pages[0]) + num_read; - fuse_read_update_size(inode, pos, attr_ver); + if (fc->writeback_cache) { + /* + * A hole in a file. Some data after the hole are in page cache, + * but have not reached the client fs yet. So, the hole is not + * present there. + */ + int i; + int start_idx = num_read >> PAGE_CACHE_SHIFT; + size_t off = num_read & (PAGE_CACHE_SIZE - 1); + + for (i = start_idx; i < req->num_pages; i++) { + zero_user_segment(req->pages[i], off, PAGE_CACHE_SIZE); + off = 0; + } + } else { + loff_t pos = page_offset(req->pages[0]) + num_read; + fuse_read_update_size(inode, pos, attr_ver); + } } static int fuse_readpage(struct file *file, struct page *page) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 73f6bcb44ea8..22a78a6cc2a5 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -197,6 +197,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); + bool is_wb = fc->writeback_cache; loff_t oldsize; struct timespec old_mtime; @@ -211,10 +212,16 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, fuse_change_attributes_common(inode, attr, attr_valid); oldsize = inode->i_size; - i_size_write(inode, attr->size); + /* + * In case of writeback_cache enabled, the cached writes beyond EOF + * extend local i_size without keeping userspace server in sync. So, + * attr->size coming from server can be stale. We cannot trust it. + */ + if (!is_wb || !S_ISREG(inode->i_mode)) + i_size_write(inode, attr->size); spin_unlock(&fc->lock); - if (S_ISREG(inode->i_mode)) { + if (!is_wb && S_ISREG(inode->i_mode)) { bool inval = false; if (oldsize != attr->size) { From 9d48c62fbcbc8ce7c9cad37119d12fe0be61300c Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Thu, 26 Dec 2013 19:51:11 +0400 Subject: [PATCH 231/236] fuse: Trust kernel i_mtime only Let the kernel maintain i_mtime locally: - clear S_NOCMTIME - implement i_op->update_time() - flush mtime on fsync and last close - update i_mtime explicitly on truncate and fallocate Fuse inode flag FUSE_I_MTIME_DIRTY serves as indication that local i_mtime should be flushed to the server eventually. Signed-off-by: Maxim Patlasov Signed-off-by: Miklos Szeredi (cherry picked from commit b0aa760652179072119582375f8dc896ed5b5dfd) Signed-off-by: Alex Shi --- fs/fuse/dir.c | 108 +++++++++++++++++++++++++++++++++++++++-------- fs/fuse/file.c | 30 +++++++++++-- fs/fuse/fuse_i.h | 6 ++- fs/fuse/inode.c | 13 ++++-- 4 files changed, 132 insertions(+), 25 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 6fc83391e2e3..b4adb3d97d37 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -873,8 +873,11 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, struct fuse_conn *fc = get_fuse_conn(inode); /* see the comment in fuse_change_attributes() */ - if (fc->writeback_cache && S_ISREG(inode->i_mode)) + if (fc->writeback_cache && S_ISREG(inode->i_mode)) { attr->size = i_size_read(inode); + attr->mtime = inode->i_mtime.tv_sec; + attr->mtimensec = inode->i_mtime.tv_nsec; + } stat->dev = inode->i_sb->s_dev; stat->ino = attr->ino; @@ -1513,12 +1516,16 @@ static long fuse_dir_compat_ioctl(struct file *file, unsigned int cmd, FUSE_IOCTL_COMPAT | FUSE_IOCTL_DIR); } -static bool update_mtime(unsigned ivalid) +static bool update_mtime(unsigned ivalid, bool trust_local_mtime) { /* Always update if mtime is explicitly set */ if (ivalid & ATTR_MTIME_SET) return true; + /* Or if kernel i_mtime is the official one */ + if (trust_local_mtime) + return true; + /* If it's an open(O_TRUNC) or an ftruncate(), don't update */ if ((ivalid & ATTR_SIZE) && (ivalid & (ATTR_OPEN | ATTR_FILE))) return false; @@ -1527,7 +1534,8 @@ static bool update_mtime(unsigned ivalid) return true; } -static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg) +static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg, + bool trust_local_mtime) { unsigned ivalid = iattr->ia_valid; @@ -1546,11 +1554,11 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg) if (!(ivalid & ATTR_ATIME_SET)) arg->valid |= FATTR_ATIME_NOW; } - if ((ivalid & ATTR_MTIME) && update_mtime(ivalid)) { + if ((ivalid & ATTR_MTIME) && update_mtime(ivalid, trust_local_mtime)) { arg->valid |= FATTR_MTIME; arg->mtime = iattr->ia_mtime.tv_sec; arg->mtimensec = iattr->ia_mtime.tv_nsec; - if (!(ivalid & ATTR_MTIME_SET)) + if (!(ivalid & ATTR_MTIME_SET) && !trust_local_mtime) arg->valid |= FATTR_MTIME_NOW; } } @@ -1599,6 +1607,63 @@ void fuse_release_nowrite(struct inode *inode) spin_unlock(&fc->lock); } +static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_req *req, + struct inode *inode, + struct fuse_setattr_in *inarg_p, + struct fuse_attr_out *outarg_p) +{ + req->in.h.opcode = FUSE_SETATTR; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 1; + req->in.args[0].size = sizeof(*inarg_p); + req->in.args[0].value = inarg_p; + req->out.numargs = 1; + if (fc->minor < 9) + req->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE; + else + req->out.args[0].size = sizeof(*outarg_p); + req->out.args[0].value = outarg_p; +} + +/* + * Flush inode->i_mtime to the server + */ +int fuse_flush_mtime(struct file *file, bool nofail) +{ + struct inode *inode = file->f_mapping->host; + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req = NULL; + struct fuse_setattr_in inarg; + struct fuse_attr_out outarg; + int err; + + if (nofail) { + req = fuse_get_req_nofail_nopages(fc, file); + } else { + req = fuse_get_req_nopages(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + } + + memset(&inarg, 0, sizeof(inarg)); + memset(&outarg, 0, sizeof(outarg)); + + inarg.valid |= FATTR_MTIME; + inarg.mtime = inode->i_mtime.tv_sec; + inarg.mtimensec = inode->i_mtime.tv_nsec; + + fuse_setattr_fill(fc, req, inode, &inarg, &outarg); + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + + if (!err) + clear_bit(FUSE_I_MTIME_DIRTY, &fi->state); + + return err; +} + /* * Set attributes, and at the same time refresh them. * @@ -1619,6 +1684,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, bool is_wb = fc->writeback_cache; loff_t oldsize; int err; + bool trust_local_mtime = is_wb && S_ISREG(inode->i_mode); if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) attr->ia_valid |= ATTR_FORCE; @@ -1647,7 +1713,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); - iattr_to_fattr(attr, &inarg); + iattr_to_fattr(attr, &inarg, trust_local_mtime); if (file) { struct fuse_file *ff = file->private_data; inarg.valid |= FATTR_FH; @@ -1658,17 +1724,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, inarg.valid |= FATTR_LOCKOWNER; inarg.lock_owner = fuse_lock_owner_id(fc, current->files); } - req->in.h.opcode = FUSE_SETATTR; - req->in.h.nodeid = get_node_id(inode); - req->in.numargs = 1; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; - req->out.numargs = 1; - if (fc->minor < 9) - req->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE; - else - req->out.args[0].size = sizeof(outarg); - req->out.args[0].value = &outarg; + fuse_setattr_fill(fc, req, inode, &inarg, &outarg); fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); @@ -1685,6 +1741,12 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, } spin_lock(&fc->lock); + /* the kernel maintains i_mtime locally */ + if (trust_local_mtime && (attr->ia_valid & ATTR_MTIME)) { + inode->i_mtime = attr->ia_mtime; + clear_bit(FUSE_I_MTIME_DIRTY, &fi->state); + } + fuse_change_attributes_common(inode, &outarg.attr, attr_timeout(&outarg)); oldsize = inode->i_size; @@ -1915,6 +1977,17 @@ static int fuse_removexattr(struct dentry *entry, const char *name) return err; } +static int fuse_update_time(struct inode *inode, struct timespec *now, + int flags) +{ + if (flags & S_MTIME) { + inode->i_mtime = *now; + set_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state); + BUG_ON(!S_ISREG(inode->i_mode)); + } + return 0; +} + static const struct inode_operations fuse_dir_inode_operations = { .lookup = fuse_lookup, .mkdir = fuse_mkdir, @@ -1954,6 +2027,7 @@ static const struct inode_operations fuse_common_inode_operations = { .getxattr = fuse_getxattr, .listxattr = fuse_listxattr, .removexattr = fuse_removexattr, + .update_time = fuse_update_time, }; static const struct inode_operations fuse_symlink_inode_operations = { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 5b863c08d0b9..5916dc51599b 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -308,6 +308,9 @@ static int fuse_open(struct inode *inode, struct file *file) static int fuse_release(struct inode *inode, struct file *file) { + if (test_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state)) + fuse_flush_mtime(file, true); + fuse_release_common(file, FUSE_RELEASE); /* return value is ignored by VFS */ @@ -475,6 +478,12 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, fuse_sync_writes(inode); + if (test_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state)) { + int err = fuse_flush_mtime(file, false); + if (err) + goto out; + } + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) { err = PTR_ERR(req); @@ -960,16 +969,21 @@ static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io, return req->misc.write.out.size; } -void fuse_write_update_size(struct inode *inode, loff_t pos) +bool fuse_write_update_size(struct inode *inode, loff_t pos) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); + bool ret = false; spin_lock(&fc->lock); fi->attr_version = ++fc->attr_version; - if (pos > inode->i_size) + if (pos > inode->i_size) { i_size_write(inode, pos); + ret = true; + } spin_unlock(&fc->lock); + + return ret; } static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, @@ -2873,8 +2887,16 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, goto out; /* we could have extended the file */ - if (!(mode & FALLOC_FL_KEEP_SIZE)) - fuse_write_update_size(inode, offset + length); + if (!(mode & FALLOC_FL_KEEP_SIZE)) { + bool changed = fuse_write_update_size(inode, offset + length); + + if (changed && fc->writeback_cache) { + struct fuse_inode *fi = get_fuse_inode(inode); + + inode->i_mtime = current_fs_time(inode->i_sb); + set_bit(FUSE_I_MTIME_DIRTY, &fi->state); + } + } if (mode & FALLOC_FL_PUNCH_HOLE) truncate_pagecache_range(inode, offset, offset + length - 1); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 374a8be014fd..1e6ad6d43051 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -119,6 +119,8 @@ enum { FUSE_I_INIT_RDPLUS, /** An operation changing file size is in progress */ FUSE_I_SIZE_UNSTABLE, + /** i_mtime has been updated locally; a flush to userspace needed */ + FUSE_I_MTIME_DIRTY, }; struct fuse_conn; @@ -876,7 +878,9 @@ long fuse_ioctl_common(struct file *file, unsigned int cmd, unsigned fuse_file_poll(struct file *file, poll_table *wait); int fuse_dev_release(struct inode *inode, struct file *file); -void fuse_write_update_size(struct inode *inode, loff_t pos); +bool fuse_write_update_size(struct inode *inode, loff_t pos); + +int fuse_flush_mtime(struct file *file, bool nofail); int fuse_do_setattr(struct inode *inode, struct iattr *attr, struct file *file); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 22a78a6cc2a5..338da5216e63 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -170,8 +170,11 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, inode->i_blocks = attr->blocks; inode->i_atime.tv_sec = attr->atime; inode->i_atime.tv_nsec = attr->atimensec; - inode->i_mtime.tv_sec = attr->mtime; - inode->i_mtime.tv_nsec = attr->mtimensec; + /* mtime from server may be stale due to local buffered write */ + if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) { + inode->i_mtime.tv_sec = attr->mtime; + inode->i_mtime.tv_nsec = attr->mtimensec; + } inode->i_ctime.tv_sec = attr->ctime; inode->i_ctime.tv_nsec = attr->ctimensec; @@ -250,6 +253,8 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) { inode->i_mode = attr->mode & S_IFMT; inode->i_size = attr->size; + inode->i_mtime.tv_sec = attr->mtime; + inode->i_mtime.tv_nsec = attr->mtimensec; if (S_ISREG(inode->i_mode)) { fuse_init_common(inode); fuse_init_file_inode(inode); @@ -296,7 +301,9 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, return NULL; if ((inode->i_state & I_NEW)) { - inode->i_flags |= S_NOATIME|S_NOCMTIME; + inode->i_flags |= S_NOATIME; + if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) + inode->i_flags |= S_NOCMTIME; inode->i_generation = generation; inode->i_data.backing_dev_info = &fc->bdi; fuse_init_inode(inode, attr); From 6b1c8b7974158eabc822d1a01f528f95793219ae Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Mon, 28 Apr 2014 14:19:24 +0200 Subject: [PATCH 232/236] fuse: trust kernel i_ctime only Let the kernel maintain i_ctime locally: update i_ctime explicitly on truncate, fallocate, open(O_TRUNC), setxattr, removexattr, link, rename, unlink. The inode flag I_DIRTY_SYNC serves as indication that local i_ctime should be flushed to the server eventually. The patch sets the flag and updates i_ctime in course of operations listed above. Signed-off-by: Maxim Patlasov Signed-off-by: Miklos Szeredi (cherry picked from commit 31f3267b4ba16b12fb9dd3b1953ea0f221cc2ab4) Signed-off-by: Alex Shi --- fs/fuse/dir.c | 22 ++++++++++++++++++++-- fs/fuse/inode.c | 6 ++++-- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index b4adb3d97d37..6d5e45c0db77 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -680,6 +680,14 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry, return create_new_entry(fc, req, dir, entry, S_IFLNK); } +static inline void fuse_update_ctime(struct inode *inode) +{ + if (!IS_NOCMTIME(inode)) { + inode->i_ctime = current_fs_time(inode->i_sb); + mark_inode_dirty_sync(inode); + } +} + static int fuse_unlink(struct inode *dir, struct dentry *entry) { int err; @@ -714,6 +722,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) fuse_invalidate_attr(inode); fuse_invalidate_attr(dir); fuse_invalidate_entry_cache(entry); + fuse_update_ctime(inode); } else if (err == -EINTR) fuse_invalidate_entry(entry); return err; @@ -772,6 +781,7 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, if (!err) { /* ctime changes */ fuse_invalidate_attr(oldent->d_inode); + fuse_update_ctime(oldent->d_inode); fuse_invalidate_attr(olddir); if (olddir != newdir) @@ -781,6 +791,7 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, if (newent->d_inode) { fuse_invalidate_attr(newent->d_inode); fuse_invalidate_entry_cache(newent); + fuse_update_ctime(newent->d_inode); } } else if (err == -EINTR) { /* If request was interrupted, DEITY only knows if the @@ -860,6 +871,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, inc_nlink(inode); spin_unlock(&fc->lock); fuse_invalidate_attr(inode); + fuse_update_ctime(inode); } else if (err == -EINTR) { fuse_invalidate_attr(inode); } @@ -877,6 +889,8 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, attr->size = i_size_read(inode); attr->mtime = inode->i_mtime.tv_sec; attr->mtimensec = inode->i_mtime.tv_nsec; + attr->ctime = inode->i_ctime.tv_sec; + attr->ctimensec = inode->i_ctime.tv_nsec; } stat->dev = inode->i_sb->s_dev; @@ -1841,8 +1855,10 @@ static int fuse_setxattr(struct dentry *entry, const char *name, fc->no_setxattr = 1; err = -EOPNOTSUPP; } - if (!err) + if (!err) { fuse_invalidate_attr(inode); + fuse_update_ctime(inode); + } return err; } @@ -1972,8 +1988,10 @@ static int fuse_removexattr(struct dentry *entry, const char *name) fc->no_removexattr = 1; err = -EOPNOTSUPP; } - if (!err) + if (!err) { fuse_invalidate_attr(inode); + fuse_update_ctime(inode); + } return err; } diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 338da5216e63..8fe0b485908c 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -174,9 +174,9 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) { inode->i_mtime.tv_sec = attr->mtime; inode->i_mtime.tv_nsec = attr->mtimensec; + inode->i_ctime.tv_sec = attr->ctime; + inode->i_ctime.tv_nsec = attr->ctimensec; } - inode->i_ctime.tv_sec = attr->ctime; - inode->i_ctime.tv_nsec = attr->ctimensec; if (attr->blksize != 0) inode->i_blkbits = ilog2(attr->blksize); @@ -255,6 +255,8 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) inode->i_size = attr->size; inode->i_mtime.tv_sec = attr->mtime; inode->i_mtime.tv_nsec = attr->mtimensec; + inode->i_ctime.tv_sec = attr->ctime; + inode->i_ctime.tv_nsec = attr->ctimensec; if (S_ISREG(inode->i_mode)) { fuse_init_common(inode); fuse_init_file_inode(inode); From e03ae8b69bb899c5cdd0c594fa58a85cd180df1b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 28 Apr 2014 16:43:44 +0200 Subject: [PATCH 233/236] fuse: add renameat2 support Support RENAME_EXCHANGE and RENAME_NOREPLACE flags on the userspace ABI. Signed-off-by: Miklos Szeredi (cherry picked from commit 1560c974dcd40a8d3f193283acd7cc6aee13dc13) Signed-off-by: Alex Shi Conflicts: fs/fuse/dir.c include/uapi/linux/fuse.h --- fs/fuse/dir.c | 24 ++++++++++++++++-------- fs/fuse/fuse_i.h | 3 +++ include/uapi/linux/fuse.h | 10 ++++++++++ 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 6d5e45c0db77..33dec83eccfb 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -753,23 +753,26 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) return err; } -static int fuse_rename(struct inode *olddir, struct dentry *oldent, - struct inode *newdir, struct dentry *newent) +static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent, + unsigned int flags, int opcode, size_t argsize) { int err; - struct fuse_rename_in inarg; + struct fuse_rename2_in inarg; struct fuse_conn *fc = get_fuse_conn(olddir); - struct fuse_req *req = fuse_get_req_nopages(fc); + struct fuse_req *req; + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); - memset(&inarg, 0, sizeof(inarg)); + memset(&inarg, 0, argsize); inarg.newdir = get_node_id(newdir); - req->in.h.opcode = FUSE_RENAME; + inarg.flags = flags; + req->in.h.opcode = opcode; req->in.h.nodeid = get_node_id(olddir); req->in.numargs = 3; - req->in.args[0].size = sizeof(inarg); + req->in.args[0].size = argsize; req->in.args[0].value = &inarg; req->in.args[1].size = oldent->d_name.len + 1; req->in.args[1].value = oldent->d_name.name; @@ -783,12 +786,17 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, fuse_invalidate_attr(oldent->d_inode); fuse_update_ctime(oldent->d_inode); + if (flags & RENAME_EXCHANGE) { + fuse_invalidate_attr(newent->d_inode); + fuse_update_ctime(newent->d_inode); + } + fuse_invalidate_attr(olddir); if (olddir != newdir) fuse_invalidate_attr(newdir); /* newent will end up negative */ - if (newent->d_inode) { + if (!(flags & RENAME_EXCHANGE) && newent->d_inode) { fuse_invalidate_attr(newent->d_inode); fuse_invalidate_entry_cache(newent); fuse_update_ctime(newent->d_inode); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 1e6ad6d43051..7cc58c976780 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -544,6 +544,9 @@ struct fuse_conn { /** Is fallocate not implemented by fs? */ unsigned no_fallocate:1; + /** Is rename with flags implemented by fs? */ + unsigned no_rename2:1; + /** Use enhanced/automatic page cache invalidation. */ unsigned auto_inval_data:1; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 60bb2f9f7b74..b77cf381a44b 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -93,6 +93,9 @@ * * 7.22 * - add FUSE_ASYNC_DIO + * + * 7.23 + * - add FUSE_RENAME2 request */ #ifndef _LINUX_FUSE_H @@ -343,6 +346,7 @@ enum fuse_opcode { FUSE_BATCH_FORGET = 42, FUSE_FALLOCATE = 43, FUSE_READDIRPLUS = 44, + FUSE_RENAME2 = 45, /* CUSE specific operations */ CUSE_INIT = 4096, @@ -421,6 +425,12 @@ struct fuse_rename_in { uint64_t newdir; }; +struct fuse_rename2_in { + uint64_t newdir; + uint32_t flags; + uint32_t padding; +}; + struct fuse_link_in { uint64_t oldnodeid; }; From a519f1a554cdeea7aaddce8cfa177e453a273955 Mon Sep 17 00:00:00 2001 From: "Gary S. Robertson" Date: Mon, 13 Apr 2015 11:31:30 -0500 Subject: [PATCH 234/236] KVM:EVENTFD: fix implicit declaration on x86/_64 During compilation for x86 and x86_64, received the following warning: virt/kvm/eventfd.c:493:2: error: implicit declaration of function 'kvm_vcpu_request_scan_ioapic' [-Werror=implicit-function-declaration] kvm_vcpu_request_scan_ioapic(kvm); In the kernel.org linux-stable tree, commit 29f1b65 KVM:EVENTFD: Remove inclusion of irq.h by Christoffer Dall addresses this issue. This backport incudes those portions of that patch which are relevant in the 3.14 tree. Acked-by: Christoffer Dall Signed-off-by: Gary S. Robertson Signed-off-by: Kevin Hilman --- virt/kvm/eventfd.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 0ab1411eb007..ce3ca4c54534 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -36,6 +36,9 @@ #include #include +#ifdef __KVM_HAVE_IOAPIC +#include "ioapic.h" +#endif #include "iodev.h" #ifdef CONFIG_HAVE_KVM_IRQFD From 69c11a1079700f80424805d8a40dabe436d37b46 Mon Sep 17 00:00:00 2001 From: "Gary S. Robertson" Date: Mon, 13 Apr 2015 11:31:31 -0500 Subject: [PATCH 235/236] KVM: VFIO: resolve redeclaration error on x86/_64 Compilation for x86 /x86_64 resulted in the following error: virt/kvm/vfio.c:238:30: error: static declaration of 'kvm_vfio_ops' follows non-static declaration static struct kvm_device_ops kvm_vfio_ops include/linux/kvm_host.h:1057:30: note: previous declaration of 'kvm_vfio_ops' was here extern struct kvm_device_ops kvm_vfio_ops; In the kernel.org linux-stable tree, commit 80ce163 KVM: VFIO: register kvm_device_ops dynamically by Will Deacon includes the removal of the declaration from include/linux/kvm_host.h, but that portion of the change does not seem to have made it into the 3.14 tree - while the rest of that commit does seem to have been backported. This patch resolves that omission to fix the x86 compilation problems. Acked-by: Christoffer Dall Signed-off-by: Gary S. Robertson Signed-off-by: Kevin Hilman --- include/linux/kvm_host.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 55d75a7cba73..bf933ba17529 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1054,7 +1054,6 @@ int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type); extern struct kvm_device_ops kvm_mpic_ops; extern struct kvm_device_ops kvm_xics_ops; -extern struct kvm_device_ops kvm_vfio_ops; #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT From 1dcd9896384ec3fe5c75829701d9babec7fe0d25 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 15 Jan 2015 16:42:14 +0000 Subject: [PATCH 236/236] arm64: respect mem= for EFI When booting with EFI, we acquire the EFI memory map after parsing the early params. This unfortuantely renders the option useless as we call memblock_enforce_memory_limit (which uses memblock_remove_range behind the scenes) before we've added any memblocks. We end up removing nothing, then adding all of memory later when efi_init calls reserve_regions. Instead, we can log the limit and apply this later when we do the rest of the memblock work in memblock_init, which should work regardless of the presence of EFI. At the same time we may as well move the early parameter into arm64's mm/init.c, close to arm64_memblock_init. Any memory which must be mapped (e.g. for use by EFI runtime services) must be mapped explicitly reather than relying on the linear mapping, which may be truncated as a result of a mem= option passed on the kernel command line. Signed-off-by: Mark Rutland Acked-by: Catalin Marinas Acked-by: Ard Biesheuvel Tested-by: Ard Biesheuvel Cc: Leif Lindholm Cc: Will Deacon Signed-off-by: Catalin Marinas (cherry picked from commit 6083fe74b7bfffc2c7be8c711596608bda0cda6e) Signed-off-by: Alex Shi --- arch/arm64/kernel/setup.c | 19 ------------------- arch/arm64/mm/init.c | 19 +++++++++++++++++++ 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 90c4c129f71a..dff33006322b 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -310,25 +310,6 @@ static void __init setup_machine_fdt(phys_addr_t dt_phys) } } -/* - * Limit the memory size that was specified via FDT. - */ -static int __init early_mem(char *p) -{ - phys_addr_t limit; - - if (!p) - return 1; - - limit = memparse(p, &p) & PAGE_MASK; - pr_notice("Memory limited to %lldMB\n", limit >> 20); - - memblock_enforce_memory_limit(limit); - - return 0; -} -early_param("mem", early_mem); - static void __init request_standard_resources(void) { struct memblock_region *region; diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index cc3339d0a276..2d8f884d7f92 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -124,10 +124,29 @@ static void arm64_memory_present(void) } #endif +static phys_addr_t memory_limit = (phys_addr_t)ULLONG_MAX; + +/* + * Limit the memory size that was specified via FDT. + */ +static int __init early_mem(char *p) +{ + if (!p) + return 1; + + memory_limit = memparse(p, &p) & PAGE_MASK; + pr_notice("Memory limited to %lldMB\n", memory_limit >> 20); + + return 0; +} +early_param("mem", early_mem); + void __init arm64_memblock_init(void) { u64 *reserve_map, base, size; + memblock_enforce_memory_limit(memory_limit); + /* * Register the kernel text, kernel data, initrd, and initial * pagetables with memblock.