From d53f1cd82265050136064b5638235d69af1052dc Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Thu, 31 Oct 2013 09:09:17 +0800 Subject: [PATCH 01/94] ACPICA: Interpreter: Fix Store() when implicit conversion is not possible. commit 3f654bad3257427bea7ba1c4d43a23d99a03622b upstream. For the cases such as a store of a string to an existing package object, implement the store as a CopyObject(). This is a small departure from the ACPI specification which states that the control method should be aborted in this case. However, ASLTS suite depends on this behavior. Signed-off-by: Bob Moore Signed-off-by: Lv Zheng Signed-off-by: Rafael J. Wysocki Signed-off-by: Kamal Mostafa --- drivers/acpi/acpica/exstore.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/drivers/acpi/acpica/exstore.c b/drivers/acpi/acpica/exstore.c index 90431f12f831..4ff37e8e0018 100644 --- a/drivers/acpi/acpica/exstore.c +++ b/drivers/acpi/acpica/exstore.c @@ -487,14 +487,33 @@ acpi_ex_store_object_to_node(union acpi_operand_object *source_desc, default: ACPI_DEBUG_PRINT((ACPI_DB_EXEC, - "Storing %s (%p) directly into node (%p) with no implicit conversion\n", + "Storing [%s] (%p) directly into node [%s] (%p)" + " with no implicit conversion\n", acpi_ut_get_object_type_name(source_desc), - source_desc, node)); + source_desc, + acpi_ut_get_object_type_name(target_desc), + node)); - /* No conversions for all other types. Just attach the source object */ + /* + * No conversions for all other types. Directly store a copy of + * the source object. NOTE: This is a departure from the ACPI + * spec, which states "If conversion is impossible, abort the + * running control method". + * + * This code implements "If conversion is impossible, treat the + * Store operation as a CopyObject". + */ + status = + acpi_ut_copy_iobject_to_iobject(source_desc, &new_desc, + walk_state); + if (ACPI_FAILURE(status)) { + return_ACPI_STATUS(status); + } - status = acpi_ns_attach_object(node, source_desc, - source_desc->common.type); + status = + acpi_ns_attach_object(node, new_desc, + new_desc->common.type); + acpi_ut_remove_reference(new_desc); break; } From 2415da8f65960e9816fe611a62f751d7770222fe Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Thu, 31 Oct 2013 09:09:41 +0800 Subject: [PATCH 02/94] ACPICA: DeRefOf operator: Update to fully resolve FieldUnit and BufferField refs. commit 63660e05ec719613b518547b40a1c501c10f0bc4 upstream. Previously, references to these objects were resolved only to the actual FieldUnit or BufferField object. The correct behavior is to resolve these references to an actual value. The problem is that DerefOf did not resolve these objects to actual values. An "Integer" object is simple, return the value. But a field in an operation region will require a read operation. For a BufferField, the appropriate data must be extracted from the parent buffer. NOTE: It appears that this issues is present in Windows7 but not Windows8. Signed-off-by: Bob Moore Signed-off-by: Lv Zheng Signed-off-by: Rafael J. Wysocki Signed-off-by: Kamal Mostafa --- drivers/acpi/acpica/exoparg1.c | 37 ++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/drivers/acpi/acpica/exoparg1.c b/drivers/acpi/acpica/exoparg1.c index bbf01e9bf057..1fa1ad67e6b8 100644 --- a/drivers/acpi/acpica/exoparg1.c +++ b/drivers/acpi/acpica/exoparg1.c @@ -997,11 +997,40 @@ acpi_status acpi_ex_opcode_1A_0T_1R(struct acpi_walk_state *walk_state) acpi_namespace_node *) return_desc); + if (!return_desc) { + break; + } + + /* + * June 2013: + * buffer_fields/field_units require additional resolution + */ + switch (return_desc->common.type) { + case ACPI_TYPE_BUFFER_FIELD: + case ACPI_TYPE_LOCAL_REGION_FIELD: + case ACPI_TYPE_LOCAL_BANK_FIELD: + case ACPI_TYPE_LOCAL_INDEX_FIELD: + + status = + acpi_ex_read_data_from_field + (walk_state, return_desc, + &temp_desc); + if (ACPI_FAILURE(status)) { + goto cleanup; + } + + return_desc = temp_desc; + break; + + default: + + /* Add another reference to the object */ + + acpi_ut_add_reference + (return_desc); + break; + } } - - /* Add another reference to the object! */ - - acpi_ut_add_reference(return_desc); break; default: From 61da5e0d37785164724ebf9eb7a4a2c47652c406 Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Thu, 31 Oct 2013 09:10:02 +0800 Subject: [PATCH 03/94] ACPICA: Return error if DerefOf resolves to a null package element. commit a50abf4842dd7d603a2ad6dcc7f1467fd2a66f03 upstream. Disallow the dereference of a reference (via index) to an uninitialized package element. Provides compatibility with other ACPI implementations. ACPICA BZ 1003. References: https://bugs.acpica.org/show_bug.cgi?id=431 Signed-off-by: Bob Moore Signed-off-by: Lv Zheng Signed-off-by: Rafael J. Wysocki Signed-off-by: Kamal Mostafa --- drivers/acpi/acpica/exoparg1.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/acpi/acpica/exoparg1.c b/drivers/acpi/acpica/exoparg1.c index 1fa1ad67e6b8..c9f1a21adacf 100644 --- a/drivers/acpi/acpica/exoparg1.c +++ b/drivers/acpi/acpica/exoparg1.c @@ -969,10 +969,17 @@ acpi_status acpi_ex_opcode_1A_0T_1R(struct acpi_walk_state *walk_state) */ return_desc = *(operand[0]->reference.where); - if (return_desc) { - acpi_ut_add_reference - (return_desc); + if (!return_desc) { + /* + * Element is NULL, do not allow the dereference. + * This provides compatibility with other ACPI + * implementations. + */ + return_ACPI_STATUS + (AE_AML_UNINITIALIZED_ELEMENT); } + + acpi_ut_add_reference(return_desc); break; default: From e991e2b8a570367da1ca1a8cebc3daaff23a71f1 Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Thu, 31 Oct 2013 09:10:22 +0800 Subject: [PATCH 04/94] ACPICA: Fix for a Store->ArgX when ArgX contains a reference to a field. commit 4be4be8fee2ee99a52f94f90d03d2f287ee1db86 upstream. This change fixes a problem where a Store operation to an ArgX object that contained a reference to a field object did not complete the automatic dereference and then write to the actual field object. Instead, the object type of the field object was inadvertently changed to match the type of the source operand. The new behavior will actually write to the field object (buffer field or field unit), thus matching the correct ACPI-defined behavior. Signed-off-by: Bob Moore Signed-off-by: Rafael J. Wysocki Signed-off-by: Lv Zheng Signed-off-by: Kamal Mostafa --- drivers/acpi/acpica/exstore.c | 166 +++++++++++++++++++++------------- 1 file changed, 102 insertions(+), 64 deletions(-) diff --git a/drivers/acpi/acpica/exstore.c b/drivers/acpi/acpica/exstore.c index 4ff37e8e0018..cd7079da1dce 100644 --- a/drivers/acpi/acpica/exstore.c +++ b/drivers/acpi/acpica/exstore.c @@ -57,6 +57,11 @@ acpi_ex_store_object_to_index(union acpi_operand_object *val_desc, union acpi_operand_object *dest_desc, struct acpi_walk_state *walk_state); +static acpi_status +acpi_ex_store_direct_to_node(union acpi_operand_object *source_desc, + struct acpi_namespace_node *node, + struct acpi_walk_state *walk_state); + /******************************************************************************* * * FUNCTION: acpi_ex_store @@ -376,7 +381,11 @@ acpi_ex_store_object_to_index(union acpi_operand_object *source_desc, * When storing into an object the data is converted to the * target object type then stored in the object. This means * that the target object type (for an initialized target) will - * not be changed by a store operation. + * not be changed by a store operation. A copy_object can change + * the target type, however. + * + * The implicit_conversion flag is set to NO/FALSE only when + * storing to an arg_x -- as per the rules of the ACPI spec. * * Assumes parameters are already validated. * @@ -400,7 +409,7 @@ acpi_ex_store_object_to_node(union acpi_operand_object *source_desc, target_type = acpi_ns_get_type(node); target_desc = acpi_ns_get_attached_object(node); - ACPI_DEBUG_PRINT((ACPI_DB_EXEC, "Storing %p(%s) into node %p(%s)\n", + ACPI_DEBUG_PRINT((ACPI_DB_EXEC, "Storing %p (%s) to node %p (%s)\n", source_desc, acpi_ut_get_object_type_name(source_desc), node, acpi_ut_get_type_name(target_type))); @@ -414,46 +423,31 @@ acpi_ex_store_object_to_node(union acpi_operand_object *source_desc, return_ACPI_STATUS(status); } - /* If no implicit conversion, drop into the default case below */ - - if ((!implicit_conversion) || - ((walk_state->opcode == AML_COPY_OP) && - (target_type != ACPI_TYPE_LOCAL_REGION_FIELD) && - (target_type != ACPI_TYPE_LOCAL_BANK_FIELD) && - (target_type != ACPI_TYPE_LOCAL_INDEX_FIELD))) { - /* - * Force execution of default (no implicit conversion). Note: - * copy_object does not perform an implicit conversion, as per the ACPI - * spec -- except in case of region/bank/index fields -- because these - * objects must retain their original type permanently. - */ - target_type = ACPI_TYPE_ANY; - } - /* Do the actual store operation */ switch (target_type) { - case ACPI_TYPE_BUFFER_FIELD: - case ACPI_TYPE_LOCAL_REGION_FIELD: - case ACPI_TYPE_LOCAL_BANK_FIELD: - case ACPI_TYPE_LOCAL_INDEX_FIELD: - - /* For fields, copy the source data to the target field. */ - - status = acpi_ex_write_data_to_field(source_desc, target_desc, - &walk_state->result_obj); - break; - case ACPI_TYPE_INTEGER: case ACPI_TYPE_STRING: case ACPI_TYPE_BUFFER: /* - * These target types are all of type Integer/String/Buffer, and - * therefore support implicit conversion before the store. - * - * Copy and/or convert the source object to a new target object + * The simple data types all support implicit source operand + * conversion before the store. */ + + if ((walk_state->opcode == AML_COPY_OP) || !implicit_conversion) { + /* + * However, copy_object and Stores to arg_x do not perform + * an implicit conversion, as per the ACPI specification. + * A direct store is performed instead. + */ + status = acpi_ex_store_direct_to_node(source_desc, node, + walk_state); + break; + } + + /* Store with implicit source operand conversion support */ + status = acpi_ex_store_object_to_object(source_desc, target_desc, &new_desc, walk_state); @@ -467,13 +461,12 @@ acpi_ex_store_object_to_node(union acpi_operand_object *source_desc, * the Name's type to that of the value being stored in it. * source_desc reference count is incremented by attach_object. * - * Note: This may change the type of the node if an explicit store - * has been performed such that the node/object type has been - * changed. + * Note: This may change the type of the node if an explicit + * store has been performed such that the node/object type + * has been changed. */ - status = - acpi_ns_attach_object(node, new_desc, - new_desc->common.type); + status = acpi_ns_attach_object(node, new_desc, + new_desc->common.type); ACPI_DEBUG_PRINT((ACPI_DB_EXEC, "Store %s into %s via Convert/Attach\n", @@ -484,38 +477,83 @@ acpi_ex_store_object_to_node(union acpi_operand_object *source_desc, } break; + case ACPI_TYPE_BUFFER_FIELD: + case ACPI_TYPE_LOCAL_REGION_FIELD: + case ACPI_TYPE_LOCAL_BANK_FIELD: + case ACPI_TYPE_LOCAL_INDEX_FIELD: + /* + * For all fields, always write the source data to the target + * field. Any required implicit source operand conversion is + * performed in the function below as necessary. Note, field + * objects must retain their original type permanently. + */ + status = acpi_ex_write_data_to_field(source_desc, target_desc, + &walk_state->result_obj); + break; + default: - - ACPI_DEBUG_PRINT((ACPI_DB_EXEC, - "Storing [%s] (%p) directly into node [%s] (%p)" - " with no implicit conversion\n", - acpi_ut_get_object_type_name(source_desc), - source_desc, - acpi_ut_get_object_type_name(target_desc), - node)); - /* * No conversions for all other types. Directly store a copy of - * the source object. NOTE: This is a departure from the ACPI - * spec, which states "If conversion is impossible, abort the - * running control method". + * the source object. This is the ACPI spec-defined behavior for + * the copy_object operator. * - * This code implements "If conversion is impossible, treat the - * Store operation as a CopyObject". + * NOTE: For the Store operator, this is a departure from the + * ACPI spec, which states "If conversion is impossible, abort + * the running control method". Instead, this code implements + * "If conversion is impossible, treat the Store operation as + * a CopyObject". */ - status = - acpi_ut_copy_iobject_to_iobject(source_desc, &new_desc, - walk_state); - if (ACPI_FAILURE(status)) { - return_ACPI_STATUS(status); - } - - status = - acpi_ns_attach_object(node, new_desc, - new_desc->common.type); - acpi_ut_remove_reference(new_desc); + status = acpi_ex_store_direct_to_node(source_desc, node, + walk_state); break; } return_ACPI_STATUS(status); } + +/******************************************************************************* + * + * FUNCTION: acpi_ex_store_direct_to_node + * + * PARAMETERS: source_desc - Value to be stored + * node - Named object to receive the value + * walk_state - Current walk state + * + * RETURN: Status + * + * DESCRIPTION: "Store" an object directly to a node. This involves a copy + * and an attach. + * + ******************************************************************************/ + +static acpi_status +acpi_ex_store_direct_to_node(union acpi_operand_object *source_desc, + struct acpi_namespace_node *node, + struct acpi_walk_state *walk_state) +{ + acpi_status status; + union acpi_operand_object *new_desc; + + ACPI_FUNCTION_TRACE(ex_store_direct_to_node); + + ACPI_DEBUG_PRINT((ACPI_DB_EXEC, + "Storing [%s] (%p) directly into node [%s] (%p)" + " with no implicit conversion\n", + acpi_ut_get_object_type_name(source_desc), + source_desc, acpi_ut_get_type_name(node->type), + node)); + + /* Copy the source object to a new object */ + + status = + acpi_ut_copy_iobject_to_iobject(source_desc, &new_desc, walk_state); + if (ACPI_FAILURE(status)) { + return_ACPI_STATUS(status); + } + + /* Attach the new object to the node */ + + status = acpi_ns_attach_object(node, new_desc, new_desc->common.type); + acpi_ut_remove_reference(new_desc); + return_ACPI_STATUS(status); +} From a038dd392a7ae48e7bb7ad72adb19a22be36785b Mon Sep 17 00:00:00 2001 From: Kamal Mostafa Date: Fri, 1 Nov 2013 10:31:53 -0700 Subject: [PATCH 05/94] vxlan: fix ip_select_ident skb parameter [3.8-stable only] Fix a call to ip_select_ident() that I missed in commit 1a3365ee55bc5b7e9a752e3a535fd983714d8db2, which is the 3.8 backport of commit 703133de331a7a7df47f31fb9de51dc6f68a9de8 upstream. Cc: Ansis Atteka Cc: David S. Miller Signed-off-by: Kamal Mostafa --- drivers/net/vxlan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 9b2cc0c00035..cb6f52902cea 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -977,7 +977,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) /* See iptunnel_xmit() */ if (skb->ip_summed != CHECKSUM_PARTIAL) skb->ip_summed = CHECKSUM_NONE; - ip_select_ident(iph, &rt->dst, NULL); + ip_select_ident(skb, &rt->dst, NULL); err = ip_local_out(skb); if (likely(net_xmit_eval(err) == 0)) { From 6228c4edf9096509513fd6570d0f1cd43b8ab411 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 27 Aug 2013 05:46:32 -0700 Subject: [PATCH 06/94] tcp: TSO packets automatic sizing commit 95bd09eb27507691520d39ee1044d6ad831c1168 upstream. commit 02cf4ebd82ff0ac7254b88e466820a290ed8289a upstream. commit 7eec4174ff29cd42f2acfae8112f51c228545d40 upstream. After hearing many people over past years complaining against TSO being bursty or even buggy, we are proud to present automatic sizing of TSO packets. One part of the problem is that tcp_tso_should_defer() uses an heuristic relying on upcoming ACKS instead of a timer, but more generally, having big TSO packets makes little sense for low rates, as it tends to create micro bursts on the network, and general consensus is to reduce the buffering amount. This patch introduces a per socket sk_pacing_rate, that approximates the current sending rate, and allows us to size the TSO packets so that we try to send one packet every ms. This field could be set by other transports. Patch has no impact for high speed flows, where having large TSO packets makes sense to reach line rate. For other flows, this helps better packet scheduling and ACK clocking. This patch increases performance of TCP flows in lossy environments. A new sysctl (tcp_min_tso_segs) is added, to specify the minimal size of a TSO packet (default being 2). A follow-up patch will provide a new packet scheduler (FQ), using sk_pacing_rate as an input to perform optional per flow pacing. This explains why we chose to set sk_pacing_rate to twice the current rate, allowing 'slow start' ramp up. sk_pacing_rate = 2 * cwnd * mss / srtt v2: Neal Cardwell reported a suspect deferring of last two segments on initial write of 10 MSS, I had to change tcp_tso_should_defer() to take into account tp->xmit_size_goal_segs Signed-off-by: Eric Dumazet Cc: Neal Cardwell Cc: Yuchung Cheng Cc: Van Jacobson Cc: Tom Herbert Acked-by: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller [ kamal: backport to 3.8 (context) ] Signed-off-by: Kamal Mostafa --- Documentation/networking/ip-sysctl.txt | 9 +++++++ include/net/sock.h | 2 ++ include/net/tcp.h | 1 + net/core/sock.c | 1 + net/ipv4/sysctl_net_ipv4.c | 10 ++++++++ net/ipv4/tcp.c | 28 +++++++++++++++++---- net/ipv4/tcp_input.c | 35 +++++++++++++++++++++++++- net/ipv4/tcp_output.c | 2 +- 8 files changed, 81 insertions(+), 7 deletions(-) diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index dbca66182089..62b9a6196e43 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -510,6 +510,15 @@ tcp_syn_retries - INTEGER tcp_timestamps - BOOLEAN Enable timestamps as defined in RFC1323. +tcp_min_tso_segs - INTEGER + Minimal number of segments per TSO frame. + Since linux-3.12, TCP does an automatic sizing of TSO frames, + depending on flow rate, instead of filling 64Kbytes packets. + For specific usages, it's possible to force TCP to build big + TSO frames. Note that TCP stack might split too big TSO packets + if available window is too small. + Default: 2 + tcp_tso_win_divisor - INTEGER This allows control over what percentage of the congestion window can be consumed by a single TSO frame. diff --git a/include/net/sock.h b/include/net/sock.h index 873abca5da16..94871ccc7a8b 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -228,6 +228,7 @@ struct cg_proto; * @sk_wmem_queued: persistent queue size * @sk_forward_alloc: space allocated forward * @sk_allocation: allocation mode + * @sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler) * @sk_sndbuf: size of send buffer in bytes * @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings @@ -352,6 +353,7 @@ struct sock { kmemcheck_bitfield_end(flags); int sk_wmem_queued; gfp_t sk_allocation; + u32 sk_pacing_rate; /* bytes per second */ netdev_features_t sk_route_caps; netdev_features_t sk_route_nocaps; int sk_gso_type; diff --git a/include/net/tcp.h b/include/net/tcp.h index 4da2167a349e..45f3368ed48e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -292,6 +292,7 @@ extern int sysctl_tcp_thin_dupack; extern int sysctl_tcp_early_retrans; extern int sysctl_tcp_limit_output_bytes; extern int sysctl_tcp_challenge_ack_limit; +extern int sysctl_tcp_min_tso_segs; extern atomic_long_t tcp_memory_allocated; extern struct percpu_counter tcp_sockets_allocated; diff --git a/net/core/sock.c b/net/core/sock.c index b8af814cc246..fc0d751da865 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2258,6 +2258,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_stamp = ktime_set(-1L, 0); + sk->sk_pacing_rate = ~0U; /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index d22765db4207..ae245cb38e19 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -29,6 +29,7 @@ static int zero; static int one = 1; static int two = 2; +static int gso_max_segs = GSO_MAX_SEGS; static int tcp_retr1_max = 255; static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_max[] = { 65535, 65535 }; @@ -780,6 +781,15 @@ static struct ctl_table ipv4_table[] = { .extra1 = &zero, .extra2 = &two, }, + { + .procname = "tcp_min_tso_segs", + .data = &sysctl_tcp_min_tso_segs, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &gso_max_segs, + }, { .procname = "udp_mem", .data = &sysctl_udp_mem, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8e7954255e4a..c2aaeab7ccba 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -282,6 +282,8 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; +int sysctl_tcp_min_tso_segs __read_mostly = 2; + struct percpu_counter tcp_orphan_count; EXPORT_SYMBOL_GPL(tcp_orphan_count); @@ -793,12 +795,28 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, xmit_size_goal = mss_now; if (large_allowed && sk_can_gso(sk)) { - xmit_size_goal = ((sk->sk_gso_max_size - 1) - - inet_csk(sk)->icsk_af_ops->net_header_len - - inet_csk(sk)->icsk_ext_hdr_len - - tp->tcp_header_len); + u32 gso_size, hlen; - /* TSQ : try to have two TSO segments in flight */ + /* Maybe we should/could use sk->sk_prot->max_header here ? */ + hlen = inet_csk(sk)->icsk_af_ops->net_header_len + + inet_csk(sk)->icsk_ext_hdr_len + + tp->tcp_header_len; + + /* Goal is to send at least one packet per ms, + * not one big TSO packet every 100 ms. + * This preserves ACK clocking and is consistent + * with tcp_tso_should_defer() heuristic. + */ + gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC); + gso_size = max_t(u32, gso_size, + sysctl_tcp_min_tso_segs * mss_now); + + xmit_size_goal = min_t(u32, gso_size, + sk->sk_gso_max_size - 1 - hlen); + + /* TSQ : try to have at least two segments in flight + * (one in NIC TX ring, another in Qdisc) + */ xmit_size_goal = min_t(u32, xmit_size_goal, sysctl_tcp_limit_output_bytes >> 1); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c834d83d2bec..b2263d8f94d9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -705,6 +705,34 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) } } +/* Set the sk_pacing_rate to allow proper sizing of TSO packets. + * Note: TCP stack does not yet implement pacing. + * FQ packet scheduler can be used to implement cheap but effective + * TCP pacing, to smooth the burst on large writes when packets + * in flight is significantly lower than cwnd (or rwin) + */ +static void tcp_update_pacing_rate(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + u64 rate; + + /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */ + rate = (u64)tp->mss_cache * 2 * (HZ << 3); + + rate *= max(tp->snd_cwnd, tp->packets_out); + + /* Correction for small srtt : minimum srtt being 8 (1 jiffy << 3), + * be conservative and assume srtt = 1 (125 us instead of 1.25 ms) + * We probably need usec resolution in the future. + * Note: This also takes care of possible srtt=0 case, + * when tcp_rtt_estimator() was not yet called. + */ + if (tp->srtt > 8 + 2) + do_div(rate, tp->srtt); + + sk->sk_pacing_rate = min_t(u64, rate, ~0U); +} + /* Calculate rto without backoff. This is the second half of Van Jacobson's * routine referred to above. */ @@ -3605,7 +3633,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; bool is_dupack = false; - u32 prior_in_flight; + u32 prior_in_flight, prior_cwnd = tp->snd_cwnd, prior_rtt = tp->srtt; u32 prior_fackets; int prior_packets = tp->packets_out; int prior_sacked = tp->sacked_out; @@ -3723,6 +3751,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (dst) dst_confirm(dst); } + + if (tp->srtt != prior_rtt || tp->snd_cwnd != prior_cwnd) + tcp_update_pacing_rate(sk); return 1; no_queue: @@ -6063,6 +6094,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, } else tcp_init_metrics(sk); + tcp_update_pacing_rate(sk); + /* Prevent spurious tcp_cwnd_restart() on * first data packet. */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 436d8fb104df..9d2de6252495 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1784,7 +1784,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) /* If a full-sized TSO skb can be sent, do it. */ if (limit >= min_t(unsigned int, sk->sk_gso_max_size, - sk->sk_gso_max_segs * tp->mss_cache)) + tp->xmit_size_goal_segs * tp->mss_cache)) goto send_now; /* Middle in queue won't get any more data, full sendable already? */ From a0eec283906b8602391fe304481f4fc6c864d4b1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Sep 2013 03:28:54 -0700 Subject: [PATCH 07/94] tcp: TSQ can use a dynamic limit [ Upstream commit c9eeec26e32e087359160406f96e0949b3cc6f10 ] When TCP Small Queues was added, we used a sysctl to limit amount of packets queues on Qdisc/device queues for a given TCP flow. Problem is this limit is either too big for low rates, or too small for high rates. Now TCP stack has rate estimation in sk->sk_pacing_rate, and TSO auto sizing, it can better control number of packets in Qdisc/device queues. New limit is two packets or at least 1 to 2 ms worth of packets. Low rates flows benefit from this patch by having even smaller number of packets in queues, allowing for faster recovery, better RTT estimations. High rates flows benefit from this patch by allowing more than 2 packets in flight as we had reports this was a limiting factor to reach line rate. [ In particular if TX completion is delayed because of coalescing parameters ] Example for a single flow on 10Gbp link controlled by FQ/pacing 14 packets in flight instead of 2 $ tc -s -d qd qdisc fq 8001: dev eth0 root refcnt 32 limit 10000p flow_limit 100p buckets 1024 quantum 3028 initial_quantum 15140 Sent 1168459366606 bytes 771822841 pkt (dropped 0, overlimits 0 requeues 6822476) rate 9346Mbit 771713pps backlog 953820b 14p requeues 6822476 2047 flow, 2046 inactive, 1 throttled, delay 15673 ns 2372 gc, 0 highprio, 0 retrans, 9739249 throttled, 0 flows_plimit Note that sk_pacing_rate is currently set to twice the actual rate, but this might be refined in the future when a flow is in congestion avoidance. Additional change : skb->destructor should be set to tcp_wfree(). A future patch (for linux 3.13+) might remove tcp_limit_output_bytes Signed-off-by: Eric Dumazet Cc: Wei Liu Cc: Cong Wang Cc: Yuchung Cheng Cc: Neal Cardwell Acked-by: Neal Cardwell Signed-off-by: David S. Miller Signed-off-by: Kamal Mostafa --- net/ipv4/tcp_output.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 9d2de6252495..bef538f39c5d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1045,8 +1045,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, skb_orphan(skb); skb->sk = sk; - skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ? - tcp_wfree : sock_wfree; + skb->destructor = tcp_wfree; atomic_add(skb->truesize, &sk->sk_wmem_alloc); /* Build TCP header and checksum it. */ @@ -1990,7 +1989,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, while ((skb = tcp_send_head(sk))) { unsigned int limit; - tso_segs = tcp_init_tso_segs(sk, skb, mss_now); BUG_ON(!tso_segs); @@ -2014,13 +2012,20 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, break; } - /* TSQ : sk_wmem_alloc accounts skb truesize, - * including skb overhead. But thats OK. + /* TCP Small Queues : + * Control number of packets in qdisc/devices to two packets / or ~1 ms. + * This allows for : + * - better RTT estimation and ACK scheduling + * - faster recovery + * - high rates */ - if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) { + limit = max(skb->truesize, sk->sk_pacing_rate >> 10); + + if (atomic_read(&sk->sk_wmem_alloc) > limit) { set_bit(TSQ_THROTTLED, &tp->tsq_flags); break; } + limit = mss_now; if (tso_segs > 1 && !tcp_urg_mode(tp)) limit = tcp_mss_split_point(sk, skb, mss_now, From dd3b9cd06513c168c19d50b44944086d0f79ba6e Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Thu, 14 Feb 2013 09:44:49 +0000 Subject: [PATCH 08/94] net: Add skb_unclone() helper function. commit 14bbd6a565e1bcdc240d44687edb93f721cfdf99 upstream. This function will be used in next GRE_GSO patch. This patch does not change any functionality. Signed-off-by: Pravin B Shelar Acked-by: Eric Dumazet [ kamal: 3.8-stable prereq for c52e2421f7368fd36cbe330d2cf41b10452e39a9 "tcp: must unclone packets before mangling them" ] Signed-off-by: Kamal Mostafa --- drivers/net/ppp/ppp_generic.c | 3 +-- include/linux/skbuff.h | 10 ++++++++++ net/ipv4/ah4.c | 3 +-- net/ipv4/ip_fragment.c | 2 +- net/ipv4/tcp_output.c | 2 +- net/ipv4/xfrm4_input.c | 2 +- net/ipv4/xfrm4_mode_tunnel.c | 3 +-- net/ipv6/ah6.c | 3 +-- net/ipv6/netfilter/nf_conntrack_reasm.c | 2 +- net/ipv6/reassembly.c | 2 +- net/ipv6/xfrm6_mode_tunnel.c | 3 +-- net/sched/act_ipt.c | 6 ++---- net/sched/act_pedit.c | 3 +-- 13 files changed, 23 insertions(+), 21 deletions(-) diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index 508570e6fc28..3db9131e9229 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -1813,8 +1813,7 @@ ppp_receive_nonmp_frame(struct ppp *ppp, struct sk_buff *skb) /* the filter instructions are constructed assuming a four-byte PPP header on each packet */ if (ppp->pass_filter || ppp->active_filter) { - if (skb_cloned(skb) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_unclone(skb, GFP_ATOMIC)) goto err; *skb_push(skb, 2) = 0; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9fe54b674e13..24f93a104033 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -797,6 +797,16 @@ static inline int skb_cloned(const struct sk_buff *skb) (atomic_read(&skb_shinfo(skb)->dataref) & SKB_DATAREF_MASK) != 1; } +static inline int skb_unclone(struct sk_buff *skb, gfp_t pri) +{ + might_sleep_if(pri & __GFP_WAIT); + + if (skb_cloned(skb)) + return pskb_expand_head(skb, 0, 0, pri); + + return 0; +} + /** * skb_header_cloned - is the header a clone * @skb: buffer to check diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index a69b4e4a02b5..2e7f1948216f 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -321,8 +321,7 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) /* We are going to _remove_ AH header to keep sockets happy, * so... Later this can change. */ - if (skb_cloned(skb) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_unclone(skb, GFP_ATOMIC)) goto out; skb->ip_summed = CHECKSUM_NONE; diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 0fcfee37227e..24f20a0573c4 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -598,7 +598,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, goto out_oversize; /* Head of list must not be cloned. */ - if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC)) + if (skb_unclone(head, GFP_ATOMIC)) goto out_nomem; /* If the first fragment is fragmented itself, we split diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index bef538f39c5d..acda7284112f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1331,7 +1331,7 @@ static void __pskb_trim_head(struct sk_buff *skb, int len) /* Remove acked data from a packet in the transmit queue. */ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) { - if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_unclone(skb, GFP_ATOMIC)) return -ENOMEM; __pskb_trim_head(skb, len); diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 06814b6216dc..1f12c8b45864 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -132,7 +132,7 @@ int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) * header and optional ESP marker bytes) and then modify the * protocol to ESP, and then call into the transform receiver. */ - if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_unclone(skb, GFP_ATOMIC)) goto drop; /* Now we can update and verify the packet length... */ diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index 91ab23b9bf82..57dfe2b496f3 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -142,8 +142,7 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) for_each_input_rcu(rcv_notify_handlers, handler) handler->handler(skb); - if (skb_cloned(skb) && - (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + if (err = skb_unclone(skb, GFP_ATOMIC)) goto out; if (x->props.flags & XFRM_STATE_DECAP_DSCP) diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 384233188ac1..bb02e176cb70 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -521,8 +521,7 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) /* We are going to _remove_ AH header to keep sockets happy, * so... Later this can change. */ - if (skb_cloned(skb) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_unclone(skb, GFP_ATOMIC)) goto out; skb->ip_summed = CHECKSUM_NONE; diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 2f3a018f8895..d05b6eea4977 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -369,7 +369,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev) } /* Head of list must not be cloned. */ - if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC)) { + if (skb_unclone(head, GFP_ATOMIC)) { pr_debug("skb is cloned but can't expand head"); goto out_oom; } diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 1aa1434d5e37..6090a11d0e67 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -419,7 +419,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, goto out_oversize; /* Head of list must not be cloned. */ - if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC)) + if (skb_unclone(head, GFP_ATOMIC)) goto out_oom; /* If the first fragment is fragmented itself, we split diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c index 9f2095b19ad0..93c41a81c4c3 100644 --- a/net/ipv6/xfrm6_mode_tunnel.c +++ b/net/ipv6/xfrm6_mode_tunnel.c @@ -69,8 +69,7 @@ static int xfrm6_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto out; - if (skb_cloned(skb) && - (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + if (err = skb_unclone(skb, GFP_ATOMIC)) goto out; if (x->props.flags & XFRM_STATE_DECAP_DSCP) diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index c1a81389b1b8..ae648700cade 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -207,10 +207,8 @@ static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a, struct tcf_ipt *ipt = a->priv; struct xt_action_param par; - if (skb_cloned(skb)) { - if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) - return TC_ACT_UNSPEC; - } + if (skb_unclone(skb, GFP_ATOMIC)) + return TC_ACT_UNSPEC; spin_lock(&ipt->tcf_lock); diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 45c53ab067a6..fe4bb5b43d95 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -130,8 +130,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, int i, munged = 0; unsigned int off; - if (skb_cloned(skb) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_unclone(skb, GFP_ATOMIC)) return p->tcf_action; off = skb_network_offset(skb); From c51871e460872cf8e98ff5e184df116b0322cb7e Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Sat, 16 Feb 2013 07:38:15 +0000 Subject: [PATCH 09/94] ipv6: fix warning in xfrm6_mode_tunnel_input commit 0d4bfa297c3f7efb71367449ee44f9d3fb0f5871 upstream. Should not use assignment in conditional: warning: suggest parentheses around assignment used as truth value [-Wparentheses] Problem introduced by: commit 14bbd6a565e1bcdc240d44687edb93f721cfdf99 Author: Pravin B Shelar Date: Thu Feb 14 09:44:49 2013 +0000 net: Add skb_unclone() helper function. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller Signed-off-by: Kamal Mostafa --- net/ipv6/xfrm6_mode_tunnel.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c index 93c41a81c4c3..9bf6a74a71d2 100644 --- a/net/ipv6/xfrm6_mode_tunnel.c +++ b/net/ipv6/xfrm6_mode_tunnel.c @@ -69,7 +69,8 @@ static int xfrm6_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto out; - if (err = skb_unclone(skb, GFP_ATOMIC)) + err = skb_unclone(skb, GFP_ATOMIC); + if (err) goto out; if (x->props.flags & XFRM_STATE_DECAP_DSCP) From 8955445d9d0042f6ddf45af21a9704f809d2bccf Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Sat, 16 Feb 2013 07:40:29 +0000 Subject: [PATCH 10/94] ip: fix warning in xfrm4_mode_tunnel_input commit 9aac22deb11a3da4df5b868fc3d30b07185b0d71 upstream. Same problem as IPv6 Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller Signed-off-by: Kamal Mostafa --- net/ipv4/xfrm4_mode_tunnel.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index 57dfe2b496f3..175e8b139bb8 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -142,7 +142,8 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) for_each_input_rcu(rcv_notify_handlers, handler) handler->handler(skb); - if (err = skb_unclone(skb, GFP_ATOMIC)) + err = skb_unclone(skb, GFP_ATOMIC); + if (err) goto out; if (x->props.flags & XFRM_STATE_DECAP_DSCP) From f07d939b766e4b76208a97ce3a50d953dbecc72e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Oct 2013 11:54:30 -0700 Subject: [PATCH 11/94] tcp: must unclone packets before mangling them [ Upstream commit c52e2421f7368fd36cbe330d2cf41b10452e39a9 ] TCP stack should make sure it owns skbs before mangling them. We had various crashes using bnx2x, and it turned out gso_size was cleared right before bnx2x driver was populating TC descriptor of the _previous_ packet send. TCP stack can sometime retransmit packets that are still in Qdisc. Of course we could make bnx2x driver more robust (using ACCESS_ONCE(shinfo->gso_size) for example), but the bug is TCP stack. We have identified two points where skb_unclone() was needed. This patch adds a WARN_ON_ONCE() to warn us if we missed another fix of this kind. Kudos to Neal for finding the root cause of this bug. Its visible using small MSS. Signed-off-by: Eric Dumazet Signed-off-by: Neal Cardwell Cc: Yuchung Cheng Signed-off-by: David S. Miller Signed-off-by: Kamal Mostafa --- net/ipv4/tcp_output.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index acda7284112f..0eaef9ae3905 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1134,6 +1134,9 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, unsigned int mss_now) { + /* Make sure we own this skb before messing gso_size/gso_segs */ + WARN_ON_ONCE(skb_cloned(skb)); + if (skb->len <= mss_now || !sk_can_gso(sk) || skb->ip_summed == CHECKSUM_NONE) { /* Avoid the costly divide in the normal @@ -1215,9 +1218,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, if (nsize < 0) nsize = 0; - if (skb_cloned(skb) && - skb_is_nonlinear(skb) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_unclone(skb, GFP_ATOMIC)) return -ENOMEM; /* Get a new skb... force flag on. */ @@ -2368,6 +2369,8 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) int oldpcount = tcp_skb_pcount(skb); if (unlikely(oldpcount > 1)) { + if (skb_unclone(skb, GFP_ATOMIC)) + return -ENOMEM; tcp_init_tso_segs(sk, skb, cur_mss); tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb)); } From 31fea680a242540448ccdaa2fb19152870fbfd09 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 4 Oct 2013 10:31:41 -0700 Subject: [PATCH 12/94] tcp: do not forget FIN in tcp_shifted_skb() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 5e8a402f831dbe7ee831340a91439e46f0d38acd ] Yuchung found following problem : There are bugs in the SACK processing code, merging part in tcp_shift_skb_data(), that incorrectly resets or ignores the sacked skbs FIN flag. When a receiver first SACK the FIN sequence, and later throw away ofo queue (e.g., sack-reneging), the sender will stop retransmitting the FIN flag, and hangs forever. Following packetdrill test can be used to reproduce the bug. $ cat sack-merge-bug.pkt `sysctl -q net.ipv4.tcp_fack=0` // Establish a connection and send 10 MSS. 0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 +.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 +.000 bind(3, ..., ...) = 0 +.000 listen(3, 1) = 0 +.050 < S 0:0(0) win 32792 +.000 > S. 0:0(0) ack 1 +.001 < . 1:1(0) ack 1 win 1024 +.000 accept(3, ..., ...) = 4 +.100 write(4, ..., 12000) = 12000 +.000 shutdown(4, SHUT_WR) = 0 +.000 > . 1:10001(10000) ack 1 +.050 < . 1:1(0) ack 2001 win 257 +.000 > FP. 10001:12001(2000) ack 1 +.050 < . 1:1(0) ack 2001 win 257 +.050 < . 1:1(0) ack 2001 win 257 // SACK reneg +.050 < . 1:1(0) ack 12001 win 257 +0 %{ print "unacked: ",tcpi_unacked }% +5 %{ print "" }% First, a typo inverted left/right of one OR operation, then code forgot to advance end_seq if the merged skb carried FIN. Bug was added in 2.6.29 by commit 832d11c5cd076ab ("tcp: Try to restore large SKBs while SACK processing") Signed-off-by: Eric Dumazet Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Cc: Ilpo Järvinen Acked-by: Ilpo Järvinen Signed-off-by: David S. Miller Signed-off-by: Kamal Mostafa --- net/ipv4/tcp_input.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b2263d8f94d9..9bef66b9a1e4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1300,7 +1300,10 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, tp->lost_cnt_hint -= tcp_skb_pcount(prev); } - TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(prev)->tcp_flags; + TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + TCP_SKB_CB(prev)->end_seq++; + if (skb == tcp_highest_sack(sk)) tcp_advance_highest_sack(sk, skb); From 4f207ec1232b995516943115e55859fd308dce54 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 1 Oct 2013 21:04:11 -0700 Subject: [PATCH 13/94] net: do not call sock_put() on TIMEWAIT sockets [ Upstream commit 80ad1d61e72d626e30ebe8529a0455e660ca4693 ] commit 3ab5aee7fe84 ("net: Convert TCP & DCCP hash tables to use RCU / hlist_nulls") incorrectly used sock_put() on TIMEWAIT sockets. We should instead use inet_twsk_put() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Kamal Mostafa --- net/ipv4/inet_hashtables.c | 2 +- net/ipv6/inet6_hashtables.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index fa3ae8148710..341508d5c204 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -274,7 +274,7 @@ begintw: if (unlikely(!INET_TW_MATCH(sk, net, acookie, saddr, daddr, ports, dif))) { - sock_put(sk); + inet_twsk_put(inet_twsk(sk)); goto begintw; } goto out; diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index dea17fd28e50..b9a7bfb40c5e 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -116,7 +116,7 @@ begintw: } if (unlikely(!INET6_TW_MATCH(sk, net, saddr, daddr, ports, dif))) { - sock_put(sk); + inet_twsk_put(inet_twsk(sk)); goto begintw; } goto out; From 9ccc0b07fc1a29e19cd64a9fdd60113ae31d8fbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Cachereul?= Date: Wed, 2 Oct 2013 10:16:02 +0200 Subject: [PATCH 14/94] l2tp: fix kernel panic when using IPv4-mapped IPv6 addresses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit e18503f41f9b12132c95d7c31ca6ee5155e44e5c ] IPv4 mapped addresses cause kernel panic. The patch juste check whether the IPv6 address is an IPv4 mapped address. If so, use IPv4 API instead of IPv6. [ 940.026915] general protection fault: 0000 [#1] [ 940.026915] Modules linked in: l2tp_ppp l2tp_netlink l2tp_core pppox ppp_generic slhc loop psmouse [ 940.026915] CPU: 0 PID: 3184 Comm: memcheck-amd64- Not tainted 3.11.0+ #1 [ 940.026915] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007 [ 940.026915] task: ffff880007130e20 ti: ffff88000737e000 task.ti: ffff88000737e000 [ 940.026915] RIP: 0010:[] [] ip6_xmit+0x276/0x326 [ 940.026915] RSP: 0018:ffff88000737fd28 EFLAGS: 00010286 [ 940.026915] RAX: c748521a75ceff48 RBX: ffff880000c30800 RCX: 0000000000000000 [ 940.026915] RDX: ffff88000075cc4e RSI: 0000000000000028 RDI: ffff8800060e5a40 [ 940.026915] RBP: ffff8800060e5a40 R08: 0000000000000000 R09: ffff88000075cc90 [ 940.026915] R10: 0000000000000000 R11: 0000000000000000 R12: ffff88000737fda0 [ 940.026915] R13: 0000000000000000 R14: 0000000000002000 R15: ffff880005d3b580 [ 940.026915] FS: 00007f163dc5e800(0000) GS:ffffffff81623000(0000) knlGS:0000000000000000 [ 940.026915] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 940.026915] CR2: 00000004032dc940 CR3: 0000000005c25000 CR4: 00000000000006f0 [ 940.026915] Stack: [ 940.026915] ffff88000075cc4e ffffffff81694e90 ffff880000c30b38 0000000000000020 [ 940.026915] 11000000523c4bac ffff88000737fdb4 0000000000000000 ffff880000c30800 [ 940.026915] ffff880005d3b580 ffff880000c30b38 ffff8800060e5a40 0000000000000020 [ 940.026915] Call Trace: [ 940.026915] [] ? inet6_csk_xmit+0xa4/0xc4 [ 940.026915] [] ? l2tp_xmit_skb+0x503/0x55a [l2tp_core] [ 940.026915] [] ? pskb_expand_head+0x161/0x214 [ 940.026915] [] ? pppol2tp_xmit+0xf2/0x143 [l2tp_ppp] [ 940.026915] [] ? ppp_channel_push+0x36/0x8b [ppp_generic] [ 940.026915] [] ? ppp_write+0xaf/0xc5 [ppp_generic] [ 940.026915] [] ? vfs_write+0xa2/0x106 [ 940.026915] [] ? SyS_write+0x56/0x8a [ 940.026915] [] ? system_call_fastpath+0x16/0x1b [ 940.026915] Code: 00 49 8b 8f d8 00 00 00 66 83 7c 11 02 00 74 60 49 8b 47 58 48 83 e0 fe 48 8b 80 18 01 00 00 48 85 c0 74 13 48 8b 80 78 02 00 00 <48> ff 40 28 41 8b 57 68 48 01 50 30 48 8b 54 24 08 49 c7 c1 51 [ 940.026915] RIP [] ip6_xmit+0x276/0x326 [ 940.026915] RSP [ 940.057945] ---[ end trace be8aba9a61c8b7f3 ]--- [ 940.058583] Kernel panic - not syncing: Fatal exception in interrupt Signed-off-by: François CACHEREUL Signed-off-by: David S. Miller Signed-off-by: Kamal Mostafa --- net/l2tp/l2tp_core.c | 27 +++++++++++++++++++++++---- net/l2tp/l2tp_core.h | 3 +++ 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 2ac884d0e89b..8861e9fbeb0b 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -517,6 +517,7 @@ out: static inline int l2tp_verify_udp_checksum(struct sock *sk, struct sk_buff *skb) { + struct l2tp_tunnel *tunnel = (struct l2tp_tunnel *)sk->sk_user_data; struct udphdr *uh = udp_hdr(skb); u16 ulen = ntohs(uh->len); __wsum psum; @@ -525,7 +526,7 @@ static inline int l2tp_verify_udp_checksum(struct sock *sk, return 0; #if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == PF_INET6) { + if (sk->sk_family == PF_INET6 && !tunnel->v4mapped) { if (!uh->check) { LIMIT_NETDEBUG(KERN_INFO "L2TP: IPv6: checksum is 0\n"); return 1; @@ -1088,7 +1089,7 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, /* Queue the packet to IP for output */ skb->local_df = 1; #if IS_ENABLED(CONFIG_IPV6) - if (skb->sk->sk_family == PF_INET6) + if (skb->sk->sk_family == PF_INET6 && !tunnel->v4mapped) error = inet6_csk_xmit(skb, NULL); else #endif @@ -1221,7 +1222,7 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len /* Calculate UDP checksum if configured to do so */ #if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == PF_INET6) + if (sk->sk_family == PF_INET6 && !tunnel->v4mapped) l2tp_xmit_ipv6_csum(sk, skb, udp_len); else #endif @@ -1624,6 +1625,24 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 if (cfg != NULL) tunnel->debug = cfg->debug; +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == PF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + if (ipv6_addr_v4mapped(&np->saddr) && + ipv6_addr_v4mapped(&np->daddr)) { + struct inet_sock *inet = inet_sk(sk); + + tunnel->v4mapped = true; + inet->inet_saddr = np->saddr.s6_addr32[3]; + inet->inet_rcv_saddr = np->rcv_saddr.s6_addr32[3]; + inet->inet_daddr = np->daddr.s6_addr32[3]; + } else { + tunnel->v4mapped = false; + } + } +#endif + /* Mark socket as an encapsulation socket. See net/ipv4/udp.c */ tunnel->encap = encap; if (encap == L2TP_ENCAPTYPE_UDP) { @@ -1631,7 +1650,7 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 udp_sk(sk)->encap_type = UDP_ENCAP_L2TPINUDP; udp_sk(sk)->encap_rcv = l2tp_udp_encap_recv; #if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == PF_INET6) + if (sk->sk_family == PF_INET6 && !tunnel->v4mapped) udpv6_encap_enable(); else #endif diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index e62204cad4fe..8d6a0486c3a8 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -190,6 +190,9 @@ struct l2tp_tunnel { struct sock *sock; /* Parent socket */ int fd; /* Parent fd, if tunnel socket * was created by userspace */ +#if IS_ENABLED(CONFIG_IPV6) + bool v4mapped; +#endif uint8_t priv[0]; /* private data */ }; From 81811da0320e829c9c5f3c46f54acac3a1373bbf Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 8 Oct 2013 15:44:26 -0400 Subject: [PATCH 15/94] l2tp: Fix build warning with ipv6 disabled. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 8d8a51e26a6d415e1470759f2cf5f3ee3ee86196 ] net/l2tp/l2tp_core.c: In function ‘l2tp_verify_udp_checksum’: net/l2tp/l2tp_core.c:499:22: warning: unused variable ‘tunnel’ [-Wunused-variable] Create a helper "l2tp_tunnel()" to facilitate this, and as a side effect get rid of a bunch of unnecessary void pointer casts. Signed-off-by: David S. Miller Signed-off-by: Kamal Mostafa --- net/l2tp/l2tp_core.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 8861e9fbeb0b..02a922ceeaef 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -115,6 +115,11 @@ static void l2tp_session_set_header_len(struct l2tp_session *session, int versio static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel); static void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel); +static inline struct l2tp_tunnel *l2tp_tunnel(struct sock *sk) +{ + return sk->sk_user_data; +} + static inline struct l2tp_net *l2tp_pernet(struct net *net) { BUG_ON(!net); @@ -517,7 +522,6 @@ out: static inline int l2tp_verify_udp_checksum(struct sock *sk, struct sk_buff *skb) { - struct l2tp_tunnel *tunnel = (struct l2tp_tunnel *)sk->sk_user_data; struct udphdr *uh = udp_hdr(skb); u16 ulen = ntohs(uh->len); __wsum psum; @@ -526,7 +530,7 @@ static inline int l2tp_verify_udp_checksum(struct sock *sk, return 0; #if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == PF_INET6 && !tunnel->v4mapped) { + if (sk->sk_family == PF_INET6 && !l2tp_tunnel(sk)->v4mapped) { if (!uh->check) { LIMIT_NETDEBUG(KERN_INFO "L2TP: IPv6: checksum is 0\n"); return 1; @@ -1271,9 +1275,8 @@ EXPORT_SYMBOL_GPL(l2tp_xmit_skb); */ static void l2tp_tunnel_destruct(struct sock *sk) { - struct l2tp_tunnel *tunnel; + struct l2tp_tunnel *tunnel = l2tp_tunnel(sk); - tunnel = sk->sk_user_data; if (tunnel == NULL) goto end; @@ -1596,7 +1599,7 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 } /* Check if this socket has already been prepped */ - tunnel = (struct l2tp_tunnel *)sk->sk_user_data; + tunnel = l2tp_tunnel(sk); if (tunnel != NULL) { /* This socket has already been prepped */ err = -EBUSY; From 03b9cf62f19acae9f497fd584c9a8a19928b7e66 Mon Sep 17 00:00:00 2001 From: Sebastian Hesselbarth Date: Wed, 2 Oct 2013 12:57:20 +0200 Subject: [PATCH 16/94] net: mv643xx_eth: update statistics timer from timer context only [ Upstream commit 041b4ddb84989f06ff1df0ca869b950f1ee3cb1c ] Each port driver installs a periodic timer to update port statistics by calling mib_counters_update. As mib_counters_update is also called from non-timer context, we should not reschedule the timer there but rather move it to timer-only context. Signed-off-by: Sebastian Hesselbarth Acked-by: Jason Cooper Signed-off-by: David S. Miller Signed-off-by: Kamal Mostafa --- drivers/net/ethernet/marvell/mv643xx_eth.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c index 84c13263c514..6d89717a2179 100644 --- a/drivers/net/ethernet/marvell/mv643xx_eth.c +++ b/drivers/net/ethernet/marvell/mv643xx_eth.c @@ -1273,15 +1273,13 @@ static void mib_counters_update(struct mv643xx_eth_private *mp) p->rx_discard += rdlp(mp, RX_DISCARD_FRAME_CNT); p->rx_overrun += rdlp(mp, RX_OVERRUN_FRAME_CNT); spin_unlock_bh(&mp->mib_counters_lock); - - mod_timer(&mp->mib_counters_timer, jiffies + 30 * HZ); } static void mib_counters_timer_wrapper(unsigned long _mp) { struct mv643xx_eth_private *mp = (void *)_mp; - mib_counters_update(mp); + mod_timer(&mp->mib_counters_timer, jiffies + 30 * HZ); } From 1b0bd3cb6b171711649fea8db995db8a6616b6dc Mon Sep 17 00:00:00 2001 From: Sebastian Hesselbarth Date: Wed, 2 Oct 2013 12:57:21 +0200 Subject: [PATCH 17/94] net: mv643xx_eth: fix orphaned statistics timer crash [ Upstream commit f564412c935111c583b787bcc18157377b208e2e ] The periodic statistics timer gets started at port _probe() time, but is stopped on _stop() only. In a modular environment, this can cause the timer to access already deallocated memory, if the module is unloaded without starting the eth device. To fix this, we add the timer right before the port is started, instead of at _probe() time. Signed-off-by: Sebastian Hesselbarth Acked-by: Jason Cooper Signed-off-by: David S. Miller Signed-off-by: Kamal Mostafa --- drivers/net/ethernet/marvell/mv643xx_eth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c index 6d89717a2179..0733685d677e 100644 --- a/drivers/net/ethernet/marvell/mv643xx_eth.c +++ b/drivers/net/ethernet/marvell/mv643xx_eth.c @@ -2366,6 +2366,7 @@ static int mv643xx_eth_open(struct net_device *dev) mp->int_mask |= INT_TX_END_0 << i; } + add_timer(&mp->mib_counters_timer); port_start(mp); wrlp(mp, INT_MASK_EXT, INT_EXT_LINK_PHY | INT_EXT_TX); @@ -2913,7 +2914,6 @@ static int mv643xx_eth_probe(struct platform_device *pdev) mp->mib_counters_timer.data = (unsigned long)mp; mp->mib_counters_timer.function = mib_counters_timer_wrapper; mp->mib_counters_timer.expires = jiffies + 30 * HZ; - add_timer(&mp->mib_counters_timer); spin_lock_init(&mp->mib_counters_lock); From b9b0ef8ea02978d6cebdd8ef4ab14aeacc807606 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 3 Oct 2013 00:27:20 +0300 Subject: [PATCH 18/94] net: heap overflow in __audit_sockaddr() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 1661bf364ae9c506bc8795fef70d1532931be1e8 ] We need to cap ->msg_namelen or it leads to a buffer overflow when we to the memcpy() in __audit_sockaddr(). It requires CAP_AUDIT_CONTROL to exploit this bug. The call tree is: ___sys_recvmsg() move_addr_to_user() audit_sockaddr() __audit_sockaddr() Reported-by: Jüri Aedla Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller Signed-off-by: Kamal Mostafa --- net/compat.c | 2 ++ net/socket.c | 24 ++++++++++++++++++++---- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/net/compat.c b/net/compat.c index f0a1ba6c8086..89032580bd1d 100644 --- a/net/compat.c +++ b/net/compat.c @@ -71,6 +71,8 @@ int get_compat_msghdr(struct msghdr *kmsg, struct compat_msghdr __user *umsg) __get_user(kmsg->msg_controllen, &umsg->msg_controllen) || __get_user(kmsg->msg_flags, &umsg->msg_flags)) return -EFAULT; + if (kmsg->msg_namelen > sizeof(struct sockaddr_storage)) + return -EINVAL; kmsg->msg_name = compat_ptr(tmp1); kmsg->msg_iov = compat_ptr(tmp2); kmsg->msg_control = compat_ptr(tmp3); diff --git a/net/socket.c b/net/socket.c index a61db0681f0a..809e94116b74 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1980,6 +1980,16 @@ struct used_address { unsigned int name_len; }; +static int copy_msghdr_from_user(struct msghdr *kmsg, + struct msghdr __user *umsg) +{ + if (copy_from_user(kmsg, umsg, sizeof(struct msghdr))) + return -EFAULT; + if (kmsg->msg_namelen > sizeof(struct sockaddr_storage)) + return -EINVAL; + return 0; +} + static int ___sys_sendmsg(struct socket *sock, struct msghdr __user *msg, struct msghdr *msg_sys, unsigned int flags, struct used_address *used_address) @@ -1998,8 +2008,11 @@ static int ___sys_sendmsg(struct socket *sock, struct msghdr __user *msg, if (MSG_CMSG_COMPAT & flags) { if (get_compat_msghdr(msg_sys, msg_compat)) return -EFAULT; - } else if (copy_from_user(msg_sys, msg, sizeof(struct msghdr))) - return -EFAULT; + } else { + err = copy_msghdr_from_user(msg_sys, msg); + if (err) + return err; + } if (msg_sys->msg_iovlen > UIO_FASTIOV) { err = -EMSGSIZE; @@ -2207,8 +2220,11 @@ static int ___sys_recvmsg(struct socket *sock, struct msghdr __user *msg, if (MSG_CMSG_COMPAT & flags) { if (get_compat_msghdr(msg_sys, msg_compat)) return -EFAULT; - } else if (copy_from_user(msg_sys, msg, sizeof(struct msghdr))) - return -EFAULT; + } else { + err = copy_msghdr_from_user(msg_sys, msg); + if (err) + return err; + } if (msg_sys->msg_iovlen > UIO_FASTIOV) { err = -EMSGSIZE; From 924ff65c3b39f902bd183577008073c0e8055ae7 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Mon, 30 Sep 2013 22:03:06 +0200 Subject: [PATCH 19/94] proc connector: fix info leaks [ Upstream commit e727ca82e0e9616ab4844301e6bae60ca7327682 ] Initialize event_data for all possible message types to prevent leaking kernel stack contents to userland (up to 20 bytes). Also set the flags member of the connector message to 0 to prevent leaking two more stack bytes this way. Signed-off-by: Mathias Krause Signed-off-by: David S. Miller [ kamal: backport to 3.8 (context) ] Signed-off-by: Kamal Mostafa --- drivers/connector/cn_proc.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c index 1110478dd0fd..1b1d255f0e15 100644 --- a/drivers/connector/cn_proc.c +++ b/drivers/connector/cn_proc.c @@ -65,6 +65,7 @@ void proc_fork_connector(struct task_struct *task) msg = (struct cn_msg *)buffer; ev = (struct proc_event *)msg->data; + memset(&ev->event_data, 0, sizeof(ev->event_data)); get_seq(&msg->seq, &ev->cpu); ktime_get_ts(&ts); /* get high res monotonic timestamp */ put_unaligned(timespec_to_ns(&ts), (__u64 *)&ev->timestamp_ns); @@ -80,6 +81,7 @@ void proc_fork_connector(struct task_struct *task) memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = 0; /* not used */ msg->len = sizeof(*ev); + msg->flags = 0; /* not used */ /* If cn_netlink_send() failed, the data is not sent */ cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL); } @@ -96,6 +98,7 @@ void proc_exec_connector(struct task_struct *task) msg = (struct cn_msg *)buffer; ev = (struct proc_event *)msg->data; + memset(&ev->event_data, 0, sizeof(ev->event_data)); get_seq(&msg->seq, &ev->cpu); ktime_get_ts(&ts); /* get high res monotonic timestamp */ put_unaligned(timespec_to_ns(&ts), (__u64 *)&ev->timestamp_ns); @@ -106,6 +109,7 @@ void proc_exec_connector(struct task_struct *task) memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = 0; /* not used */ msg->len = sizeof(*ev); + msg->flags = 0; /* not used */ cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL); } @@ -122,6 +126,7 @@ void proc_id_connector(struct task_struct *task, int which_id) msg = (struct cn_msg *)buffer; ev = (struct proc_event *)msg->data; + memset(&ev->event_data, 0, sizeof(ev->event_data)); ev->what = which_id; ev->event_data.id.process_pid = task->pid; ev->event_data.id.process_tgid = task->tgid; @@ -145,6 +150,7 @@ void proc_id_connector(struct task_struct *task, int which_id) memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = 0; /* not used */ msg->len = sizeof(*ev); + msg->flags = 0; /* not used */ cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL); } @@ -160,6 +166,7 @@ void proc_sid_connector(struct task_struct *task) msg = (struct cn_msg *)buffer; ev = (struct proc_event *)msg->data; + memset(&ev->event_data, 0, sizeof(ev->event_data)); get_seq(&msg->seq, &ev->cpu); ktime_get_ts(&ts); /* get high res monotonic timestamp */ put_unaligned(timespec_to_ns(&ts), (__u64 *)&ev->timestamp_ns); @@ -170,6 +177,7 @@ void proc_sid_connector(struct task_struct *task) memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = 0; /* not used */ msg->len = sizeof(*ev); + msg->flags = 0; /* not used */ cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL); } @@ -185,6 +193,7 @@ void proc_ptrace_connector(struct task_struct *task, int ptrace_id) msg = (struct cn_msg *)buffer; ev = (struct proc_event *)msg->data; + memset(&ev->event_data, 0, sizeof(ev->event_data)); get_seq(&msg->seq, &ev->cpu); ktime_get_ts(&ts); /* get high res monotonic timestamp */ put_unaligned(timespec_to_ns(&ts), (__u64 *)&ev->timestamp_ns); @@ -203,6 +212,7 @@ void proc_ptrace_connector(struct task_struct *task, int ptrace_id) memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = 0; /* not used */ msg->len = sizeof(*ev); + msg->flags = 0; /* not used */ cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL); } @@ -218,6 +228,7 @@ void proc_comm_connector(struct task_struct *task) msg = (struct cn_msg *)buffer; ev = (struct proc_event *)msg->data; + memset(&ev->event_data, 0, sizeof(ev->event_data)); get_seq(&msg->seq, &ev->cpu); ktime_get_ts(&ts); /* get high res monotonic timestamp */ put_unaligned(timespec_to_ns(&ts), (__u64 *)&ev->timestamp_ns); @@ -229,6 +240,7 @@ void proc_comm_connector(struct task_struct *task) memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = 0; /* not used */ msg->len = sizeof(*ev); + msg->flags = 0; /* not used */ cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL); } @@ -244,6 +256,7 @@ void proc_exit_connector(struct task_struct *task) msg = (struct cn_msg *)buffer; ev = (struct proc_event *)msg->data; + memset(&ev->event_data, 0, sizeof(ev->event_data)); get_seq(&msg->seq, &ev->cpu); ktime_get_ts(&ts); /* get high res monotonic timestamp */ put_unaligned(timespec_to_ns(&ts), (__u64 *)&ev->timestamp_ns); @@ -256,6 +269,8 @@ void proc_exit_connector(struct task_struct *task) memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = 0; /* not used */ msg->len = sizeof(*ev); + msg->flags = 0; /* not used */ + msg->flags = 0; /* not used */ cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL); } @@ -279,6 +294,7 @@ static void cn_proc_ack(int err, int rcvd_seq, int rcvd_ack) msg = (struct cn_msg *)buffer; ev = (struct proc_event *)msg->data; + memset(&ev->event_data, 0, sizeof(ev->event_data)); msg->seq = rcvd_seq; ktime_get_ts(&ts); /* get high res monotonic timestamp */ put_unaligned(timespec_to_ns(&ts), (__u64 *)&ev->timestamp_ns); @@ -288,6 +304,7 @@ static void cn_proc_ack(int err, int rcvd_seq, int rcvd_ack) memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = rcvd_ack + 1; msg->len = sizeof(*ev); + msg->flags = 0; /* not used */ cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL); } From 8bf2f23fd55cb7a83817a97f7678a0a0e6804c35 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Fri, 4 Oct 2013 17:04:48 +0200 Subject: [PATCH 20/94] ipv4: fix ineffective source address selection [ Upstream commit 0a7e22609067ff524fc7bbd45c6951dd08561667 ] When sending out multicast messages, the source address in inet->mc_addr is ignored and rewritten by an autoselected one. This is caused by a typo in commit 813b3b5db831 ("ipv4: Use caller's on-stack flowi as-is in output route lookups"). Signed-off-by: Jiri Benc Acked-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Kamal Mostafa --- net/ipv4/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 70da49448f8f..0bf88e431460 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2024,7 +2024,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) RT_SCOPE_LINK); goto make_route; } - if (fl4->saddr) { + if (!fl4->saddr) { if (ipv4_is_multicast(fl4->daddr)) fl4->saddr = inet_select_addr(dev_out, 0, fl4->flowi4_scope); From 251feee819cb0c623a7bf513a78d1c5997b8967f Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Sat, 5 Oct 2013 21:25:17 +0200 Subject: [PATCH 21/94] can: dev: fix nlmsg size calculation in can_get_size() [ Upstream commit fe119a05f8ca481623a8d02efcc984332e612528 ] This patch fixes the calculation of the nlmsg size, by adding the missing nla_total_size(). Signed-off-by: Marc Kleine-Budde Signed-off-by: David S. Miller Signed-off-by: Kamal Mostafa --- drivers/net/can/dev.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c index 8233e5ed2939..6afbe462710b 100644 --- a/drivers/net/can/dev.c +++ b/drivers/net/can/dev.c @@ -698,14 +698,14 @@ static size_t can_get_size(const struct net_device *dev) size_t size; size = nla_total_size(sizeof(u32)); /* IFLA_CAN_STATE */ - size += sizeof(struct can_ctrlmode); /* IFLA_CAN_CTRLMODE */ + size += nla_total_size(sizeof(struct can_ctrlmode)); /* IFLA_CAN_CTRLMODE */ size += nla_total_size(sizeof(u32)); /* IFLA_CAN_RESTART_MS */ - size += sizeof(struct can_bittiming); /* IFLA_CAN_BITTIMING */ - size += sizeof(struct can_clock); /* IFLA_CAN_CLOCK */ + size += nla_total_size(sizeof(struct can_bittiming)); /* IFLA_CAN_BITTIMING */ + size += nla_total_size(sizeof(struct can_clock)); /* IFLA_CAN_CLOCK */ if (priv->do_get_berr_counter) /* IFLA_CAN_BERR_COUNTER */ - size += sizeof(struct can_berr_counter); + size += nla_total_size(sizeof(struct can_berr_counter)); if (priv->bittiming_const) /* IFLA_CAN_BITTIMING_CONST */ - size += sizeof(struct can_bittiming_const); + size += nla_total_size(sizeof(struct can_bittiming_const)); return size; } From d7e86bbbfb3bea490e21867c5057893177574fbc Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Sat, 5 Oct 2013 17:56:59 -0300 Subject: [PATCH 22/94] net: secure_seq: Fix warning when CONFIG_IPV6 and CONFIG_INET are not selected [ Upstream commit cb03db9d0e964568407fb08ea46cc2b6b7f67587 ] net_secret() is only used when CONFIG_IPV6 or CONFIG_INET are selected. Building a defconfig with both of these symbols unselected (Using the ARM at91sam9rl_defconfig, for example) leads to the following build warning: $ make at91sam9rl_defconfig $ make net/core/secure_seq.o scripts/kconfig/conf --silentoldconfig Kconfig CHK include/config/kernel.release CHK include/generated/uapi/linux/version.h CHK include/generated/utsrelease.h make[1]: `include/generated/mach-types.h' is up to date. CALL scripts/checksyscalls.sh CC net/core/secure_seq.o net/core/secure_seq.c:17:13: warning: 'net_secret_init' defined but not used [-Wunused-function] Fix this warning by protecting the definition of net_secret() with these symbols. Reported-by: Olof Johansson Signed-off-by: Fabio Estevam Signed-off-by: David S. Miller [ kamal: backport to 3.8 (context) ] Signed-off-by: Kamal Mostafa --- net/core/secure_seq.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index e61a8bb7fce7..53fcc11a2d51 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -10,6 +10,7 @@ #include +#if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_INET) static u32 net_secret[MD5_MESSAGE_BYTES / 4] ____cacheline_aligned; static int __init net_secret_init(void) @@ -18,6 +19,7 @@ static int __init net_secret_init(void) return 0; } late_initcall(net_secret_init); +#endif #ifdef CONFIG_INET static u32 seq_scale(u32 seq) From 7fa787c95dac5784d245e0d3a6dbc5fc6b73477d Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Tue, 8 Oct 2013 14:22:56 +0100 Subject: [PATCH 23/94] xen-netback: Don't destroy the netdev until the vif is shut down [ Upstream commit 279f438e36c0a70b23b86d2090aeec50155034a9 ] Without this patch, if a frontend cycles through states Closing and Closed (which Windows frontends need to do) then the netdev will be destroyed and requires re-invocation of hotplug scripts to restore state before the frontend can move to Connected. Thus when udev is not in use the backend gets stuck in InitWait. With this patch, the netdev is left alone whilst the backend is still online and is only de-registered and freed just prior to destroying the vif (which is also nicely symmetrical with the netdev allocation and registration being done during probe) so no re-invocation of hotplug scripts is required. Signed-off-by: Paul Durrant Cc: David Vrabel Cc: Wei Liu Cc: Ian Campbell Signed-off-by: Kamal Mostafa --- drivers/net/xen-netback/common.h | 1 + drivers/net/xen-netback/interface.c | 12 ++++++++++-- drivers/net/xen-netback/xenbus.c | 19 +++++++++++++------ 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h index 9d7f1723dd8f..1a285083d24a 100644 --- a/drivers/net/xen-netback/common.h +++ b/drivers/net/xen-netback/common.h @@ -115,6 +115,7 @@ struct xenvif *xenvif_alloc(struct device *parent, int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref, unsigned long rx_ring_ref, unsigned int evtchn); void xenvif_disconnect(struct xenvif *vif); +void xenvif_free(struct xenvif *vif); void xenvif_get(struct xenvif *vif); void xenvif_put(struct xenvif *vif); diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c index 221f4265f7d6..2ef5ec98b97b 100644 --- a/drivers/net/xen-netback/interface.c +++ b/drivers/net/xen-netback/interface.c @@ -302,6 +302,9 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid, } netdev_dbg(dev, "Successfully created xenvif\n"); + + __module_get(THIS_MODULE); + return vif; } @@ -367,9 +370,14 @@ void xenvif_disconnect(struct xenvif *vif) if (vif->irq) unbind_from_irqhandler(vif->irq, vif); + xen_netbk_unmap_frontend_rings(vif); +} + +void xenvif_free(struct xenvif *vif) +{ unregister_netdev(vif->dev); - xen_netbk_unmap_frontend_rings(vif); - free_netdev(vif->dev); + + module_put(THIS_MODULE); } diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c index 410018c4c528..abe24ff000f0 100644 --- a/drivers/net/xen-netback/xenbus.c +++ b/drivers/net/xen-netback/xenbus.c @@ -42,7 +42,7 @@ static int netback_remove(struct xenbus_device *dev) if (be->vif) { kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE); xenbus_rm(XBT_NIL, dev->nodename, "hotplug-status"); - xenvif_disconnect(be->vif); + xenvif_free(be->vif); be->vif = NULL; } kfree(be); @@ -203,9 +203,18 @@ static void disconnect_backend(struct xenbus_device *dev) { struct backend_info *be = dev_get_drvdata(&dev->dev); - if (be->vif) { - xenbus_rm(XBT_NIL, dev->nodename, "hotplug-status"); + if (be->vif) xenvif_disconnect(be->vif); +} + +static void destroy_backend(struct xenbus_device *dev) +{ + struct backend_info *be = dev_get_drvdata(&dev->dev); + + if (be->vif) { + kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE); + xenbus_rm(XBT_NIL, dev->nodename, "hotplug-status"); + xenvif_free(be->vif); be->vif = NULL; } } @@ -237,14 +246,11 @@ static void frontend_changed(struct xenbus_device *dev, case XenbusStateConnected: if (dev->state == XenbusStateConnected) break; - backend_create_xenvif(be); if (be->vif) connect(be); break; case XenbusStateClosing: - if (be->vif) - kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE); disconnect_backend(dev); xenbus_switch_state(dev, XenbusStateClosing); break; @@ -253,6 +259,7 @@ static void frontend_changed(struct xenbus_device *dev, xenbus_switch_state(dev, XenbusStateClosed); if (xenbus_dev_is_online(dev)) break; + destroy_backend(dev); /* fall through if not online */ case XenbusStateUnknown: device_unregister(&dev->dev); From d2732c18db0399c59c34a8637ac8dc8a0e6dab3a Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Mon, 7 Oct 2013 23:19:58 +0200 Subject: [PATCH 24/94] net: vlan: fix nlmsg size calculation in vlan_get_size() [ Upstream commit c33a39c575068c2ea9bffb22fd6de2df19c74b89 ] This patch fixes the calculation of the nlmsg size, by adding the missing nla_total_size(). Cc: Patrick McHardy Signed-off-by: Marc Kleine-Budde Signed-off-by: David S. Miller [ kamal: backport to 3.8 (context) ] Signed-off-by: Kamal Mostafa --- net/8021q/vlan_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c index 708c80ea1874..ebde0306eb83 100644 --- a/net/8021q/vlan_netlink.c +++ b/net/8021q/vlan_netlink.c @@ -152,7 +152,7 @@ static size_t vlan_get_size(const struct net_device *dev) struct vlan_dev_priv *vlan = vlan_dev_priv(dev); return nla_total_size(2) + /* IFLA_VLAN_ID */ - sizeof(struct ifla_vlan_flags) + /* IFLA_VLAN_FLAGS */ + nla_total_size(sizeof(struct ifla_vlan_flags)) + /* IFLA_VLAN_FLAGS */ vlan_qos_map_size(vlan->nr_ingress_mappings) + vlan_qos_map_size(vlan->nr_egress_mappings); } From 44bdc4a2f8d3a25c81f768c7b92d5165e42dfd2d Mon Sep 17 00:00:00 2001 From: Christophe Gouault Date: Tue, 8 Oct 2013 17:21:22 +0200 Subject: [PATCH 25/94] vti: get rid of nf mark rule in prerouting [ Upstream commit 7263a5187f9e9de45fcb51349cf0e031142c19a1 ] This patch fixes and improves the use of vti interfaces (while lightly changing the way of configuring them). Currently: - it is necessary to identify and mark inbound IPsec packets destined to each vti interface, via netfilter rules in the mangle table at prerouting hook. - the vti module cannot retrieve the right tunnel in input since commit b9959fd3: vti tunnels all have an i_key, but the tunnel lookup is done with flag TUNNEL_NO_KEY, so there no chance to retrieve them. - the i_key is used by the outbound processing as a mark to lookup for the right SP and SA bundle. This patch uses the o_key to store the vti mark (instead of i_key) and enables: - to avoid the need for previously marking the inbound skbuffs via a netfilter rule. - to properly retrieve the right tunnel in input, only based on the IPsec packet outer addresses. - to properly perform an inbound policy check (using the tunnel o_key as a mark). - to properly perform an outbound SPD and SAD lookup (using the tunnel o_key as a mark). - to keep the current mark of the skbuff. The skbuff mark is neither used nor changed by the vti interface. Only the vti interface o_key is used. SAs have a wildcard mark. SPs have a mark equal to the vti interface o_key. The vti interface must be created as follows (i_key = 0, o_key = mark): ip link add vti1 mode vti local 1.1.1.1 remote 2.2.2.2 okey 1 The SPs attached to vti1 must be created as follows (mark = vti1 o_key): ip xfrm policy add dir out mark 1 tmpl src 1.1.1.1 dst 2.2.2.2 \ proto esp mode tunnel ip xfrm policy add dir in mark 1 tmpl src 2.2.2.2 dst 1.1.1.1 \ proto esp mode tunnel The SAs are created with the default wildcard mark. There is no distinction between global vs. vti SAs. Just their addresses will possibly link them to a vti interface: ip xfrm state add src 1.1.1.1 dst 2.2.2.2 proto esp spi 1000 mode tunnel \ enc "cbc(aes)" "azertyuiopqsdfgh" ip xfrm state add src 2.2.2.2 dst 1.1.1.1 proto esp spi 2000 mode tunnel \ enc "cbc(aes)" "sqbdhgqsdjqjsdfh" To avoid matching "global" (not vti) SPs in vti interfaces, global SPs should no use the default wildcard mark, but explicitly match mark 0. To avoid a double SPD lookup in input and output (in global and vti SPDs), the NOPOLICY and NOXFRM options should be set on the vti interfaces: echo 1 > /proc/sys/net/ipv4/conf/vti1/disable_policy echo 1 > /proc/sys/net/ipv4/conf/vti1/disable_xfrm The outgoing traffic is steered to vti1 by a route via the vti interface: ip route add 192.168.0.0/16 dev vti1 The incoming IPsec traffic is steered to vti1 because its outer addresses match the vti1 tunnel configuration. Signed-off-by: Christophe Gouault Signed-off-by: David S. Miller Signed-off-by: Kamal Mostafa --- net/ipv4/ip_vti.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 5ccf343b1c3d..d011525e97fe 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -323,8 +323,17 @@ static int vti_rcv(struct sk_buff *skb) tunnel = vti_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr); if (tunnel != NULL) { struct pcpu_tstats *tstats; + u32 oldmark = skb->mark; + int ret; - if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) + + /* temporarily mark the skb with the tunnel o_key, to + * only match policies with this mark. + */ + skb->mark = be32_to_cpu(tunnel->parms.o_key); + ret = xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb); + skb->mark = oldmark; + if (!ret) return -1; tstats = this_cpu_ptr(tunnel->dev->tstats); @@ -333,7 +342,6 @@ static int vti_rcv(struct sk_buff *skb) tstats->rx_bytes += skb->len; u64_stats_update_end(&tstats->syncp); - skb->mark = 0; secpath_reset(skb); skb->dev = tunnel->dev; return 1; @@ -365,7 +373,7 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) memset(&fl4, 0, sizeof(fl4)); flowi4_init_output(&fl4, tunnel->parms.link, - be32_to_cpu(tunnel->parms.i_key), RT_TOS(tos), + be32_to_cpu(tunnel->parms.o_key), RT_TOS(tos), RT_SCOPE_UNIVERSE, IPPROTO_IPIP, 0, dst, tiph->saddr, 0, 0); From a7a8e3f795261b42cda190e13cfffcc872248747 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Oct 2013 06:30:09 -0700 Subject: [PATCH 26/94] l2tp: must disable bh before calling l2tp_xmit_skb() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 455cc32bf128e114455d11ad919321ab89a2c312 ] François Cachereul made a very nice bug report and suspected the bh_lock_sock() / bh_unlok_sock() pair used in l2tp_xmit_skb() from process context was not good. This problem was added by commit 6af88da14ee284aaad6e4326da09a89191ab6165 ("l2tp: Fix locking in l2tp_core.c"). l2tp_eth_dev_xmit() runs from BH context, so we must disable BH from other l2tp_xmit_skb() users. [ 452.060011] BUG: soft lockup - CPU#1 stuck for 23s! [accel-pppd:6662] [ 452.061757] Modules linked in: l2tp_ppp l2tp_netlink l2tp_core pppoe pppox ppp_generic slhc ipv6 ext3 mbcache jbd virtio_balloon xfs exportfs dm_mod virtio_blk ata_generic virtio_net floppy ata_piix libata virtio_pci virtio_ring virtio [last unloaded: scsi_wait_scan] [ 452.064012] CPU 1 [ 452.080015] BUG: soft lockup - CPU#2 stuck for 23s! [accel-pppd:6643] [ 452.080015] CPU 2 [ 452.080015] [ 452.080015] Pid: 6643, comm: accel-pppd Not tainted 3.2.46.mini #1 Bochs Bochs [ 452.080015] RIP: 0010:[] [] do_raw_spin_lock+0x17/0x1f [ 452.080015] RSP: 0018:ffff88007125fc18 EFLAGS: 00000293 [ 452.080015] RAX: 000000000000aba9 RBX: ffffffff811d0703 RCX: 0000000000000000 [ 452.080015] RDX: 00000000000000ab RSI: ffff8800711f6896 RDI: ffff8800745c8110 [ 452.080015] RBP: ffff88007125fc18 R08: 0000000000000020 R09: 0000000000000000 [ 452.080015] R10: 0000000000000000 R11: 0000000000000280 R12: 0000000000000286 [ 452.080015] R13: 0000000000000020 R14: 0000000000000240 R15: 0000000000000000 [ 452.080015] FS: 00007fdc0cc24700(0000) GS:ffff8800b6f00000(0000) knlGS:0000000000000000 [ 452.080015] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 452.080015] CR2: 00007fdb054899b8 CR3: 0000000074404000 CR4: 00000000000006a0 [ 452.080015] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 452.080015] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 452.080015] Process accel-pppd (pid: 6643, threadinfo ffff88007125e000, task ffff8800b27e6dd0) [ 452.080015] Stack: [ 452.080015] ffff88007125fc28 ffffffff81256559 ffff88007125fc98 ffffffffa01b2bd1 [ 452.080015] ffff88007125fc58 000000000000000c 00000000029490d0 0000009c71dbe25e [ 452.080015] 000000000000005c 000000080000000e 0000000000000000 ffff880071170600 [ 452.080015] Call Trace: [ 452.080015] [] _raw_spin_lock+0xe/0x10 [ 452.080015] [] l2tp_xmit_skb+0x189/0x4ac [l2tp_core] [ 452.080015] [] pppol2tp_sendmsg+0x15e/0x19c [l2tp_ppp] [ 452.080015] [] __sock_sendmsg_nosec+0x22/0x24 [ 452.080015] [] sock_sendmsg+0xa1/0xb6 [ 452.080015] [] ? __schedule+0x5c1/0x616 [ 452.080015] [] ? __dequeue_signal+0xb7/0x10c [ 452.080015] [] ? fget_light+0x75/0x89 [ 452.080015] [] ? sockfd_lookup_light+0x20/0x56 [ 452.080015] [] sys_sendto+0x10c/0x13b [ 452.080015] [] system_call_fastpath+0x16/0x1b [ 452.080015] Code: 81 48 89 e5 72 0c 31 c0 48 81 ff 45 66 25 81 0f 92 c0 5d c3 55 b8 00 01 00 00 48 89 e5 f0 66 0f c1 07 0f b6 d4 38 d0 74 06 f3 90 <8a> 07 eb f6 5d c3 90 90 55 48 89 e5 9c 58 0f 1f 44 00 00 5d c3 [ 452.080015] Call Trace: [ 452.080015] [] _raw_spin_lock+0xe/0x10 [ 452.080015] [] l2tp_xmit_skb+0x189/0x4ac [l2tp_core] [ 452.080015] [] pppol2tp_sendmsg+0x15e/0x19c [l2tp_ppp] [ 452.080015] [] __sock_sendmsg_nosec+0x22/0x24 [ 452.080015] [] sock_sendmsg+0xa1/0xb6 [ 452.080015] [] ? __schedule+0x5c1/0x616 [ 452.080015] [] ? __dequeue_signal+0xb7/0x10c [ 452.080015] [] ? fget_light+0x75/0x89 [ 452.080015] [] ? sockfd_lookup_light+0x20/0x56 [ 452.080015] [] sys_sendto+0x10c/0x13b [ 452.080015] [] system_call_fastpath+0x16/0x1b [ 452.064012] [ 452.064012] Pid: 6662, comm: accel-pppd Not tainted 3.2.46.mini #1 Bochs Bochs [ 452.064012] RIP: 0010:[] [] do_raw_spin_lock+0x19/0x1f [ 452.064012] RSP: 0018:ffff8800b6e83ba0 EFLAGS: 00000297 [ 452.064012] RAX: 000000000000aaa9 RBX: ffff8800b6e83b40 RCX: 0000000000000002 [ 452.064012] RDX: 00000000000000aa RSI: 000000000000000a RDI: ffff8800745c8110 [ 452.064012] RBP: ffff8800b6e83ba0 R08: 000000000000c802 R09: 000000000000001c [ 452.064012] R10: ffff880071096c4e R11: 0000000000000006 R12: ffff8800b6e83b18 [ 452.064012] R13: ffffffff8125d51e R14: ffff8800b6e83ba0 R15: ffff880072a589c0 [ 452.064012] FS: 00007fdc0b81e700(0000) GS:ffff8800b6e80000(0000) knlGS:0000000000000000 [ 452.064012] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 452.064012] CR2: 0000000000625208 CR3: 0000000074404000 CR4: 00000000000006a0 [ 452.064012] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 452.064012] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 452.064012] Process accel-pppd (pid: 6662, threadinfo ffff88007129a000, task ffff8800744f7410) [ 452.064012] Stack: [ 452.064012] ffff8800b6e83bb0 ffffffff81256559 ffff8800b6e83bc0 ffffffff8121c64a [ 452.064012] ffff8800b6e83bf0 ffffffff8121ec7a ffff880072a589c0 ffff880071096c62 [ 452.064012] 0000000000000011 ffffffff81430024 ffff8800b6e83c80 ffffffff8121f276 [ 452.064012] Call Trace: [ 452.064012] [ 452.064012] [] _raw_spin_lock+0xe/0x10 [ 452.064012] [] spin_lock+0x9/0xb [ 452.064012] [] udp_queue_rcv_skb+0x186/0x269 [ 452.064012] [] __udp4_lib_rcv+0x297/0x4ae [ 452.064012] [] ? raw_rcv+0xe9/0xf0 [ 452.064012] [] udp_rcv+0x1a/0x1c [ 452.064012] [] ip_local_deliver_finish+0x12b/0x1a5 [ 452.064012] [] ip_local_deliver+0x53/0x84 [ 452.064012] [] ip_rcv_finish+0x2bc/0x2f3 [ 452.064012] [] ip_rcv+0x210/0x269 [ 452.064012] [] ? kvm_clock_get_cycles+0x9/0xb [ 452.064012] [] __netif_receive_skb+0x3a5/0x3f7 [ 452.064012] [] netif_receive_skb+0x57/0x5e [ 452.064012] [] ? __netdev_alloc_skb+0x1f/0x3b [ 452.064012] [] virtnet_poll+0x4ba/0x5a4 [virtio_net] [ 452.064012] [] net_rx_action+0x73/0x184 [ 452.064012] [] ? l2tp_xmit_skb+0x27a/0x4ac [l2tp_core] [ 452.064012] [] __do_softirq+0xc3/0x1a8 [ 452.064012] [] ? ack_APIC_irq+0x10/0x12 [ 452.064012] [] ? _raw_spin_lock+0xe/0x10 [ 452.064012] [] call_softirq+0x1c/0x26 [ 452.064012] [] do_softirq+0x45/0x82 [ 452.064012] [] irq_exit+0x42/0x9c [ 452.064012] [] do_IRQ+0x8e/0xa5 [ 452.064012] [] common_interrupt+0x6e/0x6e [ 452.064012] [ 452.064012] [] ? kfree+0x8a/0xa3 [ 452.064012] [] ? l2tp_xmit_skb+0x27a/0x4ac [l2tp_core] [ 452.064012] [] ? l2tp_xmit_skb+0x1dd/0x4ac [l2tp_core] [ 452.064012] [] pppol2tp_sendmsg+0x15e/0x19c [l2tp_ppp] [ 452.064012] [] __sock_sendmsg_nosec+0x22/0x24 [ 452.064012] [] sock_sendmsg+0xa1/0xb6 [ 452.064012] [] ? __schedule+0x5c1/0x616 [ 452.064012] [] ? __dequeue_signal+0xb7/0x10c [ 452.064012] [] ? fget_light+0x75/0x89 [ 452.064012] [] ? sockfd_lookup_light+0x20/0x56 [ 452.064012] [] sys_sendto+0x10c/0x13b [ 452.064012] [] system_call_fastpath+0x16/0x1b [ 452.064012] Code: 89 e5 72 0c 31 c0 48 81 ff 45 66 25 81 0f 92 c0 5d c3 55 b8 00 01 00 00 48 89 e5 f0 66 0f c1 07 0f b6 d4 38 d0 74 06 f3 90 8a 07 f6 5d c3 90 90 55 48 89 e5 9c 58 0f 1f 44 00 00 5d c3 55 48 [ 452.064012] Call Trace: [ 452.064012] [] _raw_spin_lock+0xe/0x10 [ 452.064012] [] spin_lock+0x9/0xb [ 452.064012] [] udp_queue_rcv_skb+0x186/0x269 [ 452.064012] [] __udp4_lib_rcv+0x297/0x4ae [ 452.064012] [] ? raw_rcv+0xe9/0xf0 [ 452.064012] [] udp_rcv+0x1a/0x1c [ 452.064012] [] ip_local_deliver_finish+0x12b/0x1a5 [ 452.064012] [] ip_local_deliver+0x53/0x84 [ 452.064012] [] ip_rcv_finish+0x2bc/0x2f3 [ 452.064012] [] ip_rcv+0x210/0x269 [ 452.064012] [] ? kvm_clock_get_cycles+0x9/0xb [ 452.064012] [] __netif_receive_skb+0x3a5/0x3f7 [ 452.064012] [] netif_receive_skb+0x57/0x5e [ 452.064012] [] ? __netdev_alloc_skb+0x1f/0x3b [ 452.064012] [] virtnet_poll+0x4ba/0x5a4 [virtio_net] [ 452.064012] [] net_rx_action+0x73/0x184 [ 452.064012] [] ? l2tp_xmit_skb+0x27a/0x4ac [l2tp_core] [ 452.064012] [] __do_softirq+0xc3/0x1a8 [ 452.064012] [] ? ack_APIC_irq+0x10/0x12 [ 452.064012] [] ? _raw_spin_lock+0xe/0x10 [ 452.064012] [] call_softirq+0x1c/0x26 [ 452.064012] [] do_softirq+0x45/0x82 [ 452.064012] [] irq_exit+0x42/0x9c [ 452.064012] [] do_IRQ+0x8e/0xa5 [ 452.064012] [] common_interrupt+0x6e/0x6e [ 452.064012] [] ? kfree+0x8a/0xa3 [ 452.064012] [] ? l2tp_xmit_skb+0x27a/0x4ac [l2tp_core] [ 452.064012] [] ? l2tp_xmit_skb+0x1dd/0x4ac [l2tp_core] [ 452.064012] [] pppol2tp_sendmsg+0x15e/0x19c [l2tp_ppp] [ 452.064012] [] __sock_sendmsg_nosec+0x22/0x24 [ 452.064012] [] sock_sendmsg+0xa1/0xb6 [ 452.064012] [] ? __schedule+0x5c1/0x616 [ 452.064012] [] ? __dequeue_signal+0xb7/0x10c [ 452.064012] [] ? fget_light+0x75/0x89 [ 452.064012] [] ? sockfd_lookup_light+0x20/0x56 [ 452.064012] [] sys_sendto+0x10c/0x13b [ 452.064012] [] system_call_fastpath+0x16/0x1b Reported-by: François Cachereul Tested-by: François Cachereul Signed-off-by: Eric Dumazet Cc: James Chapman Signed-off-by: David S. Miller Signed-off-by: Kamal Mostafa --- net/l2tp/l2tp_ppp.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index c3c11528c5e4..6920a6fbf25b 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -352,7 +352,9 @@ static int pppol2tp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msgh goto error_put_sess_tun; } + local_bh_disable(); l2tp_xmit_skb(session, skb, session->hdr_len); + local_bh_enable(); sock_put(ps->tunnel_sock); sock_put(sk); @@ -421,7 +423,9 @@ static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb) skb->data[0] = ppph[0]; skb->data[1] = ppph[1]; + local_bh_disable(); l2tp_xmit_skb(session, skb, session->hdr_len); + local_bh_enable(); sock_put(sk_tun); sock_put(sk); From 3c69fb47b36fa13b4a09e204aaf9029e3f0bb275 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Salva=20Peir=C3=B3?= Date: Fri, 11 Oct 2013 12:50:03 +0300 Subject: [PATCH 27/94] farsync: fix info leak in ioctl [ Upstream commit 96b340406724d87e4621284ebac5e059d67b2194 ] The fst_get_iface() code fails to initialize the two padding bytes of struct sync_serial_settings after the ->loopback member. Add an explicit memset(0) before filling the structure to avoid the info leak. Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller Signed-off-by: Kamal Mostafa --- drivers/net/wan/farsync.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wan/farsync.c b/drivers/net/wan/farsync.c index 56941d6547eb..9d6082fc29ff 100644 --- a/drivers/net/wan/farsync.c +++ b/drivers/net/wan/farsync.c @@ -1972,6 +1972,7 @@ fst_get_iface(struct fst_card_info *card, struct fst_port_info *port, } i = port->index; + memset(&sync, 0, sizeof(sync)); sync.clock_rate = FST_RDL(card, portConfig[i].lineSpeed); /* Lucky card and linux use same encoding here */ sync.clock_type = FST_RDB(card, portConfig[i].internalClock) == From 2eda274d4976bf1cd7af9343cd24f9e85318a4f3 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Mon, 30 Sep 2013 22:05:40 +0200 Subject: [PATCH 28/94] unix_diag: fix info leak [ Upstream commit 6865d1e834be84ddd5808d93d5035b492346c64a ] When filling the netlink message we miss to wipe the pad field, therefore leak one byte of heap memory to userland. Fix this by setting pad to 0. Signed-off-by: Mathias Krause Signed-off-by: David S. Miller Signed-off-by: Kamal Mostafa --- net/unix/diag.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/unix/diag.c b/net/unix/diag.c index 5ac19dc1d5e4..ff4f29b3ef2e 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -124,6 +124,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_r rep->udiag_family = AF_UNIX; rep->udiag_type = sk->sk_type; rep->udiag_state = sk->sk_state; + rep->pad = 0; rep->udiag_ino = sk_ino; sock_diag_save_cookie(sk, rep->udiag_cookie); From f65fc6c5e2eecf8052fec3fdbec0d733260ccbdc Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Mon, 30 Sep 2013 22:03:07 +0200 Subject: [PATCH 29/94] connector: use nlmsg_len() to check message length [ Upstream commit 162b2bedc084d2d908a04c93383ba02348b648b0 ] The current code tests the length of the whole netlink message to be at least as long to fit a cn_msg. This is wrong as nlmsg_len includes the length of the netlink message header. Use nlmsg_len() instead to fix this "off-by-NLMSG_HDRLEN" size check. Signed-off-by: Mathias Krause Signed-off-by: David S. Miller [ kamal: backport to 3.8 (context) ] Signed-off-by: Kamal Mostafa --- drivers/connector/connector.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/connector/connector.c b/drivers/connector/connector.c index 7b695913cb30..3050b7e4dff5 100644 --- a/drivers/connector/connector.c +++ b/drivers/connector/connector.c @@ -157,17 +157,18 @@ static int cn_call_callback(struct sk_buff *skb) static void cn_rx_skb(struct sk_buff *__skb) { struct nlmsghdr *nlh; - int err; struct sk_buff *skb; + int len, err; skb = skb_get(__skb); if (skb->len >= NLMSG_SPACE(0)) { nlh = nlmsg_hdr(skb); + len = nlmsg_len(nlh); - if (nlh->nlmsg_len < sizeof(struct cn_msg) || + if (len < (int)sizeof(struct cn_msg) || skb->len < nlh->nlmsg_len || - nlh->nlmsg_len > CONNECTOR_MAX_MSG_SIZE) { + len > CONNECTOR_MAX_MSG_SIZE) { kfree_skb(skb); return; } From ee42dca9358c110c126d4b1ff1e1d0e427911364 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 15 Oct 2013 11:18:58 +0800 Subject: [PATCH 30/94] virtio-net: don't respond to cpu hotplug notifier if we're not ready [ Upstream commit 3ab098df35f8b98b6553edc2e40234af512ba877 ] We're trying to re-configure the affinity unconditionally in cpu hotplug callback. This may lead the issue during resuming from s3/s4 since - virt queues haven't been allocated at that time. - it's unnecessary since thaw method will re-configure the affinity. Fix this issue by checking the config_enable and do nothing is we're not ready. The bug were introduced by commit 8de4b2f3ae90c8fc0f17eeaab87d5a951b66ee17 (virtio-net: reset virtqueue affinity when doing cpu hotplug). Cc: Rusty Russell Cc: Michael S. Tsirkin Cc: Wanlong Gao Acked-by: Michael S. Tsirkin Reviewed-by: Wanlong Gao Signed-off-by: Jason Wang Signed-off-by: David S. Miller Signed-off-by: Kamal Mostafa --- drivers/net/virtio_net.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 35c00c5ea02a..848d26c462e8 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1076,6 +1076,11 @@ static int virtnet_cpu_callback(struct notifier_block *nfb, { struct virtnet_info *vi = container_of(nfb, struct virtnet_info, nb); + mutex_lock(&vi->config_lock); + + if (!vi->config_enable) + goto done; + switch(action & ~CPU_TASKS_FROZEN) { case CPU_ONLINE: case CPU_DOWN_FAILED: @@ -1088,6 +1093,9 @@ static int virtnet_cpu_callback(struct notifier_block *nfb, default: break; } + +done: + mutex_unlock(&vi->config_lock); return NOTIFY_OK; } From 2b9b1f8abe82f54c63322404fe74c3f4a7c3d6e2 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Tue, 15 Oct 2013 14:57:45 -0400 Subject: [PATCH 31/94] bridge: Correctly clamp MAX forward_delay when enabling STP [ Upstream commit 4b6c7879d84ad06a2ac5b964808ed599187a188d ] Commit be4f154d5ef0ca147ab6bcd38857a774133f5450 bridge: Clamp forward_delay when enabling STP had a typo when attempting to clamp maximum forward delay. It is possible to set bridge_forward_delay to be higher then permitted maximum when STP is off. When turning STP on, the higher then allowed delay has to be clamed down to max value. CC: Herbert Xu CC: Stephen Hemminger Signed-off-by: Vlad Yasevich Reviewed-by: Veaceslav Falico Acked-by: Herbert Xu Signed-off-by: David S. Miller Signed-off-by: Kamal Mostafa --- net/bridge/br_stp_if.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index 7ba2ed5c70fd..5bb38cd70daf 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -134,7 +134,7 @@ static void br_stp_start(struct net_bridge *br) if (br->bridge_forward_delay < BR_MIN_FORWARD_DELAY) __br_set_forward_delay(br, BR_MIN_FORWARD_DELAY); - else if (br->bridge_forward_delay < BR_MAX_FORWARD_DELAY) + else if (br->bridge_forward_delay > BR_MAX_FORWARD_DELAY) __br_set_forward_delay(br, BR_MAX_FORWARD_DELAY); if (r == 0) { From 7af12e773fa6f1460109abbcbbd8e28589cdb0a6 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Tue, 15 Oct 2013 22:01:29 -0400 Subject: [PATCH 32/94] net: dst: provide accessor function to dst->xfrm [ Upstream commit e87b3998d795123b4139bc3f25490dd236f68212 ] dst->xfrm is conditionally defined. Provide accessor funtion that is always available. Signed-off-by: Vlad Yasevich Acked-by: Neil Horman Signed-off-by: David S. Miller Signed-off-by: Kamal Mostafa --- include/net/dst.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/include/net/dst.h b/include/net/dst.h index b3ebe1780b41..1cdf9a4a6def 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -474,10 +474,22 @@ static inline struct dst_entry *xfrm_lookup(struct net *net, { return dst_orig; } + +static inline struct xfrm_state *dst_xfrm(const struct dst_entry *dst) +{ + return NULL; +} + #else extern struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, struct sock *sk, int flags); + +/* skb attached with this dst needs transformation if dst->xfrm is valid */ +static inline struct xfrm_state *dst_xfrm(const struct dst_entry *dst) +{ + return dst->xfrm; +} #endif #endif /* _NET_DST_H */ From 4eefb148defadc2ed4a325edd0209f790ae4e52c Mon Sep 17 00:00:00 2001 From: Fan Du Date: Tue, 15 Oct 2013 22:01:30 -0400 Subject: [PATCH 33/94] sctp: Use software crc32 checksum when xfrm transform will happen. [ Upstream commit 27127a82561a2a3ed955ce207048e1b066a80a2a ] igb/ixgbe have hardware sctp checksum support, when this feature is enabled and also IPsec is armed to protect sctp traffic, ugly things happened as xfrm_output checks CHECKSUM_PARTIAL to do checksum operation(sum every thing up and pack the 16bits result in the checksum field). The result is fail establishment of sctp communication. Cc: Neil Horman Cc: Steffen Klassert Signed-off-by: Fan Du Signed-off-by: Vlad Yasevich Acked-by: Neil Horman Signed-off-by: David S. Miller Signed-off-by: Kamal Mostafa --- net/sctp/output.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sctp/output.c b/net/sctp/output.c index f5200a2ad852..0c04637fe45c 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -550,7 +550,8 @@ int sctp_packet_transmit(struct sctp_packet *packet) * by CRC32-C as described in . */ if (!sctp_checksum_disable) { - if (!(dst->dev->features & NETIF_F_SCTP_CSUM)) { + if (!(dst->dev->features & NETIF_F_SCTP_CSUM) || + (dst_xfrm(dst) != NULL)) { __u32 crc32 = sctp_start_cksum((__u8 *)sh, cksum_buf_len); /* 3) Put the resultant value into the checksum field in the From 8229ca049e4679e12d93356fa936060deaa430cb Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Tue, 15 Oct 2013 22:01:31 -0400 Subject: [PATCH 34/94] sctp: Perform software checksum if packet has to be fragmented. [ Upstream commit d2dbbba77e95dff4b4f901fee236fef6d9552072 ] IP/IPv6 fragmentation knows how to compute only TCP/UDP checksum. This causes problems if SCTP packets has to be fragmented and ipsummed has been set to PARTIAL due to checksum offload support. This condition can happen when retransmitting after MTU discover, or when INIT or other control chunks are larger then MTU. Check for the rare fragmentation condition in SCTP and use software checksum calculation in this case. CC: Fan Du Signed-off-by: Vlad Yasevich Acked-by: Neil Horman Signed-off-by: David S. Miller Signed-off-by: Kamal Mostafa --- net/sctp/output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sctp/output.c b/net/sctp/output.c index 0c04637fe45c..d111b636757a 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -551,7 +551,7 @@ int sctp_packet_transmit(struct sctp_packet *packet) */ if (!sctp_checksum_disable) { if (!(dst->dev->features & NETIF_F_SCTP_CSUM) || - (dst_xfrm(dst) != NULL)) { + (dst_xfrm(dst) != NULL) || packet->ipfragok) { __u32 crc32 = sctp_start_cksum((__u8 *)sh, cksum_buf_len); /* 3) Put the resultant value into the checksum field in the From 0327df3007da08bf208a958c5f555ec7bff70586 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Salva=20Peir=C3=B3?= Date: Wed, 16 Oct 2013 12:46:50 +0200 Subject: [PATCH 35/94] wanxl: fix info leak in ioctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 2b13d06c9584b4eb773f1e80bbaedab9a1c344e1 ] The wanxl_ioctl() code fails to initialize the two padding bytes of struct sync_serial_settings after the ->loopback member. Add an explicit memset(0) before filling the structure to avoid the info leak. Signed-off-by: Salva Peiró Signed-off-by: David S. Miller Signed-off-by: Kamal Mostafa --- drivers/net/wan/wanxl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wan/wanxl.c b/drivers/net/wan/wanxl.c index 6a24a5a70cc7..4c0a69779b89 100644 --- a/drivers/net/wan/wanxl.c +++ b/drivers/net/wan/wanxl.c @@ -355,6 +355,7 @@ static int wanxl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) ifr->ifr_settings.size = size; /* data size wanted */ return -ENOBUFS; } + memset(&line, 0, sizeof(line)); line.clock_type = get_status(port)->clocking; line.clock_rate = 0; line.loopback = 0; From ba0f0189f730e43ec33d719a4f692c95a54cac8a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 17 Oct 2013 22:51:31 +0200 Subject: [PATCH 36/94] net: unix: inherit SOCK_PASS{CRED, SEC} flags from socket to fix race [ Upstream commit 90c6bd34f884cd9cee21f1d152baf6c18bcac949 ] In the case of credentials passing in unix stream sockets (dgram sockets seem not affected), we get a rather sparse race after commit 16e5726 ("af_unix: dont send SCM_CREDENTIALS by default"). We have a stream server on receiver side that requests credential passing from senders (e.g. nc -U). Since we need to set SO_PASSCRED on each spawned/accepted socket on server side to 1 first (as it's not inherited), it can happen that in the time between accept() and setsockopt() we get interrupted, the sender is being scheduled and continues with passing data to our receiver. At that time SO_PASSCRED is neither set on sender nor receiver side, hence in cmsg's SCM_CREDENTIALS we get eventually pid:0, uid:65534, gid:65534 (== overflow{u,g}id) instead of what we actually would like to see. On the sender side, here nc -U, the tests in maybe_add_creds() invoked through unix_stream_sendmsg() would fail, as at that exact time, as mentioned, the sender has neither SO_PASSCRED on his side nor sees it on the server side, and we have a valid 'other' socket in place. Thus, sender believes it would just look like a normal connection, not needing/requesting SO_PASSCRED at that time. As reverting 16e5726 would not be an option due to the significant performance regression reported when having creds always passed, one way/trade-off to prevent that would be to set SO_PASSCRED on the listener socket and allow inheriting these flags to the spawned socket on server side in accept(). It seems also logical to do so if we'd tell the listener socket to pass those flags onwards, and would fix the race. Before, strace: recvmsg(4, {msg_name(0)=NULL, msg_iov(1)=[{"blub\n", 4096}], msg_controllen=32, {cmsg_len=28, cmsg_level=SOL_SOCKET, cmsg_type=SCM_CREDENTIALS{pid=0, uid=65534, gid=65534}}, msg_flags=0}, 0) = 5 After, strace: recvmsg(4, {msg_name(0)=NULL, msg_iov(1)=[{"blub\n", 4096}], msg_controllen=32, {cmsg_len=28, cmsg_level=SOL_SOCKET, cmsg_type=SCM_CREDENTIALS{pid=11580, uid=1000, gid=1000}}, msg_flags=0}, 0) = 5 Signed-off-by: Daniel Borkmann Cc: Eric Dumazet Cc: Eric W. Biederman Signed-off-by: David S. Miller Signed-off-by: Kamal Mostafa --- net/unix/af_unix.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index f347754e4625..4f4095999bac 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1247,6 +1247,15 @@ static int unix_socketpair(struct socket *socka, struct socket *sockb) return 0; } +static void unix_sock_inherit_flags(const struct socket *old, + struct socket *new) +{ + if (test_bit(SOCK_PASSCRED, &old->flags)) + set_bit(SOCK_PASSCRED, &new->flags); + if (test_bit(SOCK_PASSSEC, &old->flags)) + set_bit(SOCK_PASSSEC, &new->flags); +} + static int unix_accept(struct socket *sock, struct socket *newsock, int flags) { struct sock *sk = sock->sk; @@ -1281,6 +1290,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags) /* attach accepted sock to socket */ unix_state_lock(tsk); newsock->state = SS_CONNECTED; + unix_sock_inherit_flags(sock, newsock); sock_graft(tsk, newsock); unix_state_unlock(tsk); return 0; From 3940797dcb95fa8876998f5059c7b65ffa838bbb Mon Sep 17 00:00:00 2001 From: Seif Mazareeb Date: Thu, 17 Oct 2013 20:33:21 -0700 Subject: [PATCH 37/94] net: fix cipso packet validation when !NETLABEL [ Upstream commit f2e5ddcc0d12f9c4c7b254358ad245c9dddce13b ] When CONFIG_NETLABEL is disabled, the cipso_v4_validate() function could loop forever in the main loop if opt[opt_iter +1] == 0, this will causing a kernel crash in an SMP system, since the CPU executing this function will stall /not respond to IPIs. This problem can be reproduced by running the IP Stack Integrity Checker (http://isic.sourceforge.net) using the following command on a Linux machine connected to DUT: "icmpsic -s rand -d -r 123456" wait (1-2 min) Signed-off-by: Seif Mazareeb Acked-by: Paul Moore Signed-off-by: David S. Miller Signed-off-by: Kamal Mostafa --- include/net/cipso_ipv4.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/net/cipso_ipv4.h b/include/net/cipso_ipv4.h index a7a683e30b64..a8c2ef6d3b93 100644 --- a/include/net/cipso_ipv4.h +++ b/include/net/cipso_ipv4.h @@ -290,6 +290,7 @@ static inline int cipso_v4_validate(const struct sk_buff *skb, unsigned char err_offset = 0; u8 opt_len = opt[1]; u8 opt_iter; + u8 tag_len; if (opt_len < 8) { err_offset = 1; @@ -302,11 +303,12 @@ static inline int cipso_v4_validate(const struct sk_buff *skb, } for (opt_iter = 6; opt_iter < opt_len;) { - if (opt[opt_iter + 1] > (opt_len - opt_iter)) { + tag_len = opt[opt_iter + 1]; + if ((tag_len == 0) || (opt[opt_iter + 1] > (opt_len - opt_iter))) { err_offset = opt_iter + 1; goto out; } - opt_iter += opt[opt_iter + 1]; + opt_iter += tag_len; } out: From 6c114a05eee43f08e1f74258f37d8849b00e8471 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Tue, 22 Oct 2013 00:07:47 +0200 Subject: [PATCH 38/94] inet: fix possible memory corruption with UDP_CORK and UFO [ This is a simplified -stable version of a set of upstream commits. ] This is a replacement patch only for stable which does fix the problems handled by the following two commits in -net: "ip_output: do skb ufo init for peeked non ufo skb as well" (e93b7d748be887cd7639b113ba7d7ef792a7efb9) "ip6_output: do skb ufo init for peeked non ufo skb as well" (c547dbf55d5f8cf615ccc0e7265e98db27d3fb8b) Three frames are written on a corked udp socket for which the output netdevice has UFO enabled. If the first and third frame are smaller than the mtu and the second one is bigger, we enqueue the second frame with skb_append_datato_frags without initializing the gso fields. This leads to the third frame appended regulary and thus constructing an invalid skb. This fixes the problem by always using skb_append_datato_frags as soon as the first frag got enqueued to the skb without marking the packet as SKB_GSO_UDP. The problem with only two frames for ipv6 was fixed by "ipv6: udp packets following an UFO enqueued packet need also be handled by UFO" (2811ebac2521ceac84f2bdae402455baa6a7fb47). Cc: Jiri Pirko Cc: Eric Dumazet Cc: David Miller Signed-off-by: Hannes Frederic Sowa Signed-off-by: Kamal Mostafa --- include/linux/skbuff.h | 5 +++++ net/ipv4/ip_output.c | 2 +- net/ipv6/ip6_output.c | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 24f93a104033..db29f787fcfb 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1252,6 +1252,11 @@ static inline int skb_pagelen(const struct sk_buff *skb) return len + skb_headlen(skb); } +static inline bool skb_has_frags(const struct sk_buff *skb) +{ + return skb_shinfo(skb)->nr_frags; +} + /** * __skb_fill_page_desc - initialise a paged fragment in an skb * @skb: buffer containing fragment to be initialised diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 11b2d78f4edd..fd5720b50e0a 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -844,7 +844,7 @@ static int __ip_append_data(struct sock *sk, csummode = CHECKSUM_PARTIAL; cork->length += length; - if (((length > mtu) || (skb && skb_is_gso(skb))) && + if (((length > mtu) || (skb && skb_has_frags(skb))) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) { err = ip_ufo_append_data(sk, queue, getfrag, from, length, diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 3ae7a5da0844..f477a89fad24 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1271,7 +1271,7 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, skb = skb_peek_tail(&sk->sk_write_queue); cork->length += length; if (((length > mtu) || - (skb && skb_is_gso(skb))) && + (skb && skb_has_frags(skb))) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO)) { err = ip6_ufo_append_data(sk, getfrag, from, length, From 06683e44d447f02e86a1bb9bfadc339003df339f Mon Sep 17 00:00:00 2001 From: Mariusz Ceier Date: Mon, 21 Oct 2013 19:45:04 +0200 Subject: [PATCH 39/94] davinci_emac.c: Fix IFF_ALLMULTI setup [ Upstream commit d69e0f7ea95fef8059251325a79c004bac01f018 ] When IFF_ALLMULTI flag is set on interface and IFF_PROMISC isn't, emac_dev_mcast_set should only enable RX of multicasts and reset MACHASH registers. It does this, but afterwards it either sets up multicast MACs filtering or disables RX of multicasts and resets MACHASH registers again, rendering IFF_ALLMULTI flag useless. This patch fixes emac_dev_mcast_set, so that multicast MACs filtering and disabling of RX of multicasts are skipped when IFF_ALLMULTI flag is set. Tested with kernel 2.6.37. Signed-off-by: Mariusz Ceier Acked-by: Mugunthan V N Signed-off-by: David S. Miller Signed-off-by: Kamal Mostafa --- drivers/net/ethernet/ti/davinci_emac.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ti/davinci_emac.c b/drivers/net/ethernet/ti/davinci_emac.c index 4ebcb241290f..17a30285eb37 100644 --- a/drivers/net/ethernet/ti/davinci_emac.c +++ b/drivers/net/ethernet/ti/davinci_emac.c @@ -878,8 +878,7 @@ static void emac_dev_mcast_set(struct net_device *ndev) netdev_mc_count(ndev) > EMAC_DEF_MAX_MULTICAST_ADDRESSES) { mbp_enable = (mbp_enable | EMAC_MBP_RXMCAST); emac_add_mcast(priv, EMAC_ALL_MULTI_SET, NULL); - } - if (!netdev_mc_empty(ndev)) { + } else if (!netdev_mc_empty(ndev)) { struct netdev_hw_addr *ha; mbp_enable = (mbp_enable | EMAC_MBP_RXMCAST); From db7b6fd0b45f9b070046722a0d932a78ec521739 Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Fri, 6 Sep 2013 21:49:56 -0500 Subject: [PATCH 40/94] jfs: fix error path in ialloc commit 8660998608cfa1077e560034db81885af8e1e885 upstream. If insert_inode_locked() fails, we shouldn't be calling unlock_new_inode(). Signed-off-by: Dave Kleikamp Tested-by: Michael L. Semon Signed-off-by: Kamal Mostafa --- fs/jfs/jfs_inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index c1a3e603279c..7f464c513ba0 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -95,7 +95,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode) if (insert_inode_locked(inode) < 0) { rc = -EINVAL; - goto fail_unlock; + goto fail_put; } inode_init_owner(inode, parent, mode); @@ -156,7 +156,6 @@ struct inode *ialloc(struct inode *parent, umode_t mode) fail_drop: dquot_drop(inode); inode->i_flags |= S_NOQUOTA; -fail_unlock: clear_nlink(inode); unlock_new_inode(inode); fail_put: From ac702da265099536f1363c381bec348432a1064d Mon Sep 17 00:00:00 2001 From: Bruno Randolf Date: Thu, 26 Sep 2013 16:55:28 +0100 Subject: [PATCH 41/94] cfg80211: fix warning when using WEXT for IBSS commit f478f33a93f9353dcd1fe55445343d76b1c3f84a upstream. Fix kernel warning when using WEXT for configuring ad-hoc mode, e.g. "iwconfig wlan0 essid test channel 1" WARNING: at net/wireless/chan.c:373 cfg80211_chandef_usable+0x50/0x21c [cfg80211]() The warning is caused by an uninitialized variable center_freq1. Signed-off-by: Bruno Randolf Signed-off-by: Johannes Berg Signed-off-by: Kamal Mostafa --- net/wireless/ibss.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c index 9b9551e4a6f9..ceef8543c110 100644 --- a/net/wireless/ibss.c +++ b/net/wireless/ibss.c @@ -269,6 +269,8 @@ int cfg80211_ibss_wext_join(struct cfg80211_registered_device *rdev, if (chan->flags & IEEE80211_CHAN_DISABLED) continue; wdev->wext.ibss.chandef.chan = chan; + wdev->wext.ibss.chandef.center_freq1 = + chan->center_freq; break; } @@ -353,6 +355,7 @@ int cfg80211_ibss_wext_siwfreq(struct net_device *dev, if (chan) { wdev->wext.ibss.chandef.chan = chan; wdev->wext.ibss.chandef.width = NL80211_CHAN_WIDTH_20_NOHT; + wdev->wext.ibss.chandef.center_freq1 = freq; wdev->wext.ibss.channel_fixed = true; } else { /* cfg80211_ibss_wext_join will pick one if needed */ From 43dab4a38afc1ddfe8efe13c25ee3aed1f2b91aa Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Tue, 17 Sep 2013 11:15:43 +0200 Subject: [PATCH 42/94] mac80211: drop spoofed packets in ad-hoc mode commit 6329b8d917adc077caa60c2447385554130853a3 upstream. If an Ad-Hoc node receives packets with the Cell ID or its own MAC address as source address, it hits a WARN_ON in sta_info_insert_check() With many packets, this can massively spam the logs. One way that this can easily happen is through having Cisco APs in the area with rouge AP detection and countermeasures enabled. Such Cisco APs will regularly send fake beacons, disassoc and deauth packets that trigger these warnings. To fix this issue, drop such spoofed packets early in the rx path. Reported-by: Thomas Huehn Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg Signed-off-by: Kamal Mostafa --- net/mac80211/rx.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 87f76fac7795..b1e5e038504b 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2916,6 +2916,9 @@ static int prepare_for_handlers(struct ieee80211_rx_data *rx, case NL80211_IFTYPE_ADHOC: if (!bssid) return 0; + if (ether_addr_equal(sdata->vif.addr, hdr->addr2) || + ether_addr_equal(sdata->u.ibss.bssid, hdr->addr2)) + return 0; if (ieee80211_is_beacon(hdr->frame_control)) { return 1; } else if (!ieee80211_bssid_match(bssid, sdata->u.ibss.bssid)) { From 6db4bb5d191b5ea44cd8eb969aafabc652f3fef9 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sun, 29 Sep 2013 21:39:33 +0200 Subject: [PATCH 43/94] mac80211: use sta_info_get_bss() for nl80211 tx and client probing commit 03bb7f42765ce596604f03d179f3137d7df05bba upstream. This allows calls for clients in AP_VLANs (e.g. for 4-addr) to succeed Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg Signed-off-by: Kamal Mostafa --- net/mac80211/cfg.c | 2 +- net/mac80211/tx.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 40732e39e154..a1df92b3e79f 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -3113,7 +3113,7 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev, return -EINVAL; } band = chanctx_conf->def.chan->band; - sta = sta_info_get(sdata, peer); + sta = sta_info_get_bss(sdata, peer); if (sta) { qos = test_sta_flag(sta, WLAN_STA_WME); } else { diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 467c1d1b66f2..6b976b79dc91 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1149,7 +1149,8 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata, tx->sta = rcu_dereference(sdata->u.vlan.sta); if (!tx->sta && sdata->dev->ieee80211_ptr->use_4addr) return TX_DROP; - } else if (info->flags & IEEE80211_TX_CTL_INJECTED || + } else if (info->flags & (IEEE80211_TX_CTL_INJECTED | + IEEE80211_TX_INTFL_NL80211_FRAME_TX) || tx->sdata->control_port_protocol == tx->skb->protocol) { tx->sta = sta_info_get_bss(sdata, hdr->addr1); } From e843462cd07adda4813afe1e230a9c8f5b4b0792 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sun, 29 Sep 2013 21:39:34 +0200 Subject: [PATCH 44/94] mac80211: update sta->last_rx on acked tx frames commit 0c5b93290b2f3c7a376567c03ae8d385b0e99851 upstream. When clients are idle for too long, hostapd sends nullfunc frames for probing. When those are acked by the client, the idle time needs to be updated. To make this work (and to avoid unnecessary probing), update sta->last_rx whenever an ACK was received for a tx packet. Only do this if the flag IEEE80211_HW_REPORTS_TX_ACK_STATUS is set. Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg Signed-off-by: Kamal Mostafa --- net/mac80211/status.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 07d99578a2b1..f3a289872084 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -180,6 +180,9 @@ static void ieee80211_frame_acked(struct sta_info *sta, struct sk_buff *skb) struct ieee80211_local *local = sta->local; struct ieee80211_sub_if_data *sdata = sta->sdata; + if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) + sta->last_rx = jiffies; + if (ieee80211_is_data_qos(mgmt->frame_control)) { struct ieee80211_hdr *hdr = (void *) skb->data; u8 *qc = ieee80211_get_qos_ctl(hdr); From 0056fe49893f11b9ebc49f30a434395fe2e3f85a Mon Sep 17 00:00:00 2001 From: Amitkumar Karwar Date: Fri, 27 Sep 2013 10:55:38 -0700 Subject: [PATCH 45/94] mwifiex: fix SDIO interrupt lost issue commit 453b0c3f6910672f79da354077af728d92f95c5b upstream. 601216e "mwifiex: process RX packets in SDIO IRQ thread directly" introduced a command timeout issue which can be reproduced easily on an AM33xx platform using a test application written by Daniel Mack: https://gist.github.com/zonque/6579314 mwifiex_main_process() is called from both the SDIO handler and the workqueue. In case an interrupt occurs right after the int_status check, but before updating the mwifiex_processing flag, this interrupt gets lost, resulting in a command timeout and consequently a card reset. Let main_proc_lock protect both int_status and mwifiex_processing flag. This fixes the interrupt lost issue. Reported-by: Sven Neumann Reported-by: Andreas Fenkart Tested-by: Daniel Mack Reviewed-by: Dylan Reid Signed-off-by: Amitkumar Karwar Signed-off-by: Bing Zhao Signed-off-by: Paul Stewart Signed-off-by: John W. Linville Signed-off-by: Kamal Mostafa --- drivers/net/wireless/mwifiex/main.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/mwifiex/main.c b/drivers/net/wireless/mwifiex/main.c index 6d9bc63d6dda..a3d1a8950dc8 100644 --- a/drivers/net/wireless/mwifiex/main.c +++ b/drivers/net/wireless/mwifiex/main.c @@ -270,10 +270,12 @@ process_start: } } while (true); - if ((adapter->int_status) || IS_CARD_RX_RCVD(adapter)) - goto process_start; - spin_lock_irqsave(&adapter->main_proc_lock, flags); + if ((adapter->int_status) || IS_CARD_RX_RCVD(adapter)) { + spin_unlock_irqrestore(&adapter->main_proc_lock, flags); + goto process_start; + } + adapter->mwifiex_processing = false; spin_unlock_irqrestore(&adapter->main_proc_lock, flags); From e2e27fcc1287ce6ede1e7d283f3f91550f8b3552 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sat, 5 Oct 2013 14:09:30 +0200 Subject: [PATCH 46/94] ath9k: fix tx queue scheduling after channel changes commit ec30326ea773900da210c495e14cfeb532550ba2 upstream. Otherwise, if queues are full during a scan, tx scheduling does not resume after switching back to the home channel. Signed-off-by: Felix Fietkau Signed-off-by: John W. Linville [ kamal: backport to 3.8 ] Signed-off-by: Kamal Mostafa --- drivers/net/wireless/ath/ath9k/main.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c index fc8dc9394b67..e0eb87b73971 100644 --- a/drivers/net/wireless/ath/ath9k/main.c +++ b/drivers/net/wireless/ath/ath9k/main.c @@ -209,6 +209,7 @@ static bool ath_complete_reset(struct ath_softc *sc, bool start) struct ath_hw *ah = sc->sc_ah; struct ath_common *common = ath9k_hw_common(ah); unsigned long flags; + int i; if (ath_startrecv(sc) != 0) { ath_err(common, "Unable to restart recv logic\n"); @@ -236,6 +237,15 @@ static bool ath_complete_reset(struct ath_softc *sc, bool start) } work: ath_restart_work(sc); + + for (i = 0; i < ATH9K_NUM_TX_QUEUES; i++) { + if (!ATH_TXQ_SETUP(sc, i)) + continue; + + spin_lock_bh(&sc->tx.txq[i].axq_lock); + ath_txq_schedule(sc, &sc->tx.txq[i]); + spin_unlock_bh(&sc->tx.txq[i].axq_lock); + } } if ((ah->caps.hw_caps & ATH9K_HW_CAP_ANT_DIV_COMB) && sc->ant_rx != 3) @@ -546,20 +556,8 @@ static int ath_reset(struct ath_softc *sc, bool retry_tx) int r; ath9k_ps_wakeup(sc); - r = ath_reset_internal(sc, NULL, retry_tx); - if (retry_tx) { - int i; - for (i = 0; i < ATH9K_NUM_TX_QUEUES; i++) { - if (ATH_TXQ_SETUP(sc, i)) { - spin_lock_bh(&sc->tx.txq[i].axq_lock); - ath_txq_schedule(sc, &sc->tx.txq[i]); - spin_unlock_bh(&sc->tx.txq[i].axq_lock); - } - } - } - ath9k_ps_restore(sc); return r; From df318b1b5860933aa78bd7b4aaca239dd8973647 Mon Sep 17 00:00:00 2001 From: Gwendal Grignou Date: Fri, 7 Aug 2009 16:17:49 -0700 Subject: [PATCH 47/94] libata: make ata_eh_qc_retry() bump scmd->allowed on bogus failures commit f13e220161e738c2710b9904dcb3cf8bb0bcce61 upstream. libata EH decrements scmd->retries when the command failed for reasons unrelated to the command itself so that, for example, commands aborted due to suspend / resume cycle don't get penalized; however, decrementing scmd->retries isn't enough for ATA passthrough commands. Without this fix, ATA passthrough commands are not resend to the drive, and no error is signalled to the caller because: - allowed retry count is 1 - ata_eh_qc_complete fill the sense data, so result is valid - sense data is filled with untouched ATA registers. Signed-off-by: Gwendal Grignou Signed-off-by: Tejun Heo Signed-off-by: Kamal Mostafa --- drivers/ata/libata-eh.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index bcf4437214f5..005e458e60a1 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -1322,14 +1322,14 @@ void ata_eh_qc_complete(struct ata_queued_cmd *qc) * should be retried. To be used from EH. * * SCSI midlayer limits the number of retries to scmd->allowed. - * scmd->retries is decremented for commands which get retried + * scmd->allowed is incremented for commands which get retried * due to unrelated failures (qc->err_mask is zero). */ void ata_eh_qc_retry(struct ata_queued_cmd *qc) { struct scsi_cmnd *scmd = qc->scsicmd; - if (!qc->err_mask && scmd->retries) - scmd->retries--; + if (!qc->err_mask) + scmd->allowed++; __ata_eh_qc_complete(qc); } From ddbb4b23dc8ac0bc82104124f77fd55d51fa95d6 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Mon, 16 Sep 2013 11:12:07 +0300 Subject: [PATCH 48/94] mac80211: correctly close cancelled scans commit a754055a1296fcbe6f32de3a5eaca6efb2fd1865 upstream. __ieee80211_scan_completed is called from a worker. This means that the following flow is possible. * driver calls ieee80211_scan_completed * mac80211 cancels the scan (that is already complete) * __ieee80211_scan_completed runs When scan_work will finally run, it will see that the scan hasn't been aborted and might even trigger another scan on another band. This leads to a situation where cfg80211's scan is not done and no further scan can be issued. Fix this by setting a new flag when a HW scan is being cancelled so that no other scan will be triggered. Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg Signed-off-by: Kamal Mostafa --- net/mac80211/ieee80211_i.h | 3 +++ net/mac80211/scan.c | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 55d8f89bd581..abe9359df8f2 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -861,6 +861,8 @@ struct tpt_led_trigger { * that the scan completed. * @SCAN_ABORTED: Set for our scan work function when the driver reported * a scan complete for an aborted scan. + * @SCAN_HW_CANCELLED: Set for our scan work function when the scan is being + * cancelled. */ enum { SCAN_SW_SCANNING, @@ -868,6 +870,7 @@ enum { SCAN_ONCHANNEL_SCANNING, SCAN_COMPLETED, SCAN_ABORTED, + SCAN_HW_CANCELLED, }; /** diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index bf82e69d0601..8d6ba43da210 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -215,6 +215,9 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local) enum ieee80211_band band; int i, ielen, n_chans; + if (test_bit(SCAN_HW_CANCELLED, &local->scanning)) + return false; + do { if (local->hw_scan_band == IEEE80211_NUM_BANDS) return false; @@ -903,7 +906,23 @@ void ieee80211_scan_cancel(struct ieee80211_local *local) if (!local->scan_req) goto out; + /* + * We have a scan running and the driver already reported completion, + * but the worker hasn't run yet or is stuck on the mutex - mark it as + * cancelled. + */ + if (test_bit(SCAN_HW_SCANNING, &local->scanning) && + test_bit(SCAN_COMPLETED, &local->scanning)) { + set_bit(SCAN_HW_CANCELLED, &local->scanning); + goto out; + } + if (test_bit(SCAN_HW_SCANNING, &local->scanning)) { + /* + * Make sure that __ieee80211_scan_completed doesn't trigger a + * scan on another band. + */ + set_bit(SCAN_HW_CANCELLED, &local->scanning); if (local->ops->cancel_hw_scan) drv_cancel_hw_scan(local, rcu_dereference_protected(local->scan_sdata, From ac373d0426651f2b321a49e52d5ba92c5c68d978 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Thu, 3 Oct 2013 23:51:55 +0200 Subject: [PATCH 49/94] can: flexcan: fix mx28 detection by rearanging OF match table commit e358784297992b012e8071764d996191dd2b1a54 upstream. The current implemetation of of_match_device() relies that the of_device_id table in the driver is sorted from most specific to least specific compatible. Without this patch the mx28 is detected as the less specific p1010. This leads to a p1010 specific workaround is activated on the mx28, which is not needed. Signed-off-by: Marc Kleine-Budde Signed-off-by: Kamal Mostafa --- drivers/net/can/flexcan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/can/flexcan.c b/drivers/net/can/flexcan.c index 5817a3961543..4c90a27de6e6 100644 --- a/drivers/net/can/flexcan.c +++ b/drivers/net/can/flexcan.c @@ -968,9 +968,9 @@ static void unregister_flexcandev(struct net_device *dev) } static const struct of_device_id flexcan_of_match[] = { - { .compatible = "fsl,p1010-flexcan", .data = &fsl_p1010_devtype_data, }, - { .compatible = "fsl,imx28-flexcan", .data = &fsl_imx28_devtype_data, }, { .compatible = "fsl,imx6q-flexcan", .data = &fsl_imx6q_devtype_data, }, + { .compatible = "fsl,imx28-flexcan", .data = &fsl_imx28_devtype_data, }, + { .compatible = "fsl,p1010-flexcan", .data = &fsl_p1010_devtype_data, }, { /* sentinel */ }, }; MODULE_DEVICE_TABLE(of, flexcan_of_match); From 35d6478f47c0ff6cb93b5121f1e05703ff850b9a Mon Sep 17 00:00:00 2001 From: Mark Cave-Ayland Date: Tue, 8 Oct 2013 10:18:20 -0500 Subject: [PATCH 50/94] rtlwifi: rtl8192cu: Fix error in pointer arithmetic commit 9473ca6e920a3b9ca902753ce52833657f9221cc upstream. An error in calculating the offset in an skb causes the driver to read essential device info from the wrong locations. The main effect is that automatic gain calculations are nonsense. Signed-off-by: Mark Cave-Ayland Signed-off-by: Larry Finger Signed-off-by: John W. Linville Signed-off-by: Kamal Mostafa --- drivers/net/wireless/rtlwifi/rtl8192cu/trx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/rtlwifi/rtl8192cu/trx.c b/drivers/net/wireless/rtlwifi/rtl8192cu/trx.c index b6222eedb835..03b6d813da9f 100644 --- a/drivers/net/wireless/rtlwifi/rtl8192cu/trx.c +++ b/drivers/net/wireless/rtlwifi/rtl8192cu/trx.c @@ -343,7 +343,8 @@ bool rtl92cu_rx_query_desc(struct ieee80211_hw *hw, (bool)GET_RX_DESC_PAGGR(pdesc)); rx_status->mactime = GET_RX_DESC_TSFL(pdesc); if (phystatus) { - p_drvinfo = (struct rx_fwinfo_92c *)(pdesc + RTL_RX_DESC_SIZE); + p_drvinfo = (struct rx_fwinfo_92c *)(skb->data + + stats->rx_bufshift); rtl92c_translate_rx_signal_stuff(hw, skb, stats, pdesc, p_drvinfo); } From fac3e96df19f239dc7c464e1c0b0d2a22d2580a3 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 11 Oct 2013 14:47:05 +0200 Subject: [PATCH 51/94] wireless: radiotap: fix parsing buffer overrun commit f5563318ff1bde15b10e736e97ffce13be08bc1a upstream. When parsing an invalid radiotap header, the parser can overrun the buffer that is passed in because it doesn't correctly check 1) the minimum radiotap header size 2) the space for extended bitmaps The first issue doesn't affect any in-kernel user as they all check the minimum size before calling the radiotap function. The second issue could potentially affect the kernel if an skb is passed in that consists only of the radiotap header with a lot of extended bitmaps that extend past the SKB. In that case a read-only buffer overrun by at most 4 bytes is possible. Fix this by adding the appropriate checks to the parser. Reported-by: Evan Huus Signed-off-by: Johannes Berg Signed-off-by: Kamal Mostafa --- net/wireless/radiotap.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/wireless/radiotap.c b/net/wireless/radiotap.c index 7d604c06c3dc..a271c27fac77 100644 --- a/net/wireless/radiotap.c +++ b/net/wireless/radiotap.c @@ -97,6 +97,10 @@ int ieee80211_radiotap_iterator_init( struct ieee80211_radiotap_header *radiotap_header, int max_length, const struct ieee80211_radiotap_vendor_namespaces *vns) { + /* check the radiotap header can actually be present */ + if (max_length < sizeof(struct ieee80211_radiotap_header)) + return -EINVAL; + /* Linux only supports version 0 radiotap format */ if (radiotap_header->it_version) return -EINVAL; @@ -131,7 +135,8 @@ int ieee80211_radiotap_iterator_init( */ if ((unsigned long)iterator->_arg - - (unsigned long)iterator->_rtheader > + (unsigned long)iterator->_rtheader + + sizeof(uint32_t) > (unsigned long)iterator->_max_length) return -EINVAL; } From 20e95b44f8e2c20c7f8b2a3303e4637fc5cabf14 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 11 Oct 2013 15:47:06 +0200 Subject: [PATCH 52/94] mac80211: fix crash if bitrate calculation goes wrong commit d86aa4f8ca58898ec6a94c0635da20b948171ed7 upstream. If a frame's timestamp is calculated, and the bitrate calculation goes wrong and returns zero, the system will attempt to divide by zero and crash. Catch this case and print the rate information that the driver reported when this happens. Reported-by: Thomas Lindroth Signed-off-by: Johannes Berg Signed-off-by: Kamal Mostafa --- net/mac80211/util.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/mac80211/util.c b/net/mac80211/util.c index f11e8c540db4..e0ad72df99e1 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -2105,6 +2105,10 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, } rate = cfg80211_calculate_bitrate(&ri); + if (WARN_ONCE(!rate, + "Invalid bitrate: flags=0x%x, idx=%d, vht_nss=%d\n", + status->flag, status->rate_idx, status->vht_nss)) + return 0; /* rewind from end of MPDU */ if (status->flag & RX_FLAG_MACTIME_END) From 86354bd5fff3436e59bd7aa217ea15d44d39e8e8 Mon Sep 17 00:00:00 2001 From: Thomas Hellstrom Date: Wed, 9 Oct 2013 01:42:50 -0700 Subject: [PATCH 53/94] drm/vmwgfx: Don't put resources with invalid id's on lru list commit 26682480c202e7360cbcdc3bc9e962bf749c6b8d upstream. The evict code may try to swap them out causing a BUG in the destroy function. Signed-off-by: Thomas Hellstrom Reviewed-by: Jakob Bornecrantz Signed-off-by: Dave Airlie Signed-off-by: Kamal Mostafa --- drivers/gpu/drm/vmwgfx/vmwgfx_resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c index e01a17b407b2..64530290c642 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c @@ -971,7 +971,7 @@ void vmw_resource_unreserve(struct vmw_resource *res, if (new_backup) res->backup_offset = new_backup_offset; - if (!res->func->may_evict) + if (!res->func->may_evict || res->id == -1) return; write_lock(&dev_priv->resource_lock); From 1ab6437a04ef07519ed030ded0cbc41373c4c77b Mon Sep 17 00:00:00 2001 From: Thomas Hellstrom Date: Wed, 9 Oct 2013 01:42:51 -0700 Subject: [PATCH 54/94] drm/vmwgfx: Don't kill clients on VT switch commit c4249855ac5b2a383704d31e040d3831d6a25c6f upstream. DRI clients that tried to grab the TTM lock when the master (X server) was switched away during a VT switch were sent the SIGTERM signal by the kernel. Fix this so that they are only sent that signal when the master has exited. Signed-off-by: Thomas Hellstrom Reviewed-by: Jakob Bornecrantz Signed-off-by: Dave Airlie Signed-off-by: Kamal Mostafa --- drivers/gpu/drm/vmwgfx/vmwgfx_drv.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c index 161f8b2549aa..099d8d4d46ea 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c @@ -740,9 +740,17 @@ static void vmw_postclose(struct drm_device *dev, struct vmw_fpriv *vmw_fp; vmw_fp = vmw_fpriv(file_priv); - ttm_object_file_release(&vmw_fp->tfile); - if (vmw_fp->locked_master) + + if (vmw_fp->locked_master) { + struct vmw_master *vmaster = + vmw_master(vmw_fp->locked_master); + + ttm_lock_set_kill(&vmaster->lock, true, SIGTERM); + ttm_vt_unlock(&vmaster->lock); drm_master_put(&vmw_fp->locked_master); + } + + ttm_object_file_release(&vmw_fp->tfile); kfree(vmw_fp); } @@ -942,14 +950,13 @@ static void vmw_master_drop(struct drm_device *dev, vmw_fp->locked_master = drm_master_get(file_priv->master); ret = ttm_vt_lock(&vmaster->lock, false, vmw_fp->tfile); - vmw_execbuf_release_pinned_bo(dev_priv); - if (unlikely((ret != 0))) { DRM_ERROR("Unable to lock TTM at VT switch.\n"); drm_master_put(&vmw_fp->locked_master); } - ttm_lock_set_kill(&vmaster->lock, true, SIGTERM); + ttm_lock_set_kill(&vmaster->lock, false, SIGTERM); + vmw_execbuf_release_pinned_bo(dev_priv); if (!dev_priv->enable_fb) { ret = ttm_bo_evict_mm(&dev_priv->bdev, TTM_PL_VRAM); From 8f2ceddd30e0b10264ce73b70b7e8ae43f5c6ff1 Mon Sep 17 00:00:00 2001 From: "Geyslan G. Bem" Date: Fri, 11 Oct 2013 16:49:16 -0300 Subject: [PATCH 55/94] ecryptfs: Fix memory leakage in keystore.c commit 3edc8376c06133e3386265a824869cad03a4efd4 upstream. In 'decrypt_pki_encrypted_session_key' function: Initializes 'payload' pointer and releases it on exit. Signed-off-by: Geyslan G. Bem Signed-off-by: Tyler Hicks [ kamal: backport to 3.8 (context) ] Signed-off-by: Kamal Mostafa --- fs/ecryptfs/keystore.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 2333203a120b..a26bb46ffc77 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -1149,8 +1149,8 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, struct ecryptfs_msg_ctx *msg_ctx; struct ecryptfs_message *msg = NULL; char *auth_tok_sig; - char *payload; - size_t payload_len; + char *payload = NULL; + size_t payload_len = 0; int rc; rc = ecryptfs_get_auth_tok_sig(&auth_tok_sig, auth_tok); @@ -1202,8 +1202,8 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, crypt_stat->key_size); } out: - if (msg) - kfree(msg); + kfree(msg); + kfree(payload); return rc; } From 95e3ee2e369592d92a2d67da82b855907e4f9898 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 16 Oct 2013 11:22:44 +0100 Subject: [PATCH 56/94] drm: Prevent overwriting from userspace underallocating core ioctl structs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit b062672e305ce071f21eb9e18b102c2a430e0999 upstream. Apply the protections from commit 1b2f1489633888d4a06028315dc19d65768a1c05 Author: Dave Airlie Date: Sat Aug 14 20:20:34 2010 +1000 drm: block userspace under allocating buffer and having drivers overwrite it (v2) to the core ioctl structs as well, for we found one instance where there is a 32-/64-bit size mismatch and were guilty of writing beyond the end of the user's buffer. Signed-off-by: Chris Wilson Cc: Dave Airlie Reviewed-by: Ville Syrjälä Cc: dri-devel@lists.freedesktop.org Signed-off-by: Dave Airlie Signed-off-by: Kamal Mostafa --- drivers/gpu/drm/drm_drv.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c index be174cab105a..41777809bc28 100644 --- a/drivers/gpu/drm/drm_drv.c +++ b/drivers/gpu/drm/drm_drv.c @@ -411,9 +411,16 @@ long drm_ioctl(struct file *filp, asize = drv_size; } else if ((nr >= DRM_COMMAND_END) || (nr < DRM_COMMAND_BASE)) { + u32 drv_size; + ioctl = &drm_ioctls[nr]; - cmd = ioctl->cmd; + + drv_size = _IOC_SIZE(ioctl->cmd); usize = asize = _IOC_SIZE(cmd); + if (drv_size > asize) + asize = drv_size; + + cmd = ioctl->cmd; } else goto err_i1; From 2deab1c555d964b0269d6f603401fd78d9fd8df8 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 16 Oct 2013 09:49:02 +0100 Subject: [PATCH 57/94] drm: Pad drm_mode_get_connector to 64-bit boundary commit bc5bd37ce48c66e9192ad2e7231e9678880f6f8e upstream. Pavel Roskin reported that DRM_IOCTL_MODE_GETCONNECTOR was overwritting the 4 bytes beyond the end of its structure with a 32-bit userspace running on a 64-bit kernel. This is due to the padding gcc inserts as the drm_mode_get_connector struct includes a u64 and its size is not a natural multiple of u64s. 64-bit kernel: sizeof(drm_mode_get_connector)=80, alignof=8 sizeof(drm_mode_get_encoder)=20, alignof=4 sizeof(drm_mode_modeinfo)=68, alignof=4 32-bit userspace: sizeof(drm_mode_get_connector)=76, alignof=4 sizeof(drm_mode_get_encoder)=20, alignof=4 sizeof(drm_mode_modeinfo)=68, alignof=4 Fortuituously we can insert explicit padding to the tail of our structures without breaking ABI. Reported-by: Pavel Roskin Signed-off-by: Chris Wilson Cc: Dave Airlie Cc: dri-devel@lists.freedesktop.org Signed-off-by: Dave Airlie Signed-off-by: Kamal Mostafa --- include/uapi/drm/drm_mode.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/uapi/drm/drm_mode.h b/include/uapi/drm/drm_mode.h index 3d6301b6ec16..f604a1aafbde 100644 --- a/include/uapi/drm/drm_mode.h +++ b/include/uapi/drm/drm_mode.h @@ -223,6 +223,8 @@ struct drm_mode_get_connector { __u32 connection; __u32 mm_width, mm_height; /**< HxW in millimeters */ __u32 subpixel; + + __u32 pad; }; #define DRM_MODE_PROP_PENDING (1<<0) From 3211bb3ea5444754b7281e342f613596fed7f374 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 10 Oct 2013 16:45:27 -0400 Subject: [PATCH 58/94] drm/radeon/atom: workaround vbios bug in transmitter table on rs780 commit c23632d4e57c0dd20bf50eca08fa0eb8ad3ff680 upstream. Some rs780 asics seem to be affected as well. See: http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=91f3a6aaf280294b07c05dfe606e6c27b7ba3c72 Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=60791 Signed-off-by: Alex Deucher Signed-off-by: Kamal Mostafa --- drivers/gpu/drm/radeon/atombios_encoders.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/radeon/atombios_encoders.c b/drivers/gpu/drm/radeon/atombios_encoders.c index 3b681de94fcd..e4f56aca9307 100644 --- a/drivers/gpu/drm/radeon/atombios_encoders.c +++ b/drivers/gpu/drm/radeon/atombios_encoders.c @@ -1641,7 +1641,7 @@ radeon_atom_encoder_dpms_dig(struct drm_encoder *encoder, int mode) * does the same thing and more. */ if ((rdev->family != CHIP_RV710) && (rdev->family != CHIP_RV730) && - (rdev->family != CHIP_RS880)) + (rdev->family != CHIP_RS780) && (rdev->family != CHIP_RS880)) atombios_dig_transmitter_setup(encoder, ATOM_TRANSMITTER_ACTION_ENABLE_OUTPUT, 0, 0); } if (ENCODER_MODE_IS_DP(atombios_get_encoder_mode(encoder)) && connector) { From 854eded51db42a759868c732449a5b55bb12b0ac Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Thu, 24 Oct 2013 12:55:17 +1100 Subject: [PATCH 59/94] md: Fix skipping recovery for read-only arrays. commit 61e4947c99c4494336254ec540c50186d186150b upstream. Since: commit 7ceb17e87bde79d285a8b988cfed9eaeebe60b86 md: Allow devices to be re-added to a read-only array. spares are activated on a read-only array. In case of raid1 and raid10 personalities it causes that not-in-sync devices are marked in-sync without checking if recovery has been finished. If a read-only array is degraded and one of its devices is not in-sync (because the array has been only partially recovered) recovery will be skipped. This patch adds checking if recovery has been finished before marking a device in-sync for raid1 and raid10 personalities. In case of raid5 personality such condition is already present (at raid5.c:6029). Bug was introduced in 3.10 and causes data corruption. Signed-off-by: Pawel Baldysiak Signed-off-by: Lukasz Dorau Signed-off-by: NeilBrown Signed-off-by: Kamal Mostafa --- drivers/md/raid1.c | 1 + drivers/md/raid10.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index a382d84de720..21f0c3f7ab77 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1493,6 +1493,7 @@ static int raid1_spare_active(struct mddev *mddev) } } if (rdev + && rdev->recovery_offset == MaxSector && !test_bit(Faulty, &rdev->flags) && !test_and_set_bit(In_sync, &rdev->flags)) { count++; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 92858ab53e02..b8f5688cc7d9 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1718,6 +1718,7 @@ static int raid10_spare_active(struct mddev *mddev) } sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); } else if (tmp->rdev + && tmp->rdev->recovery_offset == MaxSector && !test_bit(Faulty, &tmp->rdev->flags) && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { count++; From 9035f5798bf189e9cebe6eb21c4a33afe6180096 Mon Sep 17 00:00:00 2001 From: Bian Yu Date: Sat, 12 Oct 2013 01:10:03 -0400 Subject: [PATCH 60/94] md: avoid deadlock when md_set_badblocks. commit 905b0297a9533d7a6ee00a01a990456636877dd6 upstream. When operate harddisk and hit errors, md_set_badblocks is called after scsi_restart_operations which already disabled the irq. but md_set_badblocks will call write_sequnlock_irq and enable irq. so softirq can preempt the current thread and that may cause a deadlock. I think this situation should use write_sequnlock_irqsave/irqrestore instead. I met the situation and the call trace is below: [ 638.919974] BUG: spinlock recursion on CPU#0, scsi_eh_13/1010 [ 638.921923] lock: 0xffff8800d4d51fc8, .magic: dead4ead, .owner: scsi_eh_13/1010, .owner_cpu: 0 [ 638.923890] CPU: 0 PID: 1010 Comm: scsi_eh_13 Not tainted 3.12.0-rc5+ #37 [ 638.925844] Hardware name: To be filled by O.E.M. To be filled by O.E.M./MAHOBAY, BIOS 4.6.5 03/05/2013 [ 638.927816] ffff880037ad4640 ffff880118c03d50 ffffffff8172ff85 0000000000000007 [ 638.929829] ffff8800d4d51fc8 ffff880118c03d70 ffffffff81730030 ffff8800d4d51fc8 [ 638.931848] ffffffff81a72eb0 ffff880118c03d90 ffffffff81730056 ffff8800d4d51fc8 [ 638.933884] Call Trace: [ 638.935867] [] dump_stack+0x55/0x76 [ 638.937878] [] spin_dump+0x8a/0x8f [ 638.939861] [] spin_bug+0x21/0x26 [ 638.941836] [] do_raw_spin_lock+0xa4/0xc0 [ 638.943801] [] _raw_spin_lock+0x66/0x80 [ 638.945747] [] ? scsi_device_unbusy+0x9d/0xd0 [ 638.947672] [] ? _raw_spin_unlock+0x2b/0x50 [ 638.949595] [] scsi_device_unbusy+0x9d/0xd0 [ 638.951504] [] scsi_finish_command+0x37/0xe0 [ 638.953388] [] scsi_softirq_done+0xa8/0x140 [ 638.955248] [] blk_done_softirq+0x7b/0x90 [ 638.957116] [] __do_softirq+0xfd/0x330 [ 638.958987] [] ? __lock_release+0x6f/0x100 [ 638.960861] [] call_softirq+0x1c/0x30 [ 638.962724] [] do_softirq+0x8d/0xc0 [ 638.964565] [] irq_exit+0x10e/0x150 [ 638.966390] [] smp_apic_timer_interrupt+0x4a/0x60 [ 638.968223] [] apic_timer_interrupt+0x6f/0x80 [ 638.970079] [] ? __lock_release+0x6f/0x100 [ 638.971899] [] ? _raw_spin_unlock_irq+0x3a/0x50 [ 638.973691] [] ? _raw_spin_unlock_irq+0x30/0x50 [ 638.975475] [] md_set_badblocks+0x1f3/0x4a0 [ 638.977243] [] rdev_set_badblocks+0x27/0x80 [ 638.978988] [] raid5_end_read_request+0x36b/0x4e0 [raid456] [ 638.980723] [] bio_endio+0x1d/0x40 [ 638.982463] [] req_bio_endio.isra.65+0x83/0xa0 [ 638.984214] [] blk_update_request+0x7f/0x350 [ 638.985967] [] blk_update_bidi_request+0x31/0x90 [ 638.987710] [] __blk_end_bidi_request+0x20/0x50 [ 638.989439] [] __blk_end_request_all+0x1f/0x30 [ 638.991149] [] blk_peek_request+0x106/0x250 [ 638.992861] [] ? scsi_kill_request.isra.32+0xe9/0x130 [ 638.994561] [] scsi_request_fn+0x4a/0x3d0 [ 638.996251] [] __blk_run_queue+0x37/0x50 [ 638.997900] [] blk_run_queue+0x2f/0x50 [ 638.999553] [] scsi_run_queue+0xe0/0x1c0 [ 639.001185] [] scsi_run_host_queues+0x21/0x40 [ 639.002798] [] scsi_restart_operations+0x177/0x200 [ 639.004391] [] scsi_error_handler+0xc9/0xe0 [ 639.005996] [] ? scsi_unjam_host+0xd0/0xd0 [ 639.007600] [] kthread+0xdb/0xe0 [ 639.009205] [] ? flush_kthread_worker+0x170/0x170 [ 639.010821] [] ret_from_fork+0x7c/0xb0 [ 639.012437] [] ? flush_kthread_worker+0x170/0x170 This bug was introduce in commit 2e8ac30312973dd20e68073653 (the first time rdev_set_badblock was call from interrupt context), so this patch is appropriate for 3.5 and subsequent kernels. Signed-off-by: Bian Yu Reviewed-by: Jianpeng Ma Signed-off-by: NeilBrown Signed-off-by: Kamal Mostafa --- drivers/md/md.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 0411bdef94a2..a240f16f0e4b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8044,6 +8044,7 @@ static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, u64 *p; int lo, hi; int rv = 1; + unsigned long flags; if (bb->shift < 0) /* badblocks are disabled */ @@ -8058,7 +8059,7 @@ static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, sectors = next - s; } - write_seqlock_irq(&bb->lock); + write_seqlock_irqsave(&bb->lock, flags); p = bb->page; lo = 0; @@ -8174,7 +8175,7 @@ static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, bb->changed = 1; if (!acknowledged) bb->unacked_exist = 1; - write_sequnlock_irq(&bb->lock); + write_sequnlock_irqrestore(&bb->lock, flags); return rv; } From 934175c03857bf49fbf341f846ad8a4e4b674bd6 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Sat, 19 Oct 2013 14:50:28 +0800 Subject: [PATCH 61/94] raid5: set bio bi_vcnt 0 for discard request commit 37c61ff31e9b5e3fcf3cc6579f5c68f6ad40c4b1 upstream. SCSI layer will add new payload for discard request. If two bios are merged to one, the second bio has bi_vcnt 1 which is set in raid5. This will confuse SCSI and cause oops. Suitable for backport to 3.7+ Reported-by: Jes Sorensen Signed-off-by: Shaohua Li Signed-off-by: NeilBrown Acked-by: Martin K. Petersen [ kamal: backport to 3.8 (context) ] Signed-off-by: Kamal Mostafa --- drivers/md/raid5.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 5b022b1195a9..7ace380b74a6 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -672,6 +672,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) bi->bi_io_vec[0].bv_offset = 0; bi->bi_size = STRIPE_SIZE; bi->bi_next = NULL; + /* + * If this is discard request, set bi_vcnt 0. We don't + * want to confuse SCSI because SCSI will replace payload + */ + if (rw & REQ_DISCARD) + bi->bi_vcnt = 0; if (rrdev) set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); @@ -706,6 +712,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) rbi->bi_io_vec[0].bv_offset = 0; rbi->bi_size = STRIPE_SIZE; rbi->bi_next = NULL; + /* + * If this is discard request, set bi_vcnt 0. We don't + * want to confuse SCSI because SCSI will replace payload + */ + if (rw & REQ_DISCARD) + rbi->bi_vcnt = 0; if (conf->mddev->gendisk) trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), rbi, disk_devt(conf->mddev->gendisk), From 67cd3bc6429c9507356f39b9ac0c402b00998ac8 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Sat, 19 Oct 2013 14:51:42 +0800 Subject: [PATCH 62/94] raid5: avoid finding "discard" stripe commit d47648fcf0611812286f68131b40251c6fa54f5e upstream. SCSI discard will damage discard stripe bio setting, eg, some fields are changed. If the stripe is reused very soon, we have wrong bios setting. We remove discard stripe from hash list, so next time the strip will be fully initialized. Suitable for backport to 3.7+. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown Signed-off-by: Kamal Mostafa --- drivers/md/raid5.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 7ace380b74a6..51887b34b3ae 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2841,6 +2841,14 @@ static void handle_stripe_clean_event(struct r5conf *conf, } /* now that discard is done we can proceed with any sync */ clear_bit(STRIPE_DISCARD, &sh->state); + /* + * SCSI discard will change some bio fields and the stripe has + * no updated data, so remove it from hash list and the stripe + * will be reinitialized + */ + spin_lock_irq(&conf->device_lock); + remove_hash(sh); + spin_unlock_irq(&conf->device_lock); if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) set_bit(STRIPE_HANDLE, &sh->state); From c6155614bfa49a64780831b6ddfe6b087629b5ce Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 25 Oct 2013 21:53:33 +0800 Subject: [PATCH 63/94] target/pscsi: fix return value check commit 58932e96e438cd78f75e765d7b87ef39d3533d15 upstream. In case of error, the function scsi_host_lookup() returns NULL pointer not ERR_PTR(). The IS_ERR() test in the return value check should be replaced with NULL test. Signed-off-by: Wei Yongjun Signed-off-by: Nicholas Bellinger Signed-off-by: Kamal Mostafa --- drivers/target/target_core_pscsi.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 55b9530609d8..5975a9454638 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -134,10 +134,10 @@ static int pscsi_pmode_enable_hba(struct se_hba *hba, unsigned long mode_flag) * pSCSI Host ID and enable for phba mode */ sh = scsi_host_lookup(phv->phv_host_id); - if (IS_ERR(sh)) { + if (!sh) { pr_err("pSCSI: Unable to locate SCSI Host for" " phv_host_id: %d\n", phv->phv_host_id); - return PTR_ERR(sh); + return -EINVAL; } phv->phv_lld_host = sh; @@ -515,10 +515,10 @@ static int pscsi_configure_device(struct se_device *dev) sh = phv->phv_lld_host; } else { sh = scsi_host_lookup(pdv->pdv_host_id); - if (IS_ERR(sh)) { + if (!sh) { pr_err("pSCSI: Unable to locate" " pdv_host_id: %d\n", pdv->pdv_host_id); - return PTR_ERR(sh); + return -EINVAL; } } } else { From 4f525c310aa9c3db198cd7fc4f553c301093c2db Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Fri, 25 Oct 2013 10:44:15 -0700 Subject: [PATCH 64/94] vhost/scsi: Fix incorrect usage of get_user_pages_fast write parameter commit 60a01f558af9c48b0bb31f303c479e32721add3f upstream. This patch addresses a long-standing bug where the get_user_pages_fast() write parameter used for setting the underlying page table entry permission bits was incorrectly set to write=1 for data_direction=DMA_TO_DEVICE, and passed into get_user_pages_fast() via vhost_scsi_map_iov_to_sgl(). However, this parameter is intended to signal WRITEs to pinned userspace PTEs for the virtio-scsi DMA_FROM_DEVICE -> READ payload case, and *not* for the virtio-scsi DMA_TO_DEVICE -> WRITE payload case. This bug would manifest itself as random process segmentation faults on KVM host after repeated vhost starts + stops and/or with lots of vhost endpoints + LUNs. Cc: Stefan Hajnoczi Cc: Michael S. Tsirkin Cc: Asias He Signed-off-by: Nicholas Bellinger [ kamal: backport to 3.8 (applied to tcm_vhost.c) ] Signed-off-by: Kamal Mostafa --- drivers/vhost/tcm_vhost.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c index 22321cf84fbe..9412440468b3 100644 --- a/drivers/vhost/tcm_vhost.c +++ b/drivers/vhost/tcm_vhost.c @@ -696,7 +696,7 @@ static void vhost_scsi_handle_vq(struct vhost_scsi *vs) if (data_direction != DMA_NONE) { ret = vhost_scsi_map_iov_to_sgl(tv_cmd, &vq->iov[data_first], data_num, - data_direction == DMA_TO_DEVICE); + data_direction == DMA_FROM_DEVICE); if (unlikely(ret)) { vq_err(vq, "Failed to map iov to sgl\n"); break; /* TODO */ From 2b0675c2bfd5068c1dd19b27dffbeb1876a5cab0 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sat, 26 Oct 2013 23:19:25 +0200 Subject: [PATCH 65/94] parisc: Do not crash 64bit SMP kernels on machines with >= 4GB RAM commit 54e181e073fc1415e41917d725ebdbd7de956455 upstream. Since the beginning of the parisc-linux port, sometimes 64bit SMP kernels were not able to bring up other CPUs than the monarch CPU and instead crashed the kernel. The reason was unclear, esp. since it involved various machines (e.g. J5600, J6750 and SuperDome). Testing showed, that those crashes didn't happened when less than 4GB were installed, or if a 32bit Linux kernel was booted. In the end, the fix for those SMP problems is trivial: During the early phase of the initialization of the CPUs, including the monarch CPU, the PDC_PSW firmware function to enable WIDE (=64bit) mode is called. It's documented that this firmware function may clobber various registers, and one one of those possibly clobbered registers is %cr30 which holds the task thread info pointer. Now, if %cr30 would always have been clobbered, then this bug would have been detected much earlier. But lots of testing finally showed, that - at least for %cr30 - on some machines only the upper 32bits of the 64bit register suddenly turned zero after the firmware call. So, after finding the root cause, the explanation for the various crashes became clear: - On 32bit SMP Linux kernels all upper 32bit were zero, so we didn't faced this problem. - Monarch CPUs in 64bit mode always booted sucessfully, because the inital task thread info pointer was below 4GB. - Secondary CPUs booted sucessfully on machines with less than 4GB RAM because the upper 32bit were zero anyay. - Secondary CPus failed to boot if we had more than 4GB RAM and the task thread info pointer was located above the 4GB boundary. Finally, the patch to fix this problem is trivial by saving the %cr30 register before the firmware call and restoring it afterwards. Signed-off-by: Helge Deller Signed-off-by: John David Anglin Signed-off-by: Helge Deller Signed-off-by: Kamal Mostafa --- arch/parisc/kernel/head.S | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/parisc/kernel/head.S b/arch/parisc/kernel/head.S index 37aabd772fbb..d2d58258aea6 100644 --- a/arch/parisc/kernel/head.S +++ b/arch/parisc/kernel/head.S @@ -195,6 +195,8 @@ common_stext: ldw MEM_PDC_HI(%r0),%r6 depd %r6, 31, 32, %r3 /* move to upper word */ + mfctl %cr30,%r6 /* PCX-W2 firmware bug */ + ldo PDC_PSW(%r0),%arg0 /* 21 */ ldo PDC_PSW_SET_DEFAULTS(%r0),%arg1 /* 2 */ ldo PDC_PSW_WIDE_BIT(%r0),%arg2 /* 2 */ @@ -203,6 +205,8 @@ common_stext: copy %r0,%arg3 stext_pdc_ret: + mtctl %r6,%cr30 /* restore task thread info */ + /* restore rfi target address*/ ldd TI_TASK-THREAD_SZ_ALGN(%sp), %r10 tophys_r1 %r10 From 05076a9ca6e201544af98b66083535007da8200c Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Wed, 3 Jul 2013 15:05:02 -0700 Subject: [PATCH 66/94] dmi: add support for exact DMI matches in addition to substring matching commit 5017b2851373ee15c7035151853bb1448800cae2 upstream. dmi_match() considers a substring match to be a successful match. This is not always sufficient to distinguish between DMI data for different systems. Add support for exact string matching using strcmp() in addition to the substring matching using strstr(). The specific use case in the i915 driver is to allow us to use an exact match for D510MO, without also incorrectly matching D510MOV: { .ident = "Intel D510MO", .matches = { DMI_MATCH(DMI_BOARD_VENDOR, "Intel"), DMI_EXACT_MATCH(DMI_BOARD_NAME, "D510MO"), }, } Signed-off-by: Jani Nikula Cc: Cc: Chris Wilson Cc: Cornel Panceac Acked-by: Daniel Vetter Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds [ kamal: 3.8-stable prereq for various commits ] Signed-off-by: Kamal Mostafa --- drivers/firmware/dmi_scan.c | 12 +++++++++--- include/linux/mod_devicetable.h | 6 ++++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c index 4cd392dbf115..2861ef4570fc 100644 --- a/drivers/firmware/dmi_scan.c +++ b/drivers/firmware/dmi_scan.c @@ -534,9 +534,15 @@ static bool dmi_matches(const struct dmi_system_id *dmi) int s = dmi->matches[i].slot; if (s == DMI_NONE) break; - if (dmi_ident[s] - && strstr(dmi_ident[s], dmi->matches[i].substr)) - continue; + if (dmi_ident[s]) { + if (!dmi->matches[i].exact_match && + strstr(dmi_ident[s], dmi->matches[i].substr)) + continue; + else if (dmi->matches[i].exact_match && + !strcmp(dmi_ident[s], dmi->matches[i].substr)) + continue; + } + /* No match */ return false; } diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index fed3def62818..163eb82b6c5d 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -480,7 +480,8 @@ enum dmi_field { }; struct dmi_strmatch { - unsigned char slot; + unsigned char slot:7; + unsigned char exact_match:1; char substr[79]; }; @@ -508,7 +509,8 @@ struct dmi_system_id { #define dmi_device_id dmi_system_id #endif -#define DMI_MATCH(a, b) { a, b } +#define DMI_MATCH(a, b) { .slot = a, .substr = b } +#define DMI_EXACT_MATCH(a, b) { .slot = a, .substr = b, .exact_match = 1 } #define PLATFORM_NAME_SIZE 20 #define PLATFORM_MODULE_PREFIX "platform:" From 789d9df2dd3ab411da43a516b5a4dc68b372e709 Mon Sep 17 00:00:00 2001 From: Jonathan Austin Date: Tue, 23 Jul 2013 16:42:18 +0100 Subject: [PATCH 67/94] clk: fixup argument order when setting VCO parameters commit 2f9f64bc5aa31836810cd25301aa4772ad73ebab upstream. The order of arguments in the call to vco_set() for the ICST clocks appears to have been switched in error, which results in the VCO not being initialised correctly. This in turn stops the integrated LCD on things like Integrator/CP from working correctly. This patch fixes the order and restores the expected functionality. Reviewed-by: Linus Walleij Signed-off-by: Jonathan Austin Signed-off-by: Mike Turquette Signed-off-by: Kamal Mostafa --- drivers/clk/versatile/clk-icst.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clk/versatile/clk-icst.c b/drivers/clk/versatile/clk-icst.c index 67ccf4aa7277..f5e4c21b301f 100644 --- a/drivers/clk/versatile/clk-icst.c +++ b/drivers/clk/versatile/clk-icst.c @@ -107,7 +107,7 @@ static int icst_set_rate(struct clk_hw *hw, unsigned long rate, vco = icst_hz_to_vco(icst->params, rate); icst->rate = icst_hz(icst->params, vco); - vco_set(icst->vcoreg, icst->lockreg, vco); + vco_set(icst->lockreg, icst->vcoreg, vco); return 0; } From 8201fd95dc667118d37aa9ccf3ec8e5a788eebb4 Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Tue, 15 Oct 2013 02:22:43 +0400 Subject: [PATCH 68/94] xtensa: don't use alternate signal stack on threads commit cba9a90053e3b7973eff4f1946f33032e98eeed5 upstream. According to create_thread(3): "The new thread does not inherit the creating thread's alternate signal stack". Since commit f9a3879a (Fix sigaltstack corruption among cloned threads), current->sas_ss_size is set to 0 for cloned processes sharing VM with their parent. Don't use the (nonexistent) alternate signal stack in this case. This has been broken since commit 29c4dfd9 ([XTENSA] Remove non-rt signal handling). Fixes the SA_ONSTACK part of the nptl/tst-cancel20 test from uClibc. Signed-off-by: Baruch Siach Signed-off-by: Max Filippov Signed-off-by: Chris Zankel Signed-off-by: Kamal Mostafa --- arch/xtensa/kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c index de34d6be91cd..3f4b16093ae0 100644 --- a/arch/xtensa/kernel/signal.c +++ b/arch/xtensa/kernel/signal.c @@ -341,7 +341,7 @@ static int setup_frame(int sig, struct k_sigaction *ka, siginfo_t *info, sp = regs->areg[1]; - if ((ka->sa.sa_flags & SA_ONSTACK) != 0 && ! on_sig_stack(sp)) { + if ((ka->sa.sa_flags & SA_ONSTACK) != 0 && sas_ss_flags(sp) == 0) { sp = current->sas_ss_sp + current->sas_ss_size; } From 546f463f55d8b4b4084a7defe34e09d76479932a Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 24 Oct 2013 01:20:24 +0200 Subject: [PATCH 69/94] ALSA: hda - Fix unbalanced runtime PM refcount after S3/S4 commit e6bbe666673ab044a3d39ddb74e4d9a401cf1d6f upstream. When a machine goes to S3/S4 after power-save is enabled, the runtime PM refcount might be incorrectly decreased because the power-down triggered soon after resume assumes that the controller was already powered up, and issues the pm_notify down. This patch fixes the incorrect pm_notify call simply by checking the current value properly. Signed-off-by: Takashi Iwai Signed-off-by: Kamal Mostafa --- sound/pci/hda/hda_codec.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/pci/hda/hda_codec.c b/sound/pci/hda/hda_codec.c index f283157e60d3..d048204b9852 100644 --- a/sound/pci/hda/hda_codec.c +++ b/sound/pci/hda/hda_codec.c @@ -4514,8 +4514,8 @@ static void hda_power_work(struct work_struct *work) spin_unlock(&codec->power_lock); state = hda_call_codec_suspend(codec, true); - codec->pm_down_notified = 0; - if (!bus->power_keep_link_on && (state & AC_PWRST_CLK_STOP_OK)) { + if (!codec->pm_down_notified && + !bus->power_keep_link_on && (state & AC_PWRST_CLK_STOP_OK)) { codec->pm_down_notified = 1; hda_call_pm_notify(bus, false); } From 959de86657caa41cda77143f76087acb32b4a753 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 28 Oct 2013 14:21:49 +0100 Subject: [PATCH 70/94] ASoC: dapm: Fix source list debugfs outputs commit ff18620c2157671a8ee21ebb8e6a3520ea209b1f upstream. ... due to a copy & paste error. Spotted by coverity CID 710923. Signed-off-by: Takashi Iwai Signed-off-by: Mark Brown Signed-off-by: Kamal Mostafa --- sound/soc/soc-dapm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index 258acadb9e7d..f8495ef22a1e 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -1766,7 +1766,7 @@ static ssize_t dapm_widget_power_read_file(struct file *file, w->active ? "active" : "inactive"); list_for_each_entry(p, &w->sources, list_sink) { - if (p->connected && !p->connected(w, p->sink)) + if (p->connected && !p->connected(w, p->source)) continue; if (p->connect) From b818e76978dda65f9c749970ebef14490eb93387 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 3 Jul 2013 15:05:03 -0700 Subject: [PATCH 71/94] drm/i915: quirk away phantom LVDS on Intel's D510MO mainboard commit e5614f0c2d0f4d7f0b8ef745d34593baf2c5dbf8 upstream. This replaceable mainboard only has a VGA-out, yet it claims to also have a connected LVDS header. Addresses https://bugs.freedesktop.org/show_bug.cgi?id=63860 [jani.nikula@intel.com: use DMI_EXACT_MATCH for board name.] Signed-off-by: Chris Wilson Signed-off-by: Jani Nikula Reported-by: Cc: Cornel Panceac Acked-by: Daniel Vetter Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Kamal Mostafa --- drivers/gpu/drm/i915/intel_lvds.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/gpu/drm/i915/intel_lvds.c b/drivers/gpu/drm/i915/intel_lvds.c index 2f0364d4d70f..5cfd10eacf1a 100644 --- a/drivers/gpu/drm/i915/intel_lvds.c +++ b/drivers/gpu/drm/i915/intel_lvds.c @@ -797,6 +797,14 @@ static const struct dmi_system_id intel_no_lvds[] = { DMI_MATCH(DMI_PRODUCT_NAME, "ESPRIMO Q900"), }, }, + { + .callback = intel_no_lvds_dmi_callback, + .ident = "Intel D510MO", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Intel"), + DMI_EXACT_MATCH(DMI_BOARD_NAME, "D510MO"), + }, + }, { } /* terminating entry */ }; From 24498530d1a0246d7995baa386ee4fc0208911b2 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Wed, 3 Jul 2013 15:05:05 -0700 Subject: [PATCH 72/94] drm/i915: quirk away phantom LVDS on Intel's D525MW mainboard commit dcf6d294830d46b0e6901477fb4bf455281d90c8 upstream. This replaceable mainboard only has a VGA-out, yet it claims to also have a connected LVDS header. Addresses https://bugs.freedesktop.org/show_bug.cgi?id=65256 Signed-off-by: Jani Nikula Reported-by: Cornel Panceac Cc: Chris Wilson Cc: Acked-by: Daniel Vetter Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Kamal Mostafa --- drivers/gpu/drm/i915/intel_lvds.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/gpu/drm/i915/intel_lvds.c b/drivers/gpu/drm/i915/intel_lvds.c index 5cfd10eacf1a..a4f9f6870638 100644 --- a/drivers/gpu/drm/i915/intel_lvds.c +++ b/drivers/gpu/drm/i915/intel_lvds.c @@ -805,6 +805,14 @@ static const struct dmi_system_id intel_no_lvds[] = { DMI_EXACT_MATCH(DMI_BOARD_NAME, "D510MO"), }, }, + { + .callback = intel_no_lvds_dmi_callback, + .ident = "Intel D525MW", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Intel"), + DMI_EXACT_MATCH(DMI_BOARD_NAME, "D525MW"), + }, + }, { } /* terminating entry */ }; From 4694d14ae2ffe12fa49f4b3e57d5c7a6b74c0856 Mon Sep 17 00:00:00 2001 From: Rob Pearce Date: Sun, 27 Oct 2013 16:13:42 +0000 Subject: [PATCH 73/94] drm/i915: No LVDS hardware on Intel D410PT and D425KT commit 645378d85ee524e429aa4cf52806047b56cdc596 upstream. The Intel D410PT(LW) and D425KT Mini-ITX desktop boards both show up as having LVDS but the hardware is not populated. This patch adds them to the list of such systems. Patch is against 3.11.4 v2: Patch revised to match the D425KT exactly as the D425KTW does have LVDS. According to Intel's documentation, the D410PTL and D410PLTW don't. Signed-off-by: Rob Pearce [danvet: Pimp commit message to my liking and add cc: stable.] Signed-off-by: Daniel Vetter Signed-off-by: Kamal Mostafa --- drivers/gpu/drm/i915/intel_lvds.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/gpu/drm/i915/intel_lvds.c b/drivers/gpu/drm/i915/intel_lvds.c index a4f9f6870638..db4e9e61f811 100644 --- a/drivers/gpu/drm/i915/intel_lvds.c +++ b/drivers/gpu/drm/i915/intel_lvds.c @@ -797,6 +797,22 @@ static const struct dmi_system_id intel_no_lvds[] = { DMI_MATCH(DMI_PRODUCT_NAME, "ESPRIMO Q900"), }, }, + { + .callback = intel_no_lvds_dmi_callback, + .ident = "Intel D410PT", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Intel"), + DMI_MATCH(DMI_BOARD_NAME, "D410PT"), + }, + }, + { + .callback = intel_no_lvds_dmi_callback, + .ident = "Intel D425KT", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Intel"), + DMI_EXACT_MATCH(DMI_BOARD_NAME, "D425KT"), + }, + }, { .callback = intel_no_lvds_dmi_callback, .ident = "Intel D510MO", From 7d9e0ed099ab69726fefdcf00835d9c3dbac5747 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:28:43 +0100 Subject: [PATCH 74/94] mm: Wait for THP migrations to complete during NUMA hinting faults commit 42836f5f8baa33085f547098b74aa98991ee9216 upstream. The locking for migrating THP is unusual. While normal page migration prevents parallel accesses using a migration PTE, THP migration relies on a combination of the page_table_lock, the page lock and the existance of the NUMA hinting PTE to guarantee safety but there is a bug in the scheme. If a THP page is currently being migrated and another thread traps a fault on the same page it checks if the page is misplaced. If it is not, then pmd_numa is cleared. The problem is that it checks if the page is misplaced without holding the page lock meaning that the racing thread can be migrating the THP when the second thread clears the NUMA bit and faults a stale page. This patch checks if the page is potentially being migrated and stalls using the lock_page if it is potentially being migrated before checking if the page is misplaced or not. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-6-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar Signed-off-by: Kamal Mostafa --- mm/huge_memory.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a057a7d55ecd..f3868de62a6b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1315,13 +1315,14 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, if (current_nid == numa_node_id()) count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); - target_nid = mpol_misplaced(page, vma, haddr); - if (target_nid == -1) { - put_page(page); - goto clear_pmdnuma; - } + /* + * Acquire the page lock to serialise THP migrations but avoid dropping + * page_table_lock if at all possible + */ + if (trylock_page(page)) + goto got_lock; - /* Acquire the page lock to serialise THP migrations */ + /* Serialise against migrationa and check placement check placement */ spin_unlock(&mm->page_table_lock); lock_page(page); page_locked = true; @@ -1333,9 +1334,17 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, put_page(page); goto out_unlock; } - spin_unlock(&mm->page_table_lock); + +got_lock: + target_nid = mpol_misplaced(page, vma, haddr); + if (target_nid == -1) { + unlock_page(page); + put_page(page); + goto clear_pmdnuma; + } /* Migrate the THP to the requested node */ + spin_unlock(&mm->page_table_lock); migrated = migrate_misplaced_transhuge_page(mm, vma, pmdp, pmd, addr, page, target_nid); From 8d9ad392d9fd68b4f227032569a0aef9467d0952 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 22 Feb 2013 16:34:33 -0800 Subject: [PATCH 75/94] mm: numa: cleanup flow of transhuge page migration commit 340ef3902cf20cec43cdcd1e72ae5cb518be7328 upstream. When correcting commit 04fa5d6a6547 ("mm: migrate: check page_count of THP before migrating") Hugh Dickins noted that the control flow for transhuge migration was difficult to follow. Unconditionally calling put_page() in numamigrate_isolate_page() made the failure paths of both migrate_misplaced_transhuge_page() and migrate_misplaced_page() more complex that they should be. Further, he was extremely wary that an unlock_page() should ever happen after a put_page() even if the put_page() should never be the final put_page. Hugh implemented the following cleanup to simplify the path by calling putback_lru_page() inside numamigrate_isolate_page() if it failed to isolate and always calling unlock_page() within migrate_misplaced_transhuge_page(). There is no functional change after this patch is applied but the code is easier to follow and unlock_page() always happens before put_page(). [mgorman@suse.de: changelog only] Signed-off-by: Mel Gorman Signed-off-by: Hugh Dickins Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Ingo Molnar Cc: Simon Jeons Cc: Wanpeng Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds [ kamal: 3.8-stable prereq for various ] Signed-off-by: Kamal Mostafa --- mm/huge_memory.c | 28 +++++--------- mm/migrate.c | 99 ++++++++++++++++++++++-------------------------- 2 files changed, 54 insertions(+), 73 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f3868de62a6b..e19c20934cae 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1302,7 +1302,6 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, int target_nid; int current_nid = -1; bool migrated; - bool page_locked = false; spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(pmd, *pmdp))) @@ -1325,7 +1324,6 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Serialise against migrationa and check placement check placement */ spin_unlock(&mm->page_table_lock); lock_page(page); - page_locked = true; /* Confirm the PTE did not while locked */ spin_lock(&mm->page_table_lock); @@ -1346,34 +1344,26 @@ got_lock: /* Migrate the THP to the requested node */ spin_unlock(&mm->page_table_lock); migrated = migrate_misplaced_transhuge_page(mm, vma, - pmdp, pmd, addr, - page, target_nid); - if (migrated) - current_nid = target_nid; - else { - spin_lock(&mm->page_table_lock); - if (unlikely(!pmd_same(pmd, *pmdp))) { - unlock_page(page); - goto out_unlock; - } - goto clear_pmdnuma; - } + pmdp, pmd, addr, page, target_nid); + if (!migrated) + goto check_same; - task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); + task_numa_fault(target_nid, HPAGE_PMD_NR, true); return 0; +check_same: + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(pmd, *pmdp))) + goto out_unlock; clear_pmdnuma: pmd = pmd_mknonnuma(pmd); set_pmd_at(mm, haddr, pmdp, pmd); VM_BUG_ON(pmd_numa(*pmdp)); update_mmu_cache_pmd(vma, addr, pmdp); - if (page_locked) - unlock_page(page); - out_unlock: spin_unlock(&mm->page_table_lock); if (current_nid != -1) - task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); + task_numa_fault(current_nid, HPAGE_PMD_NR, false); return 0; } diff --git a/mm/migrate.c b/mm/migrate.c index ba30b160a11a..811a2cab11fb 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1570,39 +1570,38 @@ bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) { - int ret = 0; + int page_lru; /* Avoid migrating to a node that is nearly full */ - if (migrate_balanced_pgdat(pgdat, 1)) { - int page_lru; + if (!migrate_balanced_pgdat(pgdat, 1)) + return 0; - if (isolate_lru_page(page)) { - put_page(page); - return 0; - } - - /* Page is isolated */ - ret = 1; - page_lru = page_is_file_cache(page); - if (!PageTransHuge(page)) - inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru); - else - mod_zone_page_state(page_zone(page), - NR_ISOLATED_ANON + page_lru, - HPAGE_PMD_NR); - } + if (isolate_lru_page(page)) + return 0; /* - * Page is either isolated or there is not enough space on the target - * node. If isolated, then it has taken a reference count and the - * callers reference can be safely dropped without the page - * disappearing underneath us during migration. Otherwise the page is - * not to be migrated but the callers reference should still be - * dropped so it does not leak. + * migrate_misplaced_transhuge_page() skips page migration's usual + * check on page_count(), so we must do it here, now that the page + * has been isolated: a GUP pin, or any other pin, prevents migration. + * The expected page count is 3: 1 for page's mapcount and 1 for the + * caller's pin and 1 for the reference taken by isolate_lru_page(). + */ + if (PageTransHuge(page) && page_count(page) != 3) { + putback_lru_page(page); + return 0; + } + + page_lru = page_is_file_cache(page); + mod_zone_page_state(page_zone(page), NR_ISOLATED_ANON + page_lru, + hpage_nr_pages(page)); + + /* + * Isolating the page has taken another reference, so the + * caller's reference can be safely dropped without the page + * disappearing underneath us during migration. */ put_page(page); - - return ret; + return 1; } /* @@ -1613,7 +1612,7 @@ int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) int migrate_misplaced_page(struct page *page, int node) { pg_data_t *pgdat = NODE_DATA(node); - int isolated = 0; + int isolated; int nr_remaining; LIST_HEAD(migratepages); @@ -1621,20 +1620,16 @@ int migrate_misplaced_page(struct page *page, int node) * Don't migrate pages that are mapped in multiple processes. * TODO: Handle false sharing detection instead of this hammer */ - if (page_mapcount(page) != 1) { - put_page(page); + if (page_mapcount(page) != 1) goto out; - } /* * Rate-limit the amount of data that is being migrated to a node. * Optimal placement is no good if the memory bus is saturated and * all the time is being spent migrating! */ - if (numamigrate_update_ratelimit(pgdat, 1)) { - put_page(page); + if (numamigrate_update_ratelimit(pgdat, 1)) goto out; - } isolated = numamigrate_isolate_page(pgdat, page); if (!isolated) @@ -1651,12 +1646,19 @@ int migrate_misplaced_page(struct page *page, int node) } else count_vm_numa_event(NUMA_PAGE_MIGRATE); BUG_ON(!list_empty(&migratepages)); -out: return isolated; + +out: + put_page(page); + return 0; } #endif /* CONFIG_NUMA_BALANCING */ #if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) +/* + * Migrates a THP to a given target node. page must be locked and is unlocked + * before returning. + */ int migrate_misplaced_transhuge_page(struct mm_struct *mm, struct vm_area_struct *vma, pmd_t *pmd, pmd_t entry, @@ -1687,29 +1689,15 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, new_page = alloc_pages_node(node, (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER); - if (!new_page) { - count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); - goto out_dropref; - } + if (!new_page) + goto out_fail; + page_xchg_last_nid(new_page, page_last_nid(page)); isolated = numamigrate_isolate_page(pgdat, page); - - /* - * Failing to isolate or a GUP pin prevents migration. The expected - * page count is 2. 1 for anonymous pages without a mapping and 1 - * for the callers pin. If the page was isolated, the page will - * need to be put back on the LRU. - */ - if (!isolated || page_count(page) != 2) { - count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); + if (!isolated) { put_page(new_page); - if (isolated) { - putback_lru_page(page); - isolated = 0; - goto out; - } - goto out_keep_locked; + goto out_fail; } /* Prepare a page as a migration target */ @@ -1741,6 +1729,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, putback_lru_page(page); count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); + isolated = 0; goto out; } @@ -1785,9 +1774,11 @@ out: -HPAGE_PMD_NR); return isolated; +out_fail: + count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); out_dropref: + unlock_page(page); put_page(page); -out_keep_locked: return 0; } #endif /* CONFIG_NUMA_BALANCING */ From a0b61a5bcad2dc65f791df65cdf581cf680bf58f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:28:44 +0100 Subject: [PATCH 76/94] mm: Prevent parallel splits during THP migration commit 587fe586f44a48f9691001ba6c45b86c8e4ba21f upstream. THP migrations are serialised by the page lock but on its own that does not prevent THP splits. If the page is split during THP migration then the pmd_same checks will prevent page table corruption but the unlock page and other fix-ups potentially will cause corruption. This patch takes the anon_vma lock to prevent parallel splits during migration. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-7-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar Signed-off-by: Kamal Mostafa --- mm/huge_memory.c | 44 ++++++++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e19c20934cae..b698c7be3f71 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1297,18 +1297,18 @@ out: int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pmd_t pmd, pmd_t *pmdp) { + struct anon_vma *anon_vma = NULL; struct page *page; unsigned long haddr = addr & HPAGE_PMD_MASK; int target_nid; int current_nid = -1; - bool migrated; + bool migrated, page_locked; spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(pmd, *pmdp))) goto out_unlock; page = pmd_page(pmd); - get_page(page); current_nid = page_to_nid(page); count_vm_numa_event(NUMA_HINT_FAULTS); if (current_nid == numa_node_id()) @@ -1318,12 +1318,29 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, * Acquire the page lock to serialise THP migrations but avoid dropping * page_table_lock if at all possible */ - if (trylock_page(page)) - goto got_lock; + page_locked = trylock_page(page); + target_nid = mpol_misplaced(page, vma, haddr); + if (target_nid == -1) { + /* If the page was locked, there are no parallel migrations */ + if (page_locked) { + unlock_page(page); + goto clear_pmdnuma; + } - /* Serialise against migrationa and check placement check placement */ + /* Otherwise wait for potential migrations and retry fault */ + spin_unlock(&mm->page_table_lock); + wait_on_page_locked(page); + goto out; + } + + /* Page is misplaced, serialise migrations and parallel THP splits */ + get_page(page); spin_unlock(&mm->page_table_lock); - lock_page(page); + if (!page_locked) { + lock_page(page); + page_locked = true; + } + anon_vma = page_lock_anon_vma_read(page); /* Confirm the PTE did not while locked */ spin_lock(&mm->page_table_lock); @@ -1333,14 +1350,6 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out_unlock; } -got_lock: - target_nid = mpol_misplaced(page, vma, haddr); - if (target_nid == -1) { - unlock_page(page); - put_page(page); - goto clear_pmdnuma; - } - /* Migrate the THP to the requested node */ spin_unlock(&mm->page_table_lock); migrated = migrate_misplaced_transhuge_page(mm, vma, @@ -1349,6 +1358,8 @@ got_lock: goto check_same; task_numa_fault(target_nid, HPAGE_PMD_NR, true); + if (anon_vma) + page_unlock_anon_vma_read(anon_vma); return 0; check_same: @@ -1362,6 +1373,11 @@ clear_pmdnuma: update_mmu_cache_pmd(vma, addr, pmdp); out_unlock: spin_unlock(&mm->page_table_lock); + +out: + if (anon_vma) + page_unlock_anon_vma_read(anon_vma); + if (current_nid != -1) task_numa_fault(current_nid, HPAGE_PMD_NR, false); return 0; From b146128c397f57f1142e7667aed24a356ceb822b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:28:45 +0100 Subject: [PATCH 77/94] mm: numa: Sanitize task_numa_fault() callsites commit c61109e34f60f6e85bb43c5a1cd51c0e3db40847 upstream. There are three callers of task_numa_fault(): - do_huge_pmd_numa_page(): Accounts against the current node, not the node where the page resides, unless we migrated, in which case it accounts against the node we migrated to. - do_numa_page(): Accounts against the current node, not the node where the page resides, unless we migrated, in which case it accounts against the node we migrated to. - do_pmd_numa_page(): Accounts not at all when the page isn't migrated, otherwise accounts against the node we migrated towards. This seems wrong to me; all three sites should have the same sementaics, furthermore we should accounts against where the page really is, we already know where the task is. So modify all three sites to always account; we did after all receive the fault; and always account to where the page is after migration, regardless of success. They all still differ on when they clear the PTE/PMD; ideally that would get sorted too. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-8-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar [ kamal: backport to 3.8 (context) ] Signed-off-by: Kamal Mostafa --- mm/huge_memory.c | 23 +++++++++++---------- mm/memory.c | 53 +++++++++++++++++++----------------------------- 2 files changed, 33 insertions(+), 43 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b698c7be3f71..1390fdd5076f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1300,18 +1300,19 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, struct anon_vma *anon_vma = NULL; struct page *page; unsigned long haddr = addr & HPAGE_PMD_MASK; + int page_nid = -1, this_nid = numa_node_id(); int target_nid; - int current_nid = -1; - bool migrated, page_locked; + bool page_locked; + bool migrated = false; spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(pmd, *pmdp))) goto out_unlock; page = pmd_page(pmd); - current_nid = page_to_nid(page); + page_nid = page_to_nid(page); count_vm_numa_event(NUMA_HINT_FAULTS); - if (current_nid == numa_node_id()) + if (page_nid == this_nid) count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); /* @@ -1354,13 +1355,12 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, spin_unlock(&mm->page_table_lock); migrated = migrate_misplaced_transhuge_page(mm, vma, pmdp, pmd, addr, page, target_nid); - if (!migrated) + if (migrated) + page_nid = target_nid; + else goto check_same; - task_numa_fault(target_nid, HPAGE_PMD_NR, true); - if (anon_vma) - page_unlock_anon_vma_read(anon_vma); - return 0; + goto out; check_same: spin_lock(&mm->page_table_lock); @@ -1378,8 +1378,9 @@ out: if (anon_vma) page_unlock_anon_vma_read(anon_vma); - if (current_nid != -1) - task_numa_fault(current_nid, HPAGE_PMD_NR, false); + if (page_nid != -1) + task_numa_fault(page_nid, HPAGE_PMD_NR, migrated); + return 0; } diff --git a/mm/memory.c b/mm/memory.c index 35b6109f5284..5bcfbc1753aa 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3485,12 +3485,12 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, } int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, - unsigned long addr, int current_nid) + unsigned long addr, int page_nid) { get_page(page); count_vm_numa_event(NUMA_HINT_FAULTS); - if (current_nid == numa_node_id()) + if (page_nid == numa_node_id()) count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); return mpol_misplaced(page, vma, addr); @@ -3501,7 +3501,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, { struct page *page = NULL; spinlock_t *ptl; - int current_nid = -1; + int page_nid = -1; int target_nid; bool migrated = false; @@ -3531,15 +3531,10 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, return 0; } - current_nid = page_to_nid(page); - target_nid = numa_migrate_prep(page, vma, addr, current_nid); + page_nid = page_to_nid(page); + target_nid = numa_migrate_prep(page, vma, addr, page_nid); pte_unmap_unlock(ptep, ptl); if (target_nid == -1) { - /* - * Account for the fault against the current node if it not - * being replaced regardless of where the page is located. - */ - current_nid = numa_node_id(); put_page(page); goto out; } @@ -3547,11 +3542,11 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Migrate to the requested node */ migrated = migrate_misplaced_page(page, target_nid); if (migrated) - current_nid = target_nid; + page_nid = target_nid; out: - if (current_nid != -1) - task_numa_fault(current_nid, 1, migrated); + if (page_nid != -1) + task_numa_fault(page_nid, 1, migrated); return 0; } @@ -3566,7 +3561,6 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long offset; spinlock_t *ptl; bool numa = false; - int local_nid = numa_node_id(); spin_lock(&mm->page_table_lock); pmd = *pmdp; @@ -3589,9 +3583,10 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) { pte_t pteval = *pte; struct page *page; - int curr_nid = local_nid; + int page_nid = -1; int target_nid; - bool migrated; + bool migrated = false; + if (!pte_present(pteval)) continue; if (!pte_numa(pteval)) @@ -3613,25 +3608,19 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(page_mapcount(page) != 1)) continue; - /* - * Note that the NUMA fault is later accounted to either - * the node that is currently running or where the page is - * migrated to. - */ - curr_nid = local_nid; - target_nid = numa_migrate_prep(page, vma, addr, - page_to_nid(page)); - if (target_nid == -1) { + page_nid = page_to_nid(page); + target_nid = numa_migrate_prep(page, vma, addr, page_nid); + pte_unmap_unlock(pte, ptl); + if (target_nid != -1) { + migrated = migrate_misplaced_page(page, target_nid); + if (migrated) + page_nid = target_nid; + } else { put_page(page); - continue; } - /* Migrate to the requested node */ - pte_unmap_unlock(pte, ptl); - migrated = migrate_misplaced_page(page, target_nid); - if (migrated) - curr_nid = target_nid; - task_numa_fault(curr_nid, 1, migrated); + if (page_nid != -1) + task_numa_fault(page_nid, 1, migrated); pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); } From e68d262b71526fe99702cf0eebc928464c2f41aa Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:28:46 +0100 Subject: [PATCH 78/94] mm: Close races between THP migration and PMD numa clearing commit 3f926ab945b60a5824369d21add7710622a2eac0 upstream. THP migration uses the page lock to guard against parallel allocations but there are cases like this still open Task A Task B --------------------- --------------------- do_huge_pmd_numa_page do_huge_pmd_numa_page lock_page mpol_misplaced == -1 unlock_page goto clear_pmdnuma lock_page mpol_misplaced == 2 migrate_misplaced_transhuge pmd = pmd_mknonnuma set_pmd_at During hours of testing, one crashed with weird errors and while I have no direct evidence, I suspect something like the race above happened. This patch extends the page lock to being held until the pmd_numa is cleared to prevent migration starting in parallel while the pmd_numa is being cleared. It also flushes the old pmd entry and orders pagetable insertion before rmap insertion. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-9-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar [ kamal: backport to 3.8 (context) ] Signed-off-by: Kamal Mostafa --- mm/huge_memory.c | 29 +++++++++++++++-------------- mm/migrate.c | 19 +++++++++++-------- 2 files changed, 26 insertions(+), 22 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1390fdd5076f..9c377763f41c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1323,24 +1323,25 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, target_nid = mpol_misplaced(page, vma, haddr); if (target_nid == -1) { /* If the page was locked, there are no parallel migrations */ - if (page_locked) { - unlock_page(page); + if (page_locked) goto clear_pmdnuma; - } - /* Otherwise wait for potential migrations and retry fault */ + /* + * Otherwise wait for potential migrations and retry. We do + * relock and check_same as the page may no longer be mapped. + * As the fault is being retried, do not account for it. + */ spin_unlock(&mm->page_table_lock); wait_on_page_locked(page); + page_nid = -1; goto out; } /* Page is misplaced, serialise migrations and parallel THP splits */ get_page(page); spin_unlock(&mm->page_table_lock); - if (!page_locked) { + if (!page_locked) lock_page(page); - page_locked = true; - } anon_vma = page_lock_anon_vma_read(page); /* Confirm the PTE did not while locked */ @@ -1348,29 +1349,29 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(!pmd_same(pmd, *pmdp))) { unlock_page(page); put_page(page); + page_nid = -1; goto out_unlock; } - /* Migrate the THP to the requested node */ + /* + * Migrate the THP to the requested node, returns with page unlocked + * and pmd_numa cleared. + */ spin_unlock(&mm->page_table_lock); migrated = migrate_misplaced_transhuge_page(mm, vma, pmdp, pmd, addr, page, target_nid); if (migrated) page_nid = target_nid; - else - goto check_same; goto out; -check_same: - spin_lock(&mm->page_table_lock); - if (unlikely(!pmd_same(pmd, *pmdp))) - goto out_unlock; clear_pmdnuma: + BUG_ON(!PageLocked(page)); pmd = pmd_mknonnuma(pmd); set_pmd_at(mm, haddr, pmdp, pmd); VM_BUG_ON(pmd_numa(*pmdp)); update_mmu_cache_pmd(vma, addr, pmdp); + unlock_page(page); out_unlock: spin_unlock(&mm->page_table_lock); diff --git a/mm/migrate.c b/mm/migrate.c index 811a2cab11fb..d2296c5b020b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1725,12 +1725,12 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, unlock_page(new_page); put_page(new_page); /* Free it */ - unlock_page(page); + /* Retake the callers reference and putback on LRU */ + get_page(page); putback_lru_page(page); - - count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); - isolated = 0; - goto out; + mod_zone_page_state(page_zone(page), + NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR); + goto out_fail; } /* @@ -1747,9 +1747,9 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); entry = pmd_mkhuge(entry); - page_add_new_anon_rmap(new_page, vma, haddr); - + pmdp_clear_flush(vma, haddr, pmd); set_pmd_at(mm, haddr, pmd, entry); + page_add_new_anon_rmap(new_page, vma, haddr); update_mmu_cache_pmd(vma, address, &entry); page_remove_rmap(page); /* @@ -1768,7 +1768,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); -out: mod_zone_page_state(page_zone(page), NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR); @@ -1777,6 +1776,10 @@ out: out_fail: count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); out_dropref: + entry = pmd_mknonnuma(entry); + set_pmd_at(mm, haddr, pmd, entry); + update_mmu_cache_pmd(vma, address, &entry); + unlock_page(page); put_page(page); return 0; From 27b2d5c54874bc04c15ba6c1a59a643774cd6961 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:28:47 +0100 Subject: [PATCH 79/94] mm: Account for a THP NUMA hinting update as one PTE update commit 0255d491848032f6c601b6410c3b8ebded3a37b1 upstream. A THP PMD update is accounted for as 512 pages updated in vmstat. This is large difference when estimating the cost of automatic NUMA balancing and can be misleading when comparing results that had collapsed versus split THP. This patch addresses the accounting issue. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-10-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar Signed-off-by: Kamal Mostafa --- mm/mprotect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mprotect.c b/mm/mprotect.c index 94722a4d6b43..2bbb648ea73f 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -145,7 +145,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, split_huge_page_pmd(vma, addr, pmd); else if (change_huge_pmd(vma, pmd, addr, newprot, prot_numa)) { - pages += HPAGE_PMD_NR; + pages++; continue; } /* fall through */ From 16e22f5ae6b7e6a84288aaef04c788f9f1bde83e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 29 Oct 2013 10:21:34 -0700 Subject: [PATCH 80/94] Fix a few incorrectly checked [io_]remap_pfn_range() calls commit 7314e613d5ff9f0934f7a0f74ed7973b903315d1 upstream. Nico Golde reports a few straggling uses of [io_]remap_pfn_range() that really should use the vm_iomap_memory() helper. This trivially converts two of them to the helper, and comments about why the third one really needs to continue to use remap_pfn_range(), and adds the missing size check. Reported-by: Nico Golde Signed-off-by: Linus Torvalds --- drivers/uio/uio.c | 16 +++++++++++++++- drivers/video/au1100fb.c | 26 +------------------------- drivers/video/au1200fb.c | 25 +------------------------ 3 files changed, 17 insertions(+), 50 deletions(-) diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index 5110f367f1f1..96f13593bb53 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -650,14 +650,28 @@ static int uio_mmap_physical(struct vm_area_struct *vma) { struct uio_device *idev = vma->vm_private_data; int mi = uio_find_mem_index(vma); + struct uio_mem *mem; if (mi < 0) return -EINVAL; + mem = idev->info->mem + mi; + + if (vma->vm_end - vma->vm_start > mem->size) + return -EINVAL; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + /* + * We cannot use the vm_iomap_memory() helper here, + * because vma->vm_pgoff is the map index we looked + * up above in uio_find_mem_index(), rather than an + * actual page offset into the mmap. + * + * So we just do the physical mmap without a page + * offset. + */ return remap_pfn_range(vma, vma->vm_start, - idev->info->mem[mi].addr >> PAGE_SHIFT, + mem->addr >> PAGE_SHIFT, vma->vm_end - vma->vm_start, vma->vm_page_prot); } diff --git a/drivers/video/au1100fb.c b/drivers/video/au1100fb.c index ddabaa867b0d..f8d8b97bbdb9 100644 --- a/drivers/video/au1100fb.c +++ b/drivers/video/au1100fb.c @@ -375,39 +375,15 @@ void au1100fb_fb_rotate(struct fb_info *fbi, int angle) int au1100fb_fb_mmap(struct fb_info *fbi, struct vm_area_struct *vma) { struct au1100fb_device *fbdev; - unsigned int len; - unsigned long start=0, off; fbdev = to_au1100fb_device(fbi); - if (vma->vm_pgoff > (~0UL >> PAGE_SHIFT)) { - return -EINVAL; - } - - start = fbdev->fb_phys & PAGE_MASK; - len = PAGE_ALIGN((start & ~PAGE_MASK) + fbdev->fb_len); - - off = vma->vm_pgoff << PAGE_SHIFT; - - if ((vma->vm_end - vma->vm_start + off) > len) { - return -EINVAL; - } - - off += start; - vma->vm_pgoff = off >> PAGE_SHIFT; - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); pgprot_val(vma->vm_page_prot) |= (6 << 9); //CCA=6 vma->vm_flags |= VM_IO; - if (io_remap_pfn_range(vma, vma->vm_start, off >> PAGE_SHIFT, - vma->vm_end - vma->vm_start, - vma->vm_page_prot)) { - return -EAGAIN; - } - - return 0; + return vm_iomap_memory(vma, fbdev->fb_phys, fbdev->fb_len); } static struct fb_ops au1100fb_ops = diff --git a/drivers/video/au1200fb.c b/drivers/video/au1200fb.c index 1b59054fc6a4..8f08ae750da5 100644 --- a/drivers/video/au1200fb.c +++ b/drivers/video/au1200fb.c @@ -1233,38 +1233,15 @@ static int au1200fb_fb_blank(int blank_mode, struct fb_info *fbi) * method mainly to allow the use of the TLB streaming flag (CCA=6) */ static int au1200fb_fb_mmap(struct fb_info *info, struct vm_area_struct *vma) - { - unsigned int len; - unsigned long start=0, off; struct au1200fb_device *fbdev = info->par; - if (vma->vm_pgoff > (~0UL >> PAGE_SHIFT)) { - return -EINVAL; - } - - start = fbdev->fb_phys & PAGE_MASK; - len = PAGE_ALIGN((start & ~PAGE_MASK) + fbdev->fb_len); - - off = vma->vm_pgoff << PAGE_SHIFT; - - if ((vma->vm_end - vma->vm_start + off) > len) { - return -EINVAL; - } - - off += start; - vma->vm_pgoff = off >> PAGE_SHIFT; - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); pgprot_val(vma->vm_page_prot) |= _CACHE_MASK; /* CCA=7 */ vma->vm_flags |= VM_IO; - return io_remap_pfn_range(vma, vma->vm_start, off >> PAGE_SHIFT, - vma->vm_end - vma->vm_start, - vma->vm_page_prot); - - return 0; + return vm_iomap_memory(vma, fbdev->fb_phys, fbdev->fb_len); } static void set_global(u_int cmd, struct au1200_lcd_global_regs_t *pdata) From 23b431250963bd1284e68606b874b74d052750d4 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 30 Oct 2013 12:29:40 +0100 Subject: [PATCH 81/94] ALSA: hda - Add a fixup for ASUS N76VZ commit 6fc16e58adf50c0f1e4478538983fb5ff6f453d4 upstream. ASUS N76VZ needs the same fixup as N56VZ for supporting the boost speaker. Bugzilla: https://bugzilla.novell.com/show_bug.cgi?id=846529 Signed-off-by: Takashi Iwai Signed-off-by: Kamal Mostafa --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 73d3c8f2dbaa..d8c539bc2cc3 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -6946,6 +6946,7 @@ static const struct snd_pci_quirk alc662_fixup_tbl[] = { SND_PCI_QUIRK(0x1025, 0x038b, "Acer Aspire 8943G", ALC662_FIXUP_ASPIRE), SND_PCI_QUIRK(0x103c, 0x1632, "HP RP5800", ALC662_FIXUP_HP_RP5800), SND_PCI_QUIRK(0x1043, 0x1477, "ASUS N56VZ", ALC662_FIXUP_ASUS_MODE4), + SND_PCI_QUIRK(0x1043, 0x1bf3, "ASUS N76VZ", ALC662_FIXUP_ASUS_MODE4), SND_PCI_QUIRK(0x1043, 0x8469, "ASUS mobo", ALC662_FIXUP_NO_JACK_DETECT), SND_PCI_QUIRK(0x105b, 0x0cd6, "Foxconn", ALC662_FIXUP_ASUS_MODE2), SND_PCI_QUIRK(0x144d, 0xc051, "Samsung R720", ALC662_FIXUP_IDEAPAD), From 4233e7e94f13bfc1b21d1e4fb54a34918d28f5b4 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 30 Oct 2013 08:35:02 +0100 Subject: [PATCH 82/94] ASoC: wm_hubs: Add missing break in hp_supply_event() commit 268ff14525edba31da29a12a9dd693cdd6a7872e upstream. Spotted by coverity CID 115170. Signed-off-by: Takashi Iwai Signed-off-by: Mark Brown Signed-off-by: Kamal Mostafa --- sound/soc/codecs/wm_hubs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/soc/codecs/wm_hubs.c b/sound/soc/codecs/wm_hubs.c index 867ae97ddcec..e7d0aee164cb 100644 --- a/sound/soc/codecs/wm_hubs.c +++ b/sound/soc/codecs/wm_hubs.c @@ -527,6 +527,7 @@ static int hp_supply_event(struct snd_soc_dapm_widget *w, hubs->hp_startup_mode); break; } + break; case SND_SOC_DAPM_PRE_PMD: snd_soc_update_bits(codec, WM8993_CHARGE_PUMP_1, From 364d11a0f36ed035a0adf09699da8d8b710a553c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 29 Oct 2013 22:06:04 +0300 Subject: [PATCH 83/94] uml: check length in exitcode_proc_write() commit 201f99f170df14ba52ea4c52847779042b7a623b upstream. We don't cap the size of buffer from the user so we could write past the end of the array here. Only root can write to this file. Reported-by: Nico Golde Reported-by: Fabian Yamaguchi Signed-off-by: Dan Carpenter Signed-off-by: Linus Torvalds Signed-off-by: Kamal Mostafa --- arch/um/kernel/exitcode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/um/kernel/exitcode.c b/arch/um/kernel/exitcode.c index 829df49dee99..41ebbfebb333 100644 --- a/arch/um/kernel/exitcode.c +++ b/arch/um/kernel/exitcode.c @@ -40,9 +40,11 @@ static ssize_t exitcode_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *pos) { char *end, buf[sizeof("nnnnn\0")]; + size_t size; int tmp; - if (copy_from_user(buf, buffer, count)) + size = min(count, sizeof(buf)); + if (copy_from_user(buf, buffer, size)) return -EFAULT; tmp = simple_strtol(buf, &end, 0); From f9fc704132b2c8cffc319d0156a5db78e5c958de Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 29 Oct 2013 22:07:47 +0300 Subject: [PATCH 84/94] staging: ozwpan: prevent overflow in oz_cdev_write() commit c2c65cd2e14ada6de44cb527e7f1990bede24e15 upstream. We need to check "count" so we don't overflow the ei->data buffer. Reported-by: Nico Golde Reported-by: Fabian Yamaguchi Signed-off-by: Dan Carpenter Signed-off-by: Linus Torvalds Signed-off-by: Kamal Mostafa --- drivers/staging/ozwpan/ozcdev.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/staging/ozwpan/ozcdev.c b/drivers/staging/ozwpan/ozcdev.c index 64913aeb0bac..f9c493b84d1b 100644 --- a/drivers/staging/ozwpan/ozcdev.c +++ b/drivers/staging/ozwpan/ozcdev.c @@ -152,6 +152,9 @@ ssize_t oz_cdev_write(struct file *filp, const char __user *buf, size_t count, struct oz_app_hdr *app_hdr; struct oz_serial_ctx *ctx; + if (count > sizeof(ei->data) - sizeof(*elt) - sizeof(*app_hdr)) + return -EINVAL; + spin_lock_bh(&g_cdev.lock); pd = g_cdev.active_pd; if (pd) From 4813f022b03a14429c5c03fad92bffd0db67c5af Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 29 Oct 2013 22:11:06 +0300 Subject: [PATCH 85/94] aacraid: missing capable() check in compat ioctl commit f856567b930dfcdbc3323261bf77240ccdde01f5 upstream. In commit d496f94d22d1 ('[SCSI] aacraid: fix security weakness') we added a check on CAP_SYS_RAWIO to the ioctl. The compat ioctls need the check as well. Signed-off-by: Dan Carpenter Signed-off-by: Linus Torvalds Signed-off-by: Kamal Mostafa --- drivers/scsi/aacraid/linit.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c index 408a42ef787a..f0d432c139d0 100644 --- a/drivers/scsi/aacraid/linit.c +++ b/drivers/scsi/aacraid/linit.c @@ -771,6 +771,8 @@ static long aac_compat_do_ioctl(struct aac_dev *dev, unsigned cmd, unsigned long static int aac_compat_ioctl(struct scsi_device *sdev, int cmd, void __user *arg) { struct aac_dev *dev = (struct aac_dev *)sdev->host->hostdata; + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; return aac_compat_do_ioctl(dev, cmd, (unsigned long)arg); } From d14eefa448b86e6bef4de8641538e6e8d1c5d86c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 29 Oct 2013 23:00:15 +0300 Subject: [PATCH 86/94] staging: wlags49_h2: buffer overflow setting station name commit b5e2f339865fb443107e5b10603e53bbc92dc054 upstream. We need to check the length parameter before doing the memcpy(). I've actually changed it to strlcpy() as well so that it's NUL terminated. You need CAP_NET_ADMIN to trigger these so it's not the end of the world. Reported-by: Nico Golde Reported-by: Fabian Yamaguchi Signed-off-by: Dan Carpenter Signed-off-by: Linus Torvalds [ kamal: backport to 3.8 (context) ] Signed-off-by: Kamal Mostafa --- drivers/staging/wlags49_h2/wl_priv.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/staging/wlags49_h2/wl_priv.c b/drivers/staging/wlags49_h2/wl_priv.c index 87e1e4123126..0978d9ba36b2 100644 --- a/drivers/staging/wlags49_h2/wl_priv.c +++ b/drivers/staging/wlags49_h2/wl_priv.c @@ -570,6 +570,7 @@ int wvlan_uil_put_info( struct uilreq *urq, struct wl_private *lp ) ltv_t *pLtv; bool_t ltvAllocated = FALSE; ENCSTRCT sEncryption; + size_t len; #ifdef USE_WDS hcf_16 hcfPort = HCF_PORT_0; @@ -686,7 +687,8 @@ int wvlan_uil_put_info( struct uilreq *urq, struct wl_private *lp ) break; case CFG_CNF_OWN_NAME: memset( lp->StationName, 0, sizeof( lp->StationName )); - memcpy( (void *)lp->StationName, (void *)&pLtv->u.u8[2], (size_t)pLtv->u.u16[0]); + len = min_t(size_t, pLtv->u.u16[0], sizeof(lp->StationName)); + strlcpy(lp->StationName, &pLtv->u.u8[2], len); pLtv->u.u16[0] = CNV_INT_TO_LITTLE( pLtv->u.u16[0] ); break; case CFG_CNF_LOAD_BALANCING: @@ -1800,6 +1802,7 @@ int wvlan_set_station_nickname(struct net_device *dev, { struct wl_private *lp = wl_priv(dev); unsigned long flags; + size_t len; int ret = 0; /*------------------------------------------------------------------------*/ @@ -1810,8 +1813,8 @@ int wvlan_set_station_nickname(struct net_device *dev, wl_lock(lp, &flags); memset( lp->StationName, 0, sizeof( lp->StationName )); - - memcpy( lp->StationName, extra, wrqu->data.length); + len = min_t(size_t, wrqu->data.length, sizeof(lp->StationName)); + strlcpy(lp->StationName, extra, len); /* Commit the adapter parameters */ wl_apply( lp ); From 92fa0f8b50435eae76fcf4a72e765c5f1eee58d3 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 29 Oct 2013 23:01:11 +0300 Subject: [PATCH 87/94] Staging: bcm: info leak in ioctl commit 8d1e72250c847fa96498ec029891de4dc638a5ba upstream. The DevInfo.u32Reserved[] array isn't initialized so it leaks kernel information to user space. Reported-by: Nico Golde Reported-by: Fabian Yamaguchi Signed-off-by: Dan Carpenter Signed-off-by: Linus Torvalds Signed-off-by: Kamal Mostafa --- drivers/staging/bcm/Bcmchar.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/staging/bcm/Bcmchar.c b/drivers/staging/bcm/Bcmchar.c index efad33e3ba73..7bb7689345b0 100644 --- a/drivers/staging/bcm/Bcmchar.c +++ b/drivers/staging/bcm/Bcmchar.c @@ -1960,6 +1960,7 @@ cntrlEnd: BCM_DEBUG_PRINT(Adapter, DBG_TYPE_OTHERS, OSAL_DBG, DBG_LVL_ALL, "Called IOCTL_BCM_GET_DEVICE_DRIVER_INFO\n"); + memset(&DevInfo, 0, sizeof(DevInfo)); DevInfo.MaxRDMBufferSize = BUFFER_4K; DevInfo.u32DSDStartOffset = EEPROM_CALPARAM_START; DevInfo.u32RxAlignmentCorrection = 0; From 03bc153ef464ab1458a7ea5c0b7c3c3272fb4bf8 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 29 Oct 2013 23:01:43 +0300 Subject: [PATCH 88/94] Staging: sb105x: info leak in mp_get_count() commit a8b33654b1e3b0c74d4a1fed041c9aae50b3c427 upstream. The icount.reserved[] array isn't initialized so it leaks stack information to userspace. Reported-by: Nico Golde Reported-by: Fabian Yamaguchi Signed-off-by: Dan Carpenter Signed-off-by: Linus Torvalds Signed-off-by: Kamal Mostafa --- drivers/staging/sb105x/sb_pci_mp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/sb105x/sb_pci_mp.c b/drivers/staging/sb105x/sb_pci_mp.c index 9464f3874346..0f5e26232a46 100644 --- a/drivers/staging/sb105x/sb_pci_mp.c +++ b/drivers/staging/sb105x/sb_pci_mp.c @@ -1063,7 +1063,7 @@ static int mp_wait_modem_status(struct sb_uart_state *state, unsigned long arg) static int mp_get_count(struct sb_uart_state *state, struct serial_icounter_struct *icnt) { - struct serial_icounter_struct icount; + struct serial_icounter_struct icount = {}; struct sb_uart_icount cnow; struct sb_uart_port *port = state->port; From a09cbc1887c44152f56f09d6363ccf1ab4686ea6 Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 31 Oct 2013 15:01:37 +0000 Subject: [PATCH 89/94] ALSA: fix oops in snd_pcm_info() caused by ASoC DPCM commit a4461f41b94cb52e0141af717dcf4ef6558c8e2e upstream. Unable to handle kernel NULL pointer dereference at virtual address 00000008 pgd = d5300000 [00000008] *pgd=0d265831, *pte=00000000, *ppte=00000000 Internal error: Oops: 17 [#1] PREEMPT ARM CPU: 0 PID: 2295 Comm: vlc Not tainted 3.11.0+ #755 task: dee74800 ti: e213c000 task.ti: e213c000 PC is at snd_pcm_info+0xc8/0xd8 LR is at 0x30232065 pc : [] lr : [<30232065>] psr: a0070013 sp : e213dea8 ip : d81cb0d0 fp : c05f7678 r10: c05f7770 r9 : fffffdfd r8 : 00000000 r7 : d8a968a8 r6 : d8a96800 r5 : d8a96200 r4 : d81cb000 r3 : 00000000 r2 : d81cb000 r1 : 00000001 r0 : d8a96200 Flags: NzCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment user Control: 10c5387d Table: 15300019 DAC: 00000015 Process vlc (pid: 2295, stack limit = 0xe213c248) [] (snd_pcm_info) from [] (snd_pcm_info_user+0x34/0x9c) [] (snd_pcm_info_user) from [] (snd_pcm_control_ioctl+0x274/0x280) [] (snd_pcm_control_ioctl) from [] (snd_ctl_ioctl+0xc0/0x55c) [] (snd_ctl_ioctl) from [] (do_vfs_ioctl+0x80/0x31c) [] (do_vfs_ioctl) from [] (SyS_ioctl+0x3c/0x60) [] (SyS_ioctl) from [] (ret_fast_syscall+0x0/0x48) Code: e1a00005 e59530dc e3a01001 e1a02004 (e5933008) ---[ end trace cb3d9bdb8dfefb3c ]--- This is provoked when the ASoC front end is open along with its backend, (which causes the backend to have a runtime assigned to it) and then the SNDRV_CTL_IOCTL_PCM_INFO is requested for the (visible) backend device. Resolve this by ensuring that ASoC internal backend devices are not visible to userspace, just as the commentry for snd_pcm_new_internal() says it should be. Signed-off-by: Russell King Acked-by: Mark Brown Signed-off-by: Takashi Iwai Signed-off-by: Kamal Mostafa --- sound/core/pcm.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/core/pcm.c b/sound/core/pcm.c index 61798f85d030..2df12bdc9eb3 100644 --- a/sound/core/pcm.c +++ b/sound/core/pcm.c @@ -49,6 +49,8 @@ static struct snd_pcm *snd_pcm_get(struct snd_card *card, int device) struct snd_pcm *pcm; list_for_each_entry(pcm, &snd_pcm_devices, list) { + if (pcm->internal) + continue; if (pcm->card == card && pcm->device == device) return pcm; } @@ -60,6 +62,8 @@ static int snd_pcm_next(struct snd_card *card, int device) struct snd_pcm *pcm; list_for_each_entry(pcm, &snd_pcm_devices, list) { + if (pcm->internal) + continue; if (pcm->card == card && pcm->device > device) return pcm->device; else if (pcm->card->number > card->number) From 1081c94b7f6919e5b75ae07848b8cca3fff137b5 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 31 Oct 2013 16:34:17 -0700 Subject: [PATCH 90/94] lib/scatterlist.c: don't flush_kernel_dcache_page on slab page commit 3d77b50c5874b7e923be946ba793644f82336b75 upstream. Commit b1adaf65ba03 ("[SCSI] block: add sg buffer copy helper functions") introduces two sg buffer copy helpers, and calls flush_kernel_dcache_page() on pages in SG list after these pages are written to. Unfortunately, the commit may introduce a potential bug: - Before sending some SCSI commands, kmalloc() buffer may be passed to block layper, so flush_kernel_dcache_page() can see a slab page finally - According to cachetlb.txt, flush_kernel_dcache_page() is only called on "a user page", which surely can't be a slab page. - ARCH's implementation of flush_kernel_dcache_page() may use page mapping information to do optimization so page_mapping() will see the slab page, then VM_BUG_ON() is triggered. Aaro Koskinen reported the bug on ARM/kirkwood when DEBUG_VM is enabled, and this patch fixes the bug by adding test of '!PageSlab(miter->page)' before calling flush_kernel_dcache_page(). Signed-off-by: Ming Lei Reported-by: Aaro Koskinen Tested-by: Simon Baatz Cc: Russell King - ARM Linux Cc: Will Deacon Cc: Aaro Koskinen Acked-by: Catalin Marinas Cc: FUJITA Tomonori Cc: Tejun Heo Cc: "James E.J. Bottomley" Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Kamal Mostafa --- lib/scatterlist.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/scatterlist.c b/lib/scatterlist.c index 7874b01e816e..bd86887b64cb 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -495,7 +495,8 @@ void sg_miter_stop(struct sg_mapping_iter *miter) if (miter->addr) { miter->__offset += miter->consumed; - if (miter->__flags & SG_MITER_TO_SG) + if ((miter->__flags & SG_MITER_TO_SG) && + !PageSlab(miter->page)) flush_kernel_dcache_page(miter->page); if (miter->__flags & SG_MITER_ATOMIC) { From a167c775a75886a8b40531bbe796d67af0af4c0e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 2 Nov 2013 09:11:33 +1030 Subject: [PATCH 91/94] scripts/kallsyms: filter symbols not in kernel address space commit f6537f2f0eba4eba3354e48dbe3047db6d8b6254 upstream. This patch uses CONFIG_PAGE_OFFSET to filter symbols which are not in kernel address space because these symbols are generally for generating code purpose and can't be run at kernel mode, so we needn't keep them in /proc/kallsyms. For example, on ARM there are some symbols which may be linked in relocatable code section, then perf can't parse symbols any more from /proc/kallsyms, this patch fixes the problem (introduced b9b32bf70f2fb710b07c94e13afbc729afe221da) Cc: Russell King Cc: linux-arm-kernel@lists.infradead.org Cc: Michal Marek Signed-off-by: Ming Lei Signed-off-by: Rusty Russell Signed-off-by: Kamal Mostafa --- scripts/kallsyms.c | 12 +++++++++++- scripts/link-vmlinux.sh | 2 ++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c index 487ac6f37ca2..9a11f9f799f4 100644 --- a/scripts/kallsyms.c +++ b/scripts/kallsyms.c @@ -55,6 +55,7 @@ static struct sym_entry *table; static unsigned int table_size, table_cnt; static int all_symbols = 0; static char symbol_prefix_char = '\0'; +static unsigned long long kernel_start_addr = 0; int token_profit[0x10000]; @@ -65,7 +66,10 @@ unsigned char best_table_len[256]; static void usage(void) { - fprintf(stderr, "Usage: kallsyms [--all-symbols] [--symbol-prefix=] < in.map > out.S\n"); + fprintf(stderr, "Usage: kallsyms [--all-symbols] " + "[--symbol-prefix=] " + "[--page-offset=] " + "< in.map > out.S\n"); exit(1); } @@ -194,6 +198,9 @@ static int symbol_valid(struct sym_entry *s) int i; int offset = 1; + if (s->addr < kernel_start_addr) + return 0; + /* skip prefix char */ if (symbol_prefix_char && *(s->sym + 1) == symbol_prefix_char) offset++; @@ -646,6 +653,9 @@ int main(int argc, char **argv) if ((*p == '"' && *(p+2) == '"') || (*p == '\'' && *(p+2) == '\'')) p++; symbol_prefix_char = *p; + } else if (strncmp(argv[i], "--page-offset=", 14) == 0) { + const char *p = &argv[i][14]; + kernel_start_addr = strtoull(p, NULL, 16); } else usage(); } diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index b3d907eb93a9..5716ba1b228f 100644 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -83,6 +83,8 @@ kallsyms() kallsymopt="${kallsymopt} --all-symbols" fi + kallsymopt="${kallsymopt} --page-offset=$CONFIG_PAGE_OFFSET" + local aflags="${KBUILD_AFLAGS} ${KBUILD_AFLAGS_KERNEL} \ ${NOSTDINC_FLAGS} ${LINUXINCLUDE} ${KBUILD_CPPFLAGS}" From 325e819ac15d25f6e49a7f8feb2f0466cf2ccf1e Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Thu, 26 Sep 2013 12:09:52 +0100 Subject: [PATCH 92/94] xen-netback: Handle backend state transitions in a more robust way commit ea732dff5cfa10789007bf4a5b935388a0bb2a8f upstream. When the frontend state changes netback now specifies its desired state to a new function, set_backend_state(), which transitions through any necessary intermediate states. This fixes an issue observed with some old Windows frontend drivers where they failed to transition through the Closing state and netback would not behave correctly. Signed-off-by: Paul Durrant Cc: Ian Campbell Cc: Wei Liu Cc: David Vrabel Acked-by: Ian Campbell Signed-off-by: David S. Miller [ kamal: backport to 3.8 (context); prereq for dc62ccaccfb1 "xen-netback: transition to CLOSED when removing a VIF" ] Signed-off-by: Kamal Mostafa --- drivers/net/xen-netback/xenbus.c | 149 ++++++++++++++++++++++++------- 1 file changed, 118 insertions(+), 31 deletions(-) diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c index abe24ff000f0..805a0a328391 100644 --- a/drivers/net/xen-netback/xenbus.c +++ b/drivers/net/xen-netback/xenbus.c @@ -24,6 +24,12 @@ struct backend_info { struct xenbus_device *dev; struct xenvif *vif; + + /* This is the state that will be reflected in xenstore when any + * active hotplug script completes. + */ + enum xenbus_state state; + enum xenbus_state frontend_state; struct xenbus_watch hotplug_status_watch; u8 have_hotplug_status_watch:1; @@ -126,6 +132,8 @@ static int netback_probe(struct xenbus_device *dev, if (err) goto fail; + be->state = XenbusStateInitWait; + /* This kicks hotplug scripts, so do it immediately. */ backend_create_xenvif(be); @@ -198,24 +206,113 @@ static void backend_create_xenvif(struct backend_info *be) kobject_uevent(&dev->dev.kobj, KOBJ_ONLINE); } - -static void disconnect_backend(struct xenbus_device *dev) +static void backend_disconnect(struct backend_info *be) { - struct backend_info *be = dev_get_drvdata(&dev->dev); - if (be->vif) xenvif_disconnect(be->vif); } -static void destroy_backend(struct xenbus_device *dev) +static void backend_connect(struct backend_info *be) { - struct backend_info *be = dev_get_drvdata(&dev->dev); + if (be->vif) + connect(be); +} - if (be->vif) { - kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE); - xenbus_rm(XBT_NIL, dev->nodename, "hotplug-status"); - xenvif_free(be->vif); - be->vif = NULL; +static inline void backend_switch_state(struct backend_info *be, + enum xenbus_state state) +{ + struct xenbus_device *dev = be->dev; + + pr_debug("%s -> %s\n", dev->nodename, xenbus_strstate(state)); + be->state = state; + + /* If we are waiting for a hotplug script then defer the + * actual xenbus state change. + */ + if (!be->have_hotplug_status_watch) + xenbus_switch_state(dev, state); +} + +/* Handle backend state transitions: + * + * The backend state starts in InitWait and the following transitions are + * allowed. + * + * InitWait -> Connected + * + * ^ \ | + * | \ | + * | \ | + * | \ | + * | \ | + * | \ | + * | V V + * + * Closed <-> Closing + * + * The state argument specifies the eventual state of the backend and the + * function transitions to that state via the shortest path. + */ +static void set_backend_state(struct backend_info *be, + enum xenbus_state state) +{ + while (be->state != state) { + switch (be->state) { + case XenbusStateClosed: + switch (state) { + case XenbusStateInitWait: + case XenbusStateConnected: + pr_info("%s: prepare for reconnect\n", + be->dev->nodename); + backend_switch_state(be, XenbusStateInitWait); + break; + case XenbusStateClosing: + backend_switch_state(be, XenbusStateClosing); + break; + default: + BUG(); + } + break; + case XenbusStateInitWait: + switch (state) { + case XenbusStateConnected: + backend_connect(be); + backend_switch_state(be, XenbusStateConnected); + break; + case XenbusStateClosing: + case XenbusStateClosed: + backend_switch_state(be, XenbusStateClosing); + break; + default: + BUG(); + } + break; + case XenbusStateConnected: + switch (state) { + case XenbusStateInitWait: + case XenbusStateClosing: + case XenbusStateClosed: + backend_disconnect(be); + backend_switch_state(be, XenbusStateClosing); + break; + default: + BUG(); + } + break; + case XenbusStateClosing: + switch (state) { + case XenbusStateInitWait: + case XenbusStateConnected: + case XenbusStateClosed: + backend_switch_state(be, XenbusStateClosed); + break; + default: + BUG(); + } + break; + default: + BUG(); + } } } @@ -227,41 +324,33 @@ static void frontend_changed(struct xenbus_device *dev, { struct backend_info *be = dev_get_drvdata(&dev->dev); - pr_debug("frontend state %s", xenbus_strstate(frontend_state)); + pr_debug("%s -> %s\n", dev->otherend, xenbus_strstate(frontend_state)); be->frontend_state = frontend_state; switch (frontend_state) { case XenbusStateInitialising: - if (dev->state == XenbusStateClosed) { - printk(KERN_INFO "%s: %s: prepare for reconnect\n", - __func__, dev->nodename); - xenbus_switch_state(dev, XenbusStateInitWait); - } + set_backend_state(be, XenbusStateInitWait); break; case XenbusStateInitialised: break; case XenbusStateConnected: - if (dev->state == XenbusStateConnected) - break; - if (be->vif) - connect(be); + set_backend_state(be, XenbusStateConnected); break; case XenbusStateClosing: - disconnect_backend(dev); - xenbus_switch_state(dev, XenbusStateClosing); + set_backend_state(be, XenbusStateClosing); break; case XenbusStateClosed: - xenbus_switch_state(dev, XenbusStateClosed); + set_backend_state(be, XenbusStateClosed); if (xenbus_dev_is_online(dev)) break; - destroy_backend(dev); /* fall through if not online */ case XenbusStateUnknown: + set_backend_state(be, XenbusStateClosed); device_unregister(&dev->dev); break; @@ -354,7 +443,9 @@ static void hotplug_status_changed(struct xenbus_watch *watch, if (IS_ERR(str)) return; if (len == sizeof("connected")-1 && !memcmp(str, "connected", len)) { - xenbus_switch_state(be->dev, XenbusStateConnected); + /* Complete any pending state change */ + xenbus_switch_state(be->dev, be->state); + /* Not interested in this watch anymore. */ unregister_hotplug_status_watch(be); } @@ -384,12 +475,8 @@ static void connect(struct backend_info *be) err = xenbus_watch_pathfmt(dev, &be->hotplug_status_watch, hotplug_status_changed, "%s/%s", dev->nodename, "hotplug-status"); - if (err) { - /* Switch now, since we can't do a watch. */ - xenbus_switch_state(dev, XenbusStateConnected); - } else { + if (!err) be->have_hotplug_status_watch = 1; - } netif_wake_queue(be->vif->dev); } From f9a8d8c3d224859031296736ee40b3dbfd3eff0c Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Mon, 7 Oct 2013 13:55:19 +0100 Subject: [PATCH 93/94] xen-netback: transition to CLOSED when removing a VIF commit dc62ccaccfb139d9b04bbc5a2688a4402adbfab3 upstream. If a guest is destroyed without transitioning its frontend to CLOSED, the domain becomes a zombie as netback was not grant unmapping the shared rings. When removing a VIF, transition the backend to CLOSED so the VIF is disconnected if necessary (which will unmap the shared rings etc). This fixes a regression introduced by 279f438e36c0a70b23b86d2090aeec50155034a9 (xen-netback: Don't destroy the netdev until the vif is shut down). Signed-off-by: David Vrabel Cc: Ian Campbell Cc: Wei Liu Cc: Paul Durrant Acked-by: Wei Liu Reviewed-by: Paul Durrant Signed-off-by: David S. Miller Signed-off-by: Kamal Mostafa --- drivers/net/xen-netback/xenbus.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c index 805a0a328391..8a9e8750703f 100644 --- a/drivers/net/xen-netback/xenbus.c +++ b/drivers/net/xen-netback/xenbus.c @@ -39,11 +39,15 @@ static int connect_rings(struct backend_info *); static void connect(struct backend_info *); static void backend_create_xenvif(struct backend_info *be); static void unregister_hotplug_status_watch(struct backend_info *be); +static void set_backend_state(struct backend_info *be, + enum xenbus_state state); static int netback_remove(struct xenbus_device *dev) { struct backend_info *be = dev_get_drvdata(&dev->dev); + set_backend_state(be, XenbusStateClosed); + unregister_hotplug_status_watch(be); if (be->vif) { kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE); From 4d6939558a6dafc578d2412b01ecf00132bb9311 Mon Sep 17 00:00:00 2001 From: Kamal Mostafa Date: Wed, 13 Nov 2013 12:45:34 -0800 Subject: [PATCH 94/94] Linux 3.8.13.13 Signed-off-by: Kamal Mostafa --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index afde10172507..2fdc053b5321 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 3 PATCHLEVEL = 8 SUBLEVEL = 13 -EXTRAVERSION = .12 +EXTRAVERSION = .13 NAME = Remoralised Urchins Update # *DOCUMENTATION*